#### 1. LASSO REGRESSION

In [1]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Sample synthetic data: height vs sport prediction (overfitting example)
np.random.seed(42)
X = np.random.rand(100, 1) * 2  # Heights between 0 and 2 meters
y = 3 + 2 * X + 4 * X**2 + np.random.randn(100, 1)  # Nonlinear target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 1. Linear Regression (prone to overfitting with high-degree polynomials)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)

# 2. Lasso Regression (L1 penalty - shrinks coefficients, can zero them out)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# 3. Ridge Regression (L2 penalty - shrinks coefficients but keeps them)
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Display results
print(f"Linear Regression MSE: {mse_lr:.4f}")
print(f"Lasso Regression MSE: {mse_lasso:.4f}")
print(f"Ridge Regression MSE: {mse_ridge:.4f}")

Linear Regression MSE: 3.7603
Lasso Regression MSE: 3.6972
Ridge Regression MSE: 3.7478


In [10]:
### CLASS WORK
# import libraries and load dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
diabetes =  load_diabetes()
df =pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df['target'] = diabetes.target

In [15]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [17]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


#### Train Test Split

In [23]:
X = diabetes.data     # All the columns we think have a linear corr or linear r/shp to y
y = diabetes.target   # Whether they have diabetes or not

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

# X_train, X_test, y_train, y_test are the split datasets â€” training inputs/outputs and testing inputs/outputs


Version 1

In [30]:
scaler = StandardScaler()
X_train_Scaled =  scaler.fit_transform(X_train)
X_test_Scaled = scaler.fit_transform(X_test)

Version 2

In [31]:
lr = LinearRegression()
lr.fit(X_train_Scaled, y_train)
y_pred = lr.predict(X_test_Scaled)

Evaluation V2

In [33]:
print("RSME:",  np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2_Score:", r2_score(y_test, y_pred))
print("Coefficient:", lr.coef_)

RSME: 54.52386206698789
R2_Score: 0.4388889416894728
Coefficient: [  1.75375799 -11.51180908  25.60712144  16.82887167 -44.44885564
  24.64095356   7.67697768  13.1387839   35.16119521   2.35136365]


### Penalize (introduce Regularization)

In [51]:
## Define Alpha (1.0), initialize either ridge or lasso regression
## Ridge (Square the absolute value of our coefficients)

ridge = Ridge (alpha = 1.0)  ### penalty strength  0=ordinary linear regression, 1 =medium, 10, 100 are strong penaly strenghts(very small coefficients - ending up with underfit models)
ridge.fit(X_train_Scaled, y_train)   #training it on ridge regression (regularization)
pred_y_ridge =  ridge.predict(X_test_Scaled) # predicting on ridge regression
print("Ridge RSME: ", np.sqrt(mean_squared_error(y_test, pred_y_ridge)))


Ridge RSME:  54.371588006358564


In [54]:
## Lasso Regression  = Overfed
### The avsolute value of our coefficients
lasso = Lasso(alpha=1)
lasso.fit(X_train_Scaled, y_train)
pred_y_lasso = lasso.predict(X_test_Scaled)
print("Lasso RSME: ", np.sqrt(mean_squared_error(y_test, pred_y_lasso)))
print("Coefficient:",lasso.coef_)


Lasso RSME:  53.604840647137905
Coefficient: [ 0.68703212 -9.29751904 26.21922482 15.65731357 -8.2281718  -0.
 -9.02408714  3.42086059 22.63646533  2.09864712]


In [None]:
## Define Alpha (1.0), initialize either ridge or lasso regression
## Ridge (Square the absolute value of our coefficients)

ridge = Ridge (alpha = 1.0)  ### penalty strength  0=ordinary linear regression, 1 =medium, 10, 100 are strong penaly strenghts(very small coefficients - ending up with underfit models)
ridge.fit(X_train_Scaled, y_train)   #training it on ridge regression (regularization)
pred_y_ridge =  ridge.predict(X_test_Scaled) # predicting on ridge regression
print("Ridge RSME: ", np.sqrt(mean_squared_error(y_test, pred_y_ridge)))


Ridge RSME:  54.371588006358564


#### Lasso = L2
#### Ridge  = l2