In [27]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

In [36]:
# Load the dataset from a CSV file in the working directory.
df = pd.read_csv("diabetes.csv")

# Build the feature matrix and target vector consumed by the train/test split.
# These names were previously not defined anywhere in the notebook, so a fresh
# "Restart & Run All" failed with NameError at the split cell.
# NOTE(review): the recorded shapes (768 rows, 8 features) and the column list
# imply that "diabetes" is the target and the remaining columns are the
# features -- confirm this matches the original analysis.
X = df.drop(columns=["diabetes"])
y = df["diabetes"]

In [30]:
# Hold out one fifth of the rows for evaluation; the fixed seed makes the
# partition reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Sanity check: report the dimensions of each partition.
print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

Train shape: (614, 8)
Test shape : (154, 8)


In [37]:
# Show the column labels of the loaded frame.
print(df.columns)

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')


In [28]:
# Baseline: ordinary least squares with no regularization, fitted on the
# training partition (fit() returns the estimator itself, so it can be chained).
lin_reg = LinearRegression().fit(X_train, y_train)

# Inspect the learned parameters: one coefficient per feature plus the intercept.
print("Linear Regression coefficients:")
print(lin_reg.coef_)
print("Intercept:", lin_reg.intercept_)

# Predict on both partitions so the train and test errors can be compared.
y_train_pred, y_test_pred = lin_reg.predict(X_train), lin_reg.predict(X_test)

# Mean squared error on each partition; kept in named variables because a
# later cell prints them again as the baseline.
mse_train_lin, mse_test_lin = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)

print("Linear Regression MSE (train):", mse_train_lin)
print("Linear Regression MSE (test) :", mse_test_lin)




Linear Regression coefficients:
[ 0.01873067  0.00621873 -0.00173983  0.00077564 -0.00019264  0.0134609
  0.13171851  0.00250584]
Intercept: -0.9319290291258022
Linear Regression MSE (train): 0.1564335817986174
Linear Regression MSE (test) : 0.1678075569793761


In [40]:


# Candidate regularization strengths: 13 log-spaced values from 1e-3 to 1e3.
alpha_values = np.logspace(-3, 3, 13)
param_grid = {"alpha": alpha_values}

# Search settings shared by both regularized models: 5 cross-validation folds,
# scored by negative MSE (scikit-learn maximizes scores, hence the sign).
search_settings = dict(
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)

# Ridge (L2 penalty): cross-validated search over alpha on the training set.
ridge = Ridge(random_state=42)
ridge_cv = GridSearchCV(estimator=ridge, **search_settings).fit(X_train, y_train)
print("Best Ridge alpha:", ridge_cv.best_params_["alpha"])

# Lasso (L1 penalty): same search, with a raised iteration cap for the solver.
lasso = Lasso(random_state=42, max_iter=10000)
lasso_cv = GridSearchCV(estimator=lasso, **search_settings).fit(X_train, y_train)
print("Best Lasso alpha:", lasso_cv.best_params_["alpha"])

# Keep the winning estimator from each search; a later cell reads their coefficients.
best_ridge, best_lasso = ridge_cv.best_estimator_, lasso_cv.best_estimator_

# Predictions of each tuned model on both partitions.
# (mean_squared_error is already imported in the notebook's import cell.)
y_train_pred_ridge, y_test_pred_ridge = best_ridge.predict(X_train), best_ridge.predict(X_test)
y_train_pred_lasso, y_test_pred_lasso = best_lasso.predict(X_train), best_lasso.predict(X_test)

# Report train/test MSE for the tuned Ridge model, then for the tuned Lasso model.
print("Ridge MSE (train):", mean_squared_error(y_train, y_train_pred_ridge))
print("Ridge MSE (test) :", mean_squared_error(y_test, y_test_pred_ridge))

print("Lasso MSE (train):", mean_squared_error(y_train, y_train_pred_lasso))
print("Lasso MSE (test) :", mean_squared_error(y_test, y_test_pred_lasso))


Best Ridge alpha: 10.0
Best Lasso alpha: 0.001
Ridge MSE (train): 0.1574679653069518
Ridge MSE (test) : 0.17159157208036954
Lasso MSE (train): 0.15745426415499222
Lasso MSE (test) : 0.17142704792493244


In [39]:
# Pull the fitted weight vectors out of the tuned models.
ridge_coef, lasso_coef = best_ridge.coef_, best_lasso.coef_

print("Ridge coefficients:", ridge_coef)
print("Lasso coefficients:", lasso_coef)

# Sparsity check: count how many weights each model left different from zero.
print("Non-zero Ridge coefficients:", np.count_nonzero(ridge_coef))
print("Non-zero Lasso coefficients:", np.count_nonzero(lasso_coef))

# Repeat the unregularized baseline errors for side-by-side comparison.
print("Baseline Linear MSE train/test:", mse_train_lin, mse_test_lin)


Ridge coefficients: [ 0.01038478  0.00565049 -0.00228489  0.00057635 -0.00027591  0.01506701
  0.0965685   0.00647988]
Lasso coefficients: [ 0.01031842  0.00564403 -0.00227653  0.00055532 -0.00027643  0.01504247
  0.10195102  0.00647779]
Non-zero Ridge coefficients: 8
Non-zero Lasso coefficients: 8
Baseline Linear MSE train/test: 0.1564335817986174 0.1678075569793761
