In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

# Autoreload of the files
%load_ext autoreload
%autoreload 2

In [None]:
# Seed value kept in one place for the notebook's randomized steps.
random_state = 4

# Load the diabetes regression data as a (features, target) pair of arrays.
X, y = load_diabetes(return_X_y=True)

# 1 - Linear regression
***

**Question :** Quick visualization to understand your data set.

**Question :** Split your data set into train and test sets, train a simple linear regression model and plot the performance of the model on the test set.

In [None]:
# Hold out 20% of the samples as a test set. The split is seeded with the
# notebook-wide `random_state` instead of a second hard-coded seed, so there is
# a single place to change it. X_train / y_train are reused by the
# cross-validation cells further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Baseline: ordinary least squares fitted on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

# Report test-set metrics for the baseline.
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred):.2f}")

# Predicted vs. actual targets on the test set; points on the dashed diagonal
# are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.7)
lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
ax.plot(lims, lims, linestyle="--", color="grey")
ax.set_xlabel("Actual target")
ax.set_ylabel("Predicted target")
ax.set_title("Linear regression: predicted vs actual (test set)")
ax.grid(True, alpha=0.5)
plt.show()

# 2 - Ridge and Lasso regression
***

**Question:** Before diving into the modeling, you have to prepare your cross-validation process for the parametrization of the Ridge and Lasso regressions. What do you do with your train set?

In [None]:
# Set up the cross validation
from re import A  # NOTE(review): looks like an accidental auto-import; unused in the visible cells — confirm and delete
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

# Grid-search the Ridge penalty (alpha) with K-fold cross-validation.
# Only the training split is used here; each fold in turn acts as the
# validation set while the model is fitted on the remaining folds.
alphas = [1, 6e-1, 2e-1, 1e-1, 8e-2, 4e-2, 1e-2]
n_splits = 4

# The splitter does not depend on alpha, so it is built once. Without
# shuffling, every alpha is scored on exactly the same folds.
kf = KFold(n_splits=n_splits)

# avg_rmse[k] is the mean validation RMSE over the folds for alphas[k].
avg_rmse = np.zeros(len(alphas))
for idx, alpha in enumerate(alphas):
    current_rmse = np.zeros(n_splits)
    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        X_train_, y_train_ = X_train[train_index], y_train[train_index]
        X_val_, y_val_ = X_train[val_index], y_train[val_index]

        # Fold-local names so the baseline `model` / `y_pred` defined in the
        # linear-regression cell are not silently overwritten.
        fold_model = Ridge(alpha=alpha)
        fold_model.fit(X_train_, y_train_)
        y_val_pred = fold_model.predict(X_val_)

        # RMSE is the square root of the mean squared error. The square root
        # of R2 is not an error metric and is NaN whenever R2 is negative.
        current_rmse[i] = np.sqrt(mean_squared_error(y_val_, y_val_pred))
    avg_rmse[idx] = np.mean(current_rmse)

In [None]:
# One line per candidate alpha: the alpha followed by its entry in avg_rmse.
for alpha_value, cv_score in zip(alphas, avg_rmse):
    print(alpha_value, cv_score)

In [None]:
# Refit a Ridge model on the whole training split.
# NOTE(review): alpha=1e-8 is not one of the values in the `alphas` grid that
# was cross-validated — confirm whether the selected alpha should be used here.
model_final = Ridge(alpha=1e-8).fit(X_train, y_train)
model_final


In [None]:
# Score the refitted Ridge model on the held-out test split, using the same
# three metrics as the linear-regression baseline.
y_pred = model_final.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
test_mape = mean_absolute_percentage_error(y_test, y_pred)

print("MSE:", test_mse)
print("R2:", test_r2)
print(f"MAPE: {test_mape:.2f}")

In [None]:

# Bar chart of the values stored in avg_rmse, one bar per candidate alpha.
# Alphas are converted to strings so they are drawn as evenly spaced categories.
fig, ax = plt.subplots()
alpha_labels = list(map(str, alphas))
ax.bar(alpha_labels, avg_rmse, width=0.3, color='darkblue')
ax.set_xlabel('Alpha')
ax.set_ylabel('RMSE')
ax.set_title('RMSE vs Alpha')
ax.grid(True, alpha=0.5)
plt.show()

### Parametrization

**Question:** Ridge and Lasso each have a regularization parameter lambda (named `alpha` in scikit-learn) that you need to determine for your models. To do so, you will train and validate a model for each possible value of lambda from the following set:

[1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]

In [None]:
# Train and validate your model's parameters

### Final train with the best parameters

**Question :** Train your model with the best parameters found in the previous question. Then, plot the predictions of your model on the test set.

In [None]:
# Train and test your best model (you can compare its performance with the results found for linear regression)