In [6]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Loading the Diabetes dataset
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Train-validation-test split: 70% train, with the remaining 30% halved into
# validation and test. Only the training portion is used by the
# cross-validation in this cell.
X_train, X_temp, y_train, y_temp = train_test_split(diabetes_X, diabetes_y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Data Preparation and Cross-Validation for Polynomial Degrees 1 to 8
degrees = list(range(1, 9))
cv_results = []  # one row per degree: [degree, mean R2, std R2, mean MAE, std MAE]

for degree in degrees:
    poly_features = PolynomialFeatures(degree=degree, include_bias=True)
    X_train_poly = poly_features.fit_transform(X_train)

    # Single 5-fold pass that evaluates both metrics on the same fitted
    # models, instead of refitting every fold once per metric.
    scores = cross_validate(
        LinearRegression(),
        X_train_poly,
        y_train,
        cv=5,
        scoring=('r2', 'neg_mean_absolute_error'),
    )
    r_squared_scores = scores['test_r2']
    # The scorer reports negated MAE (higher is better); flip the sign back.
    mae_scores = -scores['test_neg_mean_absolute_error']

    # Calculating mean and standard deviation of scores across the folds
    mean_r_squared = np.mean(r_squared_scores)
    std_r_squared = np.std(r_squared_scores)
    mean_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)

    cv_results.append([degree, mean_r_squared, std_r_squared, mean_mae, std_mae])

# Creating a DataFrame for the results (one row per polynomial degree)
results_df = pd.DataFrame(
    cv_results,
    columns=['Degree', 'Mean R-Squared', 'Std R-Squared', 'Mean MAE', 'Std MAE'],
)

# Identifying the best model: the row with the highest mean R-squared.
# MAE is reported alongside it but does not take part in the selection.
best_model = results_df.loc[results_df['Mean R-Squared'].idxmax()]

# Print the table summarizing the cross-validation results
print(results_df)

# Print the best model's degree and its mean cross-validation scores
print(f"\nBest Model: Degree {int(best_model['Degree'])}")
print(f"Mean R-Squared: {best_model['Mean R-Squared']:.2f}")
print(f"Mean MAE: {best_model['Mean MAE']:.2f}")


   Degree  Mean R-Squared  Std R-Squared    Mean MAE     Std MAE
0       1        0.452281       0.127329   45.580788    1.609128
1       2       -0.089528       0.445264   62.386924   13.185748
2       3     -508.700734     355.193526  942.229340  141.054469
3       4      -45.379335      24.676917  295.558549   48.749112
4       5      -44.383008      23.772236  292.955639   47.967113
5       6      -44.385918      23.785306  292.951956   47.965431
6       7      -44.385782      23.785132  292.949499   47.965919
7       8      -44.385134      23.780852  292.953707   47.967797

Best Model: Degree 1
Mean R-Squared: 0.45
Mean MAE: 45.58


The degree 1 polynomial model was selected because it has the highest mean cross-validated R-squared (0.45); it also has the lowest mean MAE (45.58) among the tested degrees, so both metrics agree. This indicates a better overall fit and predictive accuracy compared to the higher-degree models. Every model of degree 2 or higher has a negative mean R-squared, meaning it predicts the held-out folds worse than simply predicting the mean of the target. The simple linear relationship captured by the degree 1 model therefore generalizes best of the candidates considered, making it the optimal choice for this particular regression task, even though it explains only about 45% of the variance. The higher-degree models introduce unnecessary complexity, and their sharply worse cross-validation scores are consistent with overfitting. Degree 0 was omitted because it is a constant model that ignores the input.