In [41]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Fetch the diabetes regression data as a (features, target) pair of arrays.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Two-stage split with a fixed seed for reproducibility:
#   1) hold out 30% of the samples as a temporary pool,
#   2) cut that pool in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    diabetes_X,
    diabetes_y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Cross-validate polynomial regressions of degree 0 to 8 on the training split.
# For every degree one row is appended to cv_results:
#   [degree, mean R2, std R2, mean MAE, std MAE, mean MAPE, std MAPE]
degrees = list(range(9))
cv_results = []

# All three metrics are scored from the same five fits per degree, instead of
# refitting the model once per metric.
scoring = ('r2', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error')

for degree in degrees:
    # The polynomial expansion lives inside the pipeline so it is applied
    # within each fold. include_bias stays True because a degree-0 expansion
    # would otherwise have no columns at all.
    model = make_pipeline(
        PolynomialFeatures(degree=degree, include_bias=True),
        LinearRegression(),
    )

    # cv=5 -> 5-fold cross-validation on the training data only
    fold_scores = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)

    # sklearn reports error metrics negated ("higher is better"); flip the
    # sign back so MAE and MAPE read as ordinary positive errors.
    r_squared_scores = fold_scores['test_r2']
    mae_scores = -fold_scores['test_neg_mean_absolute_error']
    mape_scores = -fold_scores['test_neg_mean_absolute_percentage_error']

    # Mean and (population) standard deviation across the folds
    cv_results.append([
        degree,
        np.mean(r_squared_scores), np.std(r_squared_scores),
        np.mean(mae_scores), np.std(mae_scores),
        np.mean(mape_scores), np.std(mape_scores),
    ])

# Collect the per-degree summary rows into a table, one row per degree.
result_columns = [
    'Degree',
    'Mean R-Squared', 'Std R-Squared',
    'Mean MAE', 'Std MAE',
    'Mean MAPE', 'Std MAPE',
]
results_df = pd.DataFrame(data=cv_results, columns=result_columns)

# Render the cross-validation summary in the notebook output.
display(results_df)

# Identify the best model: lowest mean MAE, ties broken by lowest mean MAPE and
# then by lowest degree. Only degrees with a positive mean R-squared are
# considered when at least one exists.
#
# Ranking is used instead of requiring one row to hold both the minimum MAE and
# the minimum MAPE: that combined filter is empty whenever the two minima fall
# on different degrees (or no R-squared is positive), and indexing into the
# empty selection raises IndexError.
candidates = results_df[results_df['Mean R-Squared'] > 0]
if candidates.empty:
    # Nothing has a positive mean R-squared; still report the least-bad degree
    # rather than crashing, but say so explicitly.
    print("\nNo model has a positive mean R-Squared; ranking all degrees by error.")
    candidates = results_df

# best_model remains a one-row DataFrame with the same columns as results_df.
best_model = candidates.sort_values(['Mean MAE', 'Mean MAPE', 'Degree']).head(1)
best_row = best_model.iloc[0]

# Print the best model and the metrics it was chosen on
print("\nBest Model: Degree {}".format(int(best_row['Degree'])))
print("Mean R-Squared: {:.2f}".format(best_row['Mean R-Squared']))
print("Mean MAE: {:.2f}".format(best_row['Mean MAE']))
print("Mean MAPE: {:.2f}".format(best_row['Mean MAPE']))



Unnamed: 0,Degree,Mean R-Squared,Std R-Squared,Mean MAE,Std MAE,Mean MAPE,Std MAPE
0,0,-0.039901,0.048473,67.170145,7.652792,0.641334,0.079557
1,1,0.452281,0.127329,45.580788,1.609128,0.408418,0.049973
2,2,-0.089528,0.445264,62.386924,13.185748,0.538072,0.15355
3,3,-508.700734,355.193526,942.22934,141.054469,7.716127,1.608822
4,4,-45.379335,24.676917,295.558549,48.749112,2.391408,0.580308
5,5,-44.383008,23.772236,292.955639,47.967113,2.373367,0.57023
6,6,-44.385918,23.785306,292.951956,47.965431,2.373427,0.570204
7,7,-44.385782,23.785132,292.949499,47.965919,2.373416,0.570204
8,8,-44.385134,23.780852,292.953707,47.967797,2.373451,0.570138



Best Model: Degree 1
Mean R-Squared: 0.45
Mean MAE: 45.58
Mean MAPE: 0.41


The Degree 1 model is selected as the best choice: it is the only candidate with a positive mean R-Squared (0.45, a moderate goodness of fit), it has the lowest prediction errors of all degrees (MAE: 45.58; MAPE: 0.41, i.e. about 41%), and its MAE varies least across the folds. It is also the most interpretable of the fitted models and generalizes best in cross-validation. Its simplicity makes it a sensible option for making predictions on this dataset and in real-world applications. However, it's worth noting that while the Degree 1 model performs best here, it may not handle highly nonlinear data effectively. It is proficient at capturing linear relationships but may struggle with intricate nonlinear patterns. Understanding the dataset's complexity is crucial. If subtle nonlinearities are present, more complex models may improve predictions and capture nuanced relationships, although in these results every polynomial degree above 1 has a negative mean R-Squared and far larger errors, which points to overfitting, so added complexity would need regularization or more data. Careful evaluation of the data's nature is essential for appropriate model selection and improved prediction accuracy.