In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
df = pd.read_csv(r'C:\Users\edoar\Documents\dataset.csv')

In [8]:
df = df.drop(['Unnamed: 0',
 'album_name',
 'track_name'], axis=1)

In [9]:
df=df.dropna().drop_duplicates(subset=['track_id']).set_index('track_id')

In [10]:
# Group by the 'artists' column and calculate the average popularity for each artist
df['avg_artist_pop'] = df.groupby('artists')['popularity'].transform('mean')

In [11]:
df

Unnamed: 0_level_0,artists,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,avg_artist_pop
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,73,230666,False,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic,51.571429
4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,55,149610,False,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic,41.222222
1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,57,210826,False,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic,57.000000
6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic,53.933333
5vjLSffimiIP26QG5WcN2K,Chord Overstreet,82,198853,False,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic,41.727273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,21,384999,False,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music,23.500000
1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,22,385000,False,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music,23.500000
6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,22,271466,False,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music,26.428571
2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,41,283893,False,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music,35.714286


In [12]:
train, test = train_test_split(df, test_size=.2, random_state=1)

In [13]:
predictors = ["avg_artist_pop", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness",
             "valence", "tempo"]
target = "popularity"

In [14]:
def ridge_fit(train, predictors, target, alpha):
    X = train[predictors].copy()
    y = train[[target]].copy()
    
    y_mean = y.mean()
    y = y - y_mean
    
    x_mean = X.mean()
    x_std = X.std()
    
    X = (X - x_mean)/x_std
    
    penalty = alpha * np.identity(X.shape[1])

    B = np.linalg.inv(X.T @ X + penalty) @X.T @ y #matrix multiplication
    diagonal_matrix = np.diag(1/(x_std))
    beta_hat = diagonal_matrix @ B
    beta_hat.index =  ["avg_artist_pop","danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness",
             "valence", "tempo"]
    
    return beta_hat, x_mean, x_std

In [15]:
def ridge_predict(test, predictors, x_mean, x_std, beta_hat):
    test_X = test[predictors]
    
    predictions = test_X @ beta_hat
    return predictions

In [16]:
from sklearn.metrics import mean_absolute_error

errors_dict = {}

alphas = [20**i for i in range(-8, 8)]

In [17]:
kf = KFold(n_splits=5)

In [18]:
# Initialize dictionaries to store errors for each alpha
errors_dict_mae = {}
errors_dict_mse = {}
errors_dict_rmse = {}
errors_dict_r2 = {}

# Iterate over alpha values
for alpha in alphas:
    fold_errors_mae = []
    fold_errors_mse = []
    fold_errors_rmse = []
    fold_errors_r2 = []
    
    # Perform cross-validation
    for train_index, val_index in kf.split(train):
        train_fold = train.iloc[train_index]
        val_fold = train.iloc[val_index]
        
        beta_hat, x_mean, x_std = ridge_fit(train_fold, predictors, target, alpha)
        predictions = ridge_predict(val_fold, predictors, x_mean, x_std, beta_hat)
        
        # Calculate evaluation metrics
        fold_error_mae = mean_absolute_error(val_fold[target], predictions)
        fold_error_mse = mean_squared_error(val_fold[target], predictions)
        fold_error_rmse = np.sqrt(fold_error_mse)  # RMSE is the square root of MSE
        fold_error_r2 = r2_score(val_fold[target], predictions)
        
        fold_errors_mae.append(fold_error_mae)
        fold_errors_mse.append(fold_error_mse)
        fold_errors_rmse.append(fold_error_rmse)
        fold_errors_r2.append(fold_error_r2)
    
    # Store the lists of errors for this alpha in the dictionaries
    errors_dict_mae[alpha] = fold_errors_mae
    errors_dict_mse[alpha] = fold_errors_mse
    errors_dict_rmse[alpha] = fold_errors_rmse
    errors_dict_r2[alpha] = fold_errors_r2

# Find the best alpha based on the lowest average error (MAE, MSE, RMSE, or R²)
best_alpha_mae = min(errors_dict_mae, key=lambda k: np.mean(errors_dict_mae[k]))
best_alpha_mse = min(errors_dict_mse, key=lambda k: np.mean(errors_dict_mse[k]))
best_alpha_rmse = min(errors_dict_rmse, key=lambda k: np.mean(errors_dict_rmse[k]))
best_alpha_r2 = max(errors_dict_r2, key=lambda k: np.mean(errors_dict_r2[k]))  # Use max for R²

# Get the corresponding errors for the best alphas
best_alpha_errors_mae = errors_dict_mae[best_alpha_mae]
best_alpha_errors_mse = errors_dict_mse[best_alpha_mse]
best_alpha_errors_rmse = errors_dict_rmse[best_alpha_rmse]
best_alpha_errors_r2 = errors_dict_r2[best_alpha_r2]

print("Best Alpha (MAE):", best_alpha_mae)
print("Errors for Best Alpha (MAE):", best_alpha_errors_mae)
print()
print("Best Alpha (MSE):", best_alpha_mse)
print("Errors for Best Alpha (MSE):", best_alpha_errors_mse)
print()
print("Best Alpha (RMSE):", best_alpha_rmse)
print("Errors for Best Alpha (RMSE):", best_alpha_errors_rmse)
print()
print("Best Alpha (R²):", best_alpha_r2)
print("Errors for Best Alpha (R²):", best_alpha_errors_r2)


Best Alpha (MAE): 400
Errors for Best Alpha (MAE): [5.251772384718318, 5.218875131871342, 5.246186689705343, 5.197186328559762, 5.30634416745134]

Best Alpha (MSE): 400
Errors for Best Alpha (MSE): [102.88583800890424, 103.86364030331201, 104.91788491986789, 105.35896367221521, 106.73641813625943]

Best Alpha (RMSE): 400
Errors for Best Alpha (RMSE): [10.143265648148246, 10.191351250119485, 10.242943176639606, 10.264451455007968, 10.33133186652425]

Best Alpha (R²): 400
Errors for Best Alpha (R²): [0.7586924199092713, 0.7535161684694536, 0.751277867148691, 0.7524150998800646, 0.7508326350458298]


In [19]:
# Initialize a dictionary to store average metric values for each alpha
average_metrics_dict = {
    'Alpha': alphas,
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': []
}

# Iterate over alpha values
for alpha in alphas:
    fold_errors_mae = []
    fold_errors_mse = []
    fold_errors_rmse = []
    fold_errors_r2 = []
    
    # Perform cross-validation
    for train_index, val_index in kf.split(train):
        train_fold = train.iloc[train_index]
        val_fold = train.iloc[val_index]
        
        beta_hat, x_mean, x_std = ridge_fit(train_fold, predictors, target, alpha)
        predictions = ridge_predict(val_fold, predictors, x_mean, x_std, beta_hat)
        
        # Calculate evaluation metrics
        fold_error_mae = mean_absolute_error(val_fold[target], predictions)
        fold_error_mse = mean_squared_error(val_fold[target], predictions)
        fold_error_rmse = np.sqrt(fold_error_mse)  # RMSE is the square root of MSE
        fold_error_r2 = r2_score(val_fold[target], predictions)
        
        fold_errors_mae.append(fold_error_mae)
        fold_errors_mse.append(fold_error_mse)
        fold_errors_rmse.append(fold_error_rmse)
        fold_errors_r2.append(fold_error_r2)
    
    # Calculate and store the average metric values for this alpha
    average_metrics_dict['MAE'].append(np.mean(fold_errors_mae))
    average_metrics_dict['MSE'].append(np.mean(fold_errors_mse))
    average_metrics_dict['RMSE'].append(np.mean(fold_errors_rmse))
    average_metrics_dict['R2'].append(np.mean(fold_errors_r2))

# Create a DataFrame from the dictionary
average_metrics_df = pd.DataFrame(average_metrics_dict)

# Display the DataFrame
print(average_metrics_df)

           Alpha        MAE          MSE       RMSE        R2
0   3.906250e-11   5.284530   104.814448  10.237701  0.753201
1   7.812500e-10   5.284530   104.814448  10.237701  0.753201
2   1.562500e-08   5.284530   104.814448  10.237701  0.753201
3   3.125000e-07   5.284530   104.814448  10.237701  0.753201
4   6.250000e-06   5.284530   104.814448  10.237701  0.753201
5   1.250000e-04   5.284530   104.814448  10.237701  0.753201
6   2.500000e-03   5.284529   104.814447  10.237701  0.753201
7   5.000000e-02   5.284520   104.814428  10.237700  0.753201
8   1.000000e+00   5.284339   104.814050  10.237682  0.753202
9   2.000000e+01   5.280766   104.806718  10.237323  0.753219
10  4.000000e+02   5.244073   104.752549  10.234669  0.753347
11  8.000000e+03   7.538760   130.076541  11.404908  0.693720
12  1.600000e+05  25.627249   903.154324  30.052478 -1.126692
13  3.200000e+06  32.692672  1479.537497  38.464661 -2.483920
14  6.400000e+07  33.195256  1525.918789  39.062913 -2.593135
15  1.28

In [20]:
# Train the ridge regression model on the entire training set with the best alpha
best_alpha = 400
beta_hat, x_mean, x_std = ridge_fit(train, predictors, target, best_alpha)
test_predictions = ridge_predict(test, predictors, x_mean, x_std, beta_hat)

# Evaluate the model on the test set
mae_test = mean_absolute_error(test[target], test_predictions)
mse_test = mean_squared_error(test[target], test_predictions)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(test[target], test_predictions)

print("Test Set MAE:", mae_test)
print("Test Set MSE:", mse_test)
print("Test Set RMSE:", rmse_test)
print("Test Set R²:", r2_test)



Test Set MAE: 5.066833553956097
Test Set MSE: 98.22817647767084
Test Set RMSE: 9.911012888583631
Test Set R²: 0.7655188734313114


In [21]:
beta_hat

Unnamed: 0,popularity
avg_artist_pop,0.992083
danceability,0.887614
energy,-0.046691
key,0.002503
loudness,-0.039834
mode,-0.122663
speechiness,-0.64772
acousticness,-0.405512
instrumentalness,-0.579143
liveness,-0.376141


## Categorical variables

In [22]:
encoded_df = pd.get_dummies(df, columns=['track_genre', "explicit"])

In [23]:
encoded_df

Unnamed: 0_level_0,artists,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,...,track_genre_swedish,track_genre_synth-pop,track_genre_tango,track_genre_techno,track_genre_trance,track_genre_trip-hop,track_genre_turkish,track_genre_world-music,explicit_False,explicit_True
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,73,230666,0.676,0.4610,1,-6.746,0,0.1430,0.0322,...,False,False,False,False,False,False,False,False,True,False
4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,55,149610,0.420,0.1660,1,-17.235,1,0.0763,0.9240,...,False,False,False,False,False,False,False,False,True,False
1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,57,210826,0.438,0.3590,0,-9.734,1,0.0557,0.2100,...,False,False,False,False,False,False,False,False,True,False
6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,71,201933,0.266,0.0596,0,-18.515,1,0.0363,0.9050,...,False,False,False,False,False,False,False,False,True,False
5vjLSffimiIP26QG5WcN2K,Chord Overstreet,82,198853,0.618,0.4430,2,-9.681,1,0.0526,0.4690,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,21,384999,0.172,0.2350,5,-16.393,1,0.0422,0.6400,...,False,False,False,False,False,False,False,True,True,False
1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,22,385000,0.174,0.1170,0,-18.318,0,0.0401,0.9940,...,False,False,False,False,False,False,False,True,True,False
6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,22,271466,0.629,0.3290,0,-10.895,0,0.0420,0.8670,...,False,False,False,False,False,False,False,True,True,False
2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,41,283893,0.587,0.5060,7,-10.889,1,0.0297,0.3810,...,False,False,False,False,False,False,False,True,True,False


In [24]:
new_df = encoded_df.drop(['artists'], axis=1)

In [25]:
train, test = train_test_split(new_df, test_size=.2, random_state=1)

In [26]:
predictors = ['track_genre_acoustic',
 'track_genre_afrobeat',
 'track_genre_alt-rock',
 'track_genre_alternative',
 'track_genre_ambient',
 'track_genre_anime',
 'track_genre_black-metal',
 'track_genre_bluegrass',
 'track_genre_blues',
 'track_genre_brazil',
 'track_genre_breakbeat',
 'track_genre_british',
 'track_genre_cantopop',
 'track_genre_chicago-house',
 'track_genre_children',
 'track_genre_chill',
 'track_genre_classical',
 'track_genre_club',
 'track_genre_comedy',
 'track_genre_country',
 'track_genre_dance',
 'track_genre_dancehall',
 'track_genre_death-metal',
 'track_genre_deep-house',
 'track_genre_detroit-techno',
 'track_genre_disco',
 'track_genre_disney',
 'track_genre_drum-and-bass',
 'track_genre_dub',
 'track_genre_dubstep',
 'track_genre_edm',
 'track_genre_electro',
 'track_genre_electronic',
 'track_genre_emo',
 'track_genre_folk',
 'track_genre_forro',
 'track_genre_french',
 'track_genre_funk',
 'track_genre_garage',
 'track_genre_german',
 'track_genre_gospel',
 'track_genre_goth',
 'track_genre_grindcore',
 'track_genre_groove',
 'track_genre_grunge',
 'track_genre_guitar',
 'track_genre_happy',
 'track_genre_hard-rock',
 'track_genre_hardcore',
 'track_genre_hardstyle',
 'track_genre_heavy-metal',
 'track_genre_hip-hop',
 'track_genre_honky-tonk',
 'track_genre_house',
 'track_genre_idm',
 'track_genre_indian',
 'track_genre_indie',
 'track_genre_indie-pop',
 'track_genre_industrial',
 'track_genre_iranian',
 'track_genre_j-dance',
 'track_genre_j-idol',
 'track_genre_j-pop',
 'track_genre_j-rock',
 'track_genre_jazz',
 'track_genre_k-pop',
 'track_genre_kids',
 'track_genre_latin',
 'track_genre_latino',
 'track_genre_malay',
 'track_genre_mandopop',
 'track_genre_metal',
 'track_genre_metalcore',
 'track_genre_minimal-techno',
 'track_genre_mpb',
 'track_genre_new-age',
 'track_genre_opera',
 'track_genre_pagode',
 'track_genre_party',
 'track_genre_piano',
 'track_genre_pop',
 'track_genre_pop-film',
 'track_genre_power-pop',
 'track_genre_progressive-house',
 'track_genre_psych-rock',
 'track_genre_punk',
 'track_genre_punk-rock',
 'track_genre_r-n-b',
 'track_genre_reggae',
 'track_genre_reggaeton',
 'track_genre_rock',
 'track_genre_rock-n-roll',
 'track_genre_rockabilly',
 'track_genre_romance',
 'track_genre_sad',
 'track_genre_salsa',
 'track_genre_samba',
 'track_genre_sertanejo',
 'track_genre_show-tunes',
 'track_genre_singer-songwriter',
 'track_genre_ska',
 'track_genre_sleep',
 'track_genre_soul',
 'track_genre_spanish',
 'track_genre_study',
 'track_genre_swedish',
 'track_genre_synth-pop',
 'track_genre_tango',
 'track_genre_techno',
 'track_genre_trance',
 'track_genre_trip-hop',
 'track_genre_turkish',
 'track_genre_world-music',
 'explicit_False',
 'explicit_True',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'avg_artist_pop']
target = "popularity"

In [27]:
def ridge_fit(train, predictors, target, alpha):
    X = train[predictors].copy()
    y = train[[target]].copy()
    
    y_mean = y.mean()
    y = y - y_mean
    
    x_mean = X.mean()
    x_std = X.std()
    
    X = (X - x_mean)/x_std
    
    penalty = alpha * np.identity(X.shape[1])

    B = np.linalg.inv(X.T @ X + penalty) @X.T @ y #matrix multiplication
    diagonal_matrix = np.diag(1/(x_std))
    beta_hat = diagonal_matrix @ B
    beta_hat.index =  ['track_genre_acoustic',
 'track_genre_afrobeat',
 'track_genre_alt-rock',
 'track_genre_alternative',
 'track_genre_ambient',
 'track_genre_anime',
 'track_genre_black-metal',
 'track_genre_bluegrass',
 'track_genre_blues',
 'track_genre_brazil',
 'track_genre_breakbeat',
 'track_genre_british',
 'track_genre_cantopop',
 'track_genre_chicago-house',
 'track_genre_children',
 'track_genre_chill',
 'track_genre_classical',
 'track_genre_club',
 'track_genre_comedy',
 'track_genre_country',
 'track_genre_dance',
 'track_genre_dancehall',
 'track_genre_death-metal',
 'track_genre_deep-house',
 'track_genre_detroit-techno',
 'track_genre_disco',
 'track_genre_disney',
 'track_genre_drum-and-bass',
 'track_genre_dub',
 'track_genre_dubstep',
 'track_genre_edm',
 'track_genre_electro',
 'track_genre_electronic',
 'track_genre_emo',
 'track_genre_folk',
 'track_genre_forro',
 'track_genre_french',
 'track_genre_funk',
 'track_genre_garage',
 'track_genre_german',
 'track_genre_gospel',
 'track_genre_goth',
 'track_genre_grindcore',
 'track_genre_groove',
 'track_genre_grunge',
 'track_genre_guitar',
 'track_genre_happy',
 'track_genre_hard-rock',
 'track_genre_hardcore',
 'track_genre_hardstyle',
 'track_genre_heavy-metal',
 'track_genre_hip-hop',
 'track_genre_honky-tonk',
 'track_genre_house',
 'track_genre_idm',
 'track_genre_indian',
 'track_genre_indie',
 'track_genre_indie-pop',
 'track_genre_industrial',
 'track_genre_iranian',
 'track_genre_j-dance',
 'track_genre_j-idol',
 'track_genre_j-pop',
 'track_genre_j-rock',
 'track_genre_jazz',
 'track_genre_k-pop',
 'track_genre_kids',
 'track_genre_latin',
 'track_genre_latino',
 'track_genre_malay',
 'track_genre_mandopop',
 'track_genre_metal',
 'track_genre_metalcore',
 'track_genre_minimal-techno',
 'track_genre_mpb',
 'track_genre_new-age',
 'track_genre_opera',
 'track_genre_pagode',
 'track_genre_party',
 'track_genre_piano',
 'track_genre_pop',
 'track_genre_pop-film',
 'track_genre_power-pop',
 'track_genre_progressive-house',
 'track_genre_psych-rock',
 'track_genre_punk',
 'track_genre_punk-rock',
 'track_genre_r-n-b',
 'track_genre_reggae',
 'track_genre_reggaeton',
 'track_genre_rock',
 'track_genre_rock-n-roll',
 'track_genre_rockabilly',
 'track_genre_romance',
 'track_genre_sad',
 'track_genre_salsa',
 'track_genre_samba',
 'track_genre_sertanejo',
 'track_genre_show-tunes',
 'track_genre_singer-songwriter',
 'track_genre_ska',
 'track_genre_sleep',
 'track_genre_soul',
 'track_genre_spanish',
 'track_genre_study',
 'track_genre_swedish',
 'track_genre_synth-pop',
 'track_genre_tango',
 'track_genre_techno',
 'track_genre_trance',
 'track_genre_trip-hop',
 'track_genre_turkish',
 'track_genre_world-music',
 'explicit_False',
 'explicit_True',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'avg_artist_pop']
    
    return beta_hat, x_mean, x_std

In [28]:
def ridge_predict(test, predictors, x_mean, x_std, beta_hat):
    test_X = test[predictors]
    
    predictions = test_X @ beta_hat
    return predictions

In [29]:
errors_dict = {}

alphas = [20**i for i in range(-8, 8)]

In [30]:
for alpha in alphas:
    beta_hat, x_mean, x_std = ridge_fit(train, predictors, target, alpha)
    predictions = ridge_predict(test, predictors, x_mean, x_std, beta_hat)
    error = mean_absolute_error(test[target], predictions)
    
    # Store the error in the dictionary with alpha as the key
    errors_dict[alpha] = [error]

In [31]:
errors_dict

{3.90625e-11: [5.736629986303227],
 7.8125e-10: [5.184015751069055],
 1.5625e-08: [5.16244269046153],
 3.125e-07: [5.161541312209975],
 6.25e-06: [5.161510342948345],
 0.000125: [5.161510645469622],
 0.0025: [5.161510936228232],
 0.05: [5.161518199407494],
 1: [5.161663620679337],
 20: [5.1646057699141465],
 400: [5.23855380690129],
 8000: [8.276135103759724],
 160000: [25.500070158373614],
 3200000: [32.51877013193513],
 64000000: [33.07454190723147],
 1280000000: [33.103287232163446]}

In [32]:
kf = KFold(n_splits=5)

In [33]:
# Initialize dictionaries to store errors for each alpha
errors_dict_mae = {}
errors_dict_mse = {}
errors_dict_rmse = {}
errors_dict_r2 = {}

# Iterate over alpha values
for alpha in alphas:
    fold_errors_mae = []
    fold_errors_mse = []
    fold_errors_rmse = []
    fold_errors_r2 = []
    
    # Perform cross-validation
    for train_index, val_index in kf.split(train):
        train_fold = train.iloc[train_index]
        val_fold = train.iloc[val_index]
        
        beta_hat, x_mean, x_std = ridge_fit(train_fold, predictors, target, alpha)
        predictions = ridge_predict(val_fold, predictors, x_mean, x_std, beta_hat)
        
        # Calculate evaluation metrics
        fold_error_mae = mean_absolute_error(val_fold[target], predictions)
        fold_error_mse = mean_squared_error(val_fold[target], predictions)
        fold_error_rmse = np.sqrt(fold_error_mse)  # RMSE is the square root of MSE
        fold_error_r2 = r2_score(val_fold[target], predictions)
        
        fold_errors_mae.append(fold_error_mae)
        fold_errors_mse.append(fold_error_mse)
        fold_errors_rmse.append(fold_error_rmse)
        fold_errors_r2.append(fold_error_r2)
    
    # Store the lists of errors for this alpha in the dictionaries
    errors_dict_mae[alpha] = fold_errors_mae
    errors_dict_mse[alpha] = fold_errors_mse
    errors_dict_rmse[alpha] = fold_errors_rmse
    errors_dict_r2[alpha] = fold_errors_r2

# Find the best alpha based on the lowest average error (MAE, MSE, RMSE, or R²)
best_alpha_mae = min(errors_dict_mae, key=lambda k: np.mean(errors_dict_mae[k]))
best_alpha_mse = min(errors_dict_mse, key=lambda k: np.mean(errors_dict_mse[k]))
best_alpha_rmse = min(errors_dict_rmse, key=lambda k: np.mean(errors_dict_rmse[k]))
best_alpha_r2 = max(errors_dict_r2, key=lambda k: np.mean(errors_dict_r2[k]))  # Use max for R²

# Get the corresponding errors for the best alphas
best_alpha_errors_mae = errors_dict_mae[best_alpha_mae]
best_alpha_errors_mse = errors_dict_mse[best_alpha_mse]
best_alpha_errors_rmse = errors_dict_rmse[best_alpha_rmse]
best_alpha_errors_r2 = errors_dict_r2[best_alpha_r2]

print("Best Alpha (MAE):", best_alpha_mae)
print("Errors for Best Alpha (MAE):", best_alpha_errors_mae)
print()
print("Best Alpha (MSE):", best_alpha_mse)
print("Errors for Best Alpha (MSE):", best_alpha_errors_mse)
print()
print("Best Alpha (RMSE):", best_alpha_rmse)
print("Errors for Best Alpha (RMSE):", best_alpha_errors_rmse)
print()
print("Best Alpha (R²):", best_alpha_r2)
print("Errors for Best Alpha (R²):", best_alpha_errors_r2)


Best Alpha (MAE): 1.5625e-08
Errors for Best Alpha (MAE): [5.388450598744739, 5.281628118799283, 5.298684715255397, 5.36154310564516, 5.423975267377857]

Best Alpha (MSE): 1.5625e-08
Errors for Best Alpha (MSE): [102.95834972862515, 103.02037837837361, 104.31719962389495, 105.75787347811585, 106.01879473839733]

Best Alpha (RMSE): 1.5625e-08
Errors for Best Alpha (RMSE): [10.14683939602008, 10.149895486081302, 10.213579177932433, 10.283864715082354, 10.296542853715383]

Best Alpha (R²): 1.5625e-08
Errors for Best Alpha (R²): [0.7585223515310309, 0.7555173541551821, 0.7527018734379993, 0.7514776946417401, 0.7525078676814845]


In [34]:
# Initialize a dictionary to store average metric values for each alpha
average_metrics_dict = {
    'Alpha': alphas,
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': []
}

# Iterate over alpha values
for alpha in alphas:
    fold_errors_mae = []
    fold_errors_mse = []
    fold_errors_rmse = []
    fold_errors_r2 = []
    
    # Perform cross-validation
    for train_index, val_index in kf.split(train):
        train_fold = train.iloc[train_index]
        val_fold = train.iloc[val_index]
        
        beta_hat, x_mean, x_std = ridge_fit(train_fold, predictors, target, alpha)
        predictions = ridge_predict(val_fold, predictors, x_mean, x_std, beta_hat)
        
        # Calculate evaluation metrics
        fold_error_mae = mean_absolute_error(val_fold[target], predictions)
        fold_error_mse = mean_squared_error(val_fold[target], predictions)
        fold_error_rmse = np.sqrt(fold_error_mse)  # RMSE is the square root of MSE
        fold_error_r2 = r2_score(val_fold[target], predictions)
        
        fold_errors_mae.append(fold_error_mae)
        fold_errors_mse.append(fold_error_mse)
        fold_errors_rmse.append(fold_error_rmse)
        fold_errors_r2.append(fold_error_r2)
    
    # Calculate and store the average metric values for this alpha
    average_metrics_dict['MAE'].append(np.mean(fold_errors_mae))
    average_metrics_dict['MSE'].append(np.mean(fold_errors_mse))
    average_metrics_dict['RMSE'].append(np.mean(fold_errors_rmse))
    average_metrics_dict['R2'].append(np.mean(fold_errors_r2))

# Create a DataFrame from the dictionary
average_metrics_df = pd.DataFrame(average_metrics_dict)

# Display the DataFrame
print(average_metrics_df)


           Alpha        MAE          MSE       RMSE        R2
0   3.906250e-11   5.743850   107.981477  10.390840  0.745729
1   7.812500e-10   5.351916   104.421927  10.218518  0.754128
2   1.562500e-08   5.350856   104.414519  10.218144  0.754145
3   3.125000e-07   5.351032   104.415659  10.218200  0.754143
4   6.250000e-06   5.351043   104.415744  10.218204  0.754143
5   1.250000e-04   5.351043   104.415747  10.218204  0.754143
6   2.500000e-03   5.351044   104.415750  10.218204  0.754143
7   5.000000e-02   5.351052   104.415807  10.218207  0.754142
8   1.000000e+00   5.351222   104.416945  10.218262  0.754140
9   2.000000e+01   5.354686   104.440194  10.219399  0.754085
10  4.000000e+02   5.449648   105.089035  10.251085  0.752559
11  8.000000e+03   9.290449   158.843205  12.602991  0.626005
12  1.600000e+05  26.723292   939.076644  30.644307 -1.211263
13  3.200000e+06  32.751480  1479.083165  38.458755 -2.482850
14  6.400000e+07  33.198152  1525.877410  39.062383 -2.593037
15  1.28

In [35]:
# Train the ridge regression model on the entire training set with the best alpha
best_alpha = 7.8125e-10
beta_hat, x_mean, x_std = ridge_fit(train, predictors, target, best_alpha)
test_predictions = ridge_predict(test, predictors, x_mean, x_std, beta_hat)

# Evaluate the model on the test set
mae_test = mean_absolute_error(test[target], test_predictions)
mse_test = mean_squared_error(test[target], test_predictions)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(test[target], test_predictions)

print("Test Set MAE:", mae_test)
print("Test Set MSE:", mse_test)
print("Test Set RMSE:", rmse_test)
print("Test Set R²:", r2_test)

Test Set MAE: 5.184015751069055
Test Set MSE: 97.77940625353891
Test Set RMSE: 9.888346992978093
Test Set R²: 0.7665901357869642
