## Ridge Regression

Download the Spotify Tracks Dataset and perform ridge regression to predict the tracks’ popularity. Note that this dataset contains both numerical and categorical features. 

The student is thus required to follow these guidelines:

1. train the model using only the numerical features,
2. appropriately handle the categorical features (for example, with one-hot encoding or other techniques) and use them together with the numerical ones to train the model, in both cases, experiment with different training parameters, 
3. use 5-fold cross validation to compute your risk estimates,
4. thoroughly discuss and compare the performance of the model

The student is required to implement from scratch (without using libraries, such as Scikit-learn) the code for the ridge regression, while it is not mandatory to do so for the implementation of the 5-fold cross-validation.
Optional: Instead of regular ridge regression, implement kernel ridge regression using a Gaussian kernel.


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("dataset.csv", index_col=0)
print(df.dtypes)

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object


In [3]:
col_list_numerical = ['duration_ms','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature','popularity']

df_num = df[col_list_numerical]
df_num

Unnamed: 0_level_0,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,230666,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,73
1,149610,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,55
2,210826,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,57
3,201933,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,71
4,198853,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,384999,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,21
113996,385000,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,22
113997,271466,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,22
113998,283893,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,41


In [4]:
df_num_train = df_num.sample(frac=0.8,random_state=1)
print(df_num_train.shape)
df_num_test = df_num.drop(df_num_train.index)
print(df_num_test.shape)

(91200, 14)
(22800, 14)


In [6]:
df_num_y = df_num['popularity']
df_num_x = df_num.drop('popularity',axis=1)
#df_num_x = pd.DataFrame(scaler.fit_transform(df_num_x.to_numpy()))

df_num_train_y = df_num_train['popularity']
df_num_train_x = df_num_train.drop('popularity',axis=1)
#df_num_train_x = pd.DataFrame(scaler.fit_transform(df_num_train_x.to_numpy()))

df_num_test_y = df_num_test['popularity']
df_num_test_x = df_num_test.drop('popularity',axis=1)
#df_num_test_x = pd.DataFrame(scaler.fit_transform(df_num_test_x.to_numpy()))

In [7]:
df_num_train_x.iloc[0]

duration_ms         239893.000000
danceability             0.850000
energy                   0.660000
key                     11.000000
loudness                -4.846000
mode                     0.000000
speechiness              0.169000
acousticness             0.417000
instrumentalness         0.000051
liveness                 0.209000
valence                  0.622000
tempo                  101.947000
time_signature           4.000000
Name: 21719, dtype: float64

In [8]:
from linearReg import LinearReg
lin = LinearReg() 
lin.fit(df_num_train_x,df_num_train_y)

In [9]:
pred = lin.predict(df_num_test_x)
print(pred)
score = lin.r2_score(df_num_test_y,pred)
print(score) # 

[34.01025199373907, 36.20308985361978, 35.49282096325105, 34.46588769998675, 37.02719522298152, 38.473377455886, 33.44296783165149, 33.44296783165149, 37.43542867535001, 38.740485396597975, 33.65332560761563, 38.435216599911904, 35.59523705635394, 35.11434870081281, 36.165504967733895, 36.074122050369894, 37.12655174940663, 37.35053671823948, 37.35053671823948, 37.35053671823948, 37.35053671823948, 34.108275015980006, 36.797705395644485, 37.792447728776, 38.9224068670844, 37.8699536797109, 36.99244680770561, 36.458860905234175, 30.957624445430216, 34.79832036173438, 31.43909432616067, 35.33655315486811, 35.61038884385531, 37.677570017129405, 33.72602653764109, 38.27104169078602, 35.296765816856144, 29.833196777858582, 39.330449606109354, 35.33142384815095, 36.840180190901364, 37.32366387568267, 30.848310836648945, 36.06391568827962, 36.6221334242214, 35.35066375238616, 37.36633366001882, 33.99037007343992, 28.69596783311392, 29.844576351550923, 35.76218860016088, 40.181819494368845, 35

In [10]:
from sklearn.linear_model import LinearRegression
sk_mlr = LinearRegression()

# fit scikit-learn's LR to our data
sk_mlr.fit(df_num_train_x, df_num_train_y)

# predicts and scores
sk_score = sk_mlr.score(df_num_test_x, df_num_test_y)
print(f'Scikit-Learn\'s Final R^2 score: {sk_score}')

Scikit-Learn's Final R^2 score: 0.022133883201548965


In [13]:
from ridgeReg import RidgeReg
rid = RidgeReg() 
rid.fit(df_num_train_x,df_num_train_y,0.1)

In [15]:
pred = rid.predict(df_num_test_x)

score = rid.r2_score(df_num_test_y,pred)
print(score)

0.022133904087806355


In [21]:
from sklearn.model_selection import KFold
print("Find best value for alpha for ridge regression")
kf = KFold(n_splits=10)
best_a = 0;
best = -9999;
for alpha in np.linspace(60,65,10):
    scores = []
    for train,test in kf.split(df_num):
        x_train, y_train = df_num_x.loc[df_num_x.index[train]], df_num_y.loc[df_num_y.index[train]]
        x_test, y_test = df_num_x.loc[df_num_x.index[test]], df_num_y.loc[df_num_y.index[test]]

        rid = RidgeReg() 
        rid.fit(x_train,y_train, alpha)
        pred = rid.predict(x_test)
        score = rid.r2_score(y_test,pred)
        scores.append(score)
    print(scores)
    avg = sum(scores)/len(scores)
    if(avg>best):
        best = avg
        best_a = alpha
    print(f"alpha {alpha}: {avg}")
print(best_a)
# -0.010612969266184435

Find best value for alpha for ridge regression
[0.005973711742886412, 0.00933368147776592, 0.036941674101224065, 0.022402927511268045, -0.005308103010642773, -0.08336059699510634, -0.03211737248061697, -0.028064482934420543, -0.04106615955097692, 0.00919289995244743]
alpha 60.0: -0.010607182018617167
[0.005962671137399234, 0.00932708490440426, 0.036921571811516674, 0.022406898842175615, -0.005309314778813468, -0.08335647971210669, -0.03210571004896856, -0.02806792173791761, -0.04105222801965591, 0.009202607613363778]
alpha 60.55555555555556: -0.010607081998860268
[0.005951640158814886, 0.009320453255332484, 0.03690149223264105, 0.022410841204715037, -0.00531052501321172, -0.08335238570763903, -0.03209409251078443, -0.028071362617163365, -0.041038380190983936, 0.009212274542269805]
alpha 61.111111111111114: -0.010607004464600922
[0.00594061880127339, 0.00931378673552774, 0.03688143531185406, 0.022414754730417652, -0.005311733715253286, -0.08334831487714123, -0.03208251967540976, -0.0280

In [None]:

rid = RidgeReg() 

kf = KFold(n_splits=5)
scores_lin = []
scores_ridge = []
for train,test in kf.split(df_num):
    x_train, y_train = df_num_x.loc[df_num_x.index[train]], df_num_y.loc[df_num_y.index[train]]
    x_test, y_test = df_num_x.loc[df_num_x.index[test]], df_num_y.loc[df_num_y.index[test]]
    
    lin = LinearReg() 
    lin.fit(x_train,y_train)
    pred = lin.predict(x_test)
    score = lin.r2_score(y_test,pred)
    scores_lin.append(score)
    
    rid = RidgeReg() 
    rid.fit(x_train,y_train, 1)
    pred = rid.predict(x_test)
    score = rid.r2_score(y_test,pred)
    scores_ridge.append(score)
       
print(scores_lin)
print(max(scores_lin))

print(scores_ridge)
print(max(scores_ridge))


[ 2.12232348e-06  9.77006303e+00 -7.53058369e-01 -2.14468504e-03
  1.10881979e-02 -5.31735623e-01 -1.45557472e+01 -2.00132221e+00
 -8.51490078e+00  1.72284594e+00 -8.87059428e+00  1.40952432e-02
  9.87127569e-01]
29.899210125244615
[-2.76089995e-07  7.73269787e+00 -1.14164409e+00 -5.13940160e-02
 -1.04265804e-02 -5.89533047e-01 -1.31573070e+01 -1.28718553e+00
 -7.11130865e+00  2.31019326e+00 -9.18054577e+00  1.16912125e-02
  1.04925966e+00]
30.704523971364985
[-5.11121481e-06  1.02618038e+01 -2.92558440e+00 -4.16847976e-02
  5.21265267e-02 -4.83140573e-01 -1.15679142e+01 -1.50366240e+00
 -7.89757210e+00  1.76340669e+00 -1.15043892e+01  1.88159845e-02
  1.18565948e+00]
33.39646764318901
[-4.57276424e-06  1.01536830e+01 -3.86443988e+00 -1.20784395e-02
  2.00045191e-01 -4.57164088e-01 -9.93494771e+00 -1.76768742e+00
 -8.07349254e+00  1.20888755e+00 -1.03784056e+01  1.40403544e-02
  7.21634071e-01]
34.88436259569198
[-2.02351590e-06  4.70237663e+00 -6.96513146e+00 -3.90245526e-02
  3.81570

In [52]:
from sklearn.linear_model import Ridge, RidgeCV
sk_ridge = RidgeCV(alphas=[0.1,1,2,3,10,5.4,4.8,4.9,60,62.7,100,200,4.7,4.6,4.5,4.4,4.39,4.38,4.37,4.36])
sk_ridge = sk_ridge.fit(df_num_train_x,df_num_train_y)
print(sk_ridge.alpha_)
sk_score = sk_ridge.score(df_num_test_x, df_num_test_y)
print(f'Scikit-Learn\'s Final R^2 score: {sk_score}')

4.37
Scikit-Learn's Final R^2 score: 0.0221343720366024


In [None]:
ridge = RidgeReg() 
ridge.fit(df_num_train_x,df_num_train_y,0.1)
pred = ridge.predict(df_num_test_x)

score = ridge.r2_score(df_num_test_y,pred)
print(score)

0.022133904087806355
