# Machine Learning Model Training: Predict if Song will be Liked by Me

In [1]:
import pandas as pd
df_ready_to_used = pd.read_csv('./data/dataset_eda_and_preprocessed.csv').drop(columns='Unnamed: 0')
df_ready_to_used

Unnamed: 0,artists,track_genre,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,target
0,The Killers;Ryan Pardey,rock,0,245106,False,0.588000,0.847000,8,-4.164000,1,0.070500,0.060100,0.000000e+00,0.307000,0.662000,120.041000,0
1,John Denver,singer-songwriter,62,245533,False,0.406000,0.188000,7,-13.039000,1,0.032400,0.852000,0.000000e+00,0.105000,0.629000,120.757000,0
2,Skindred,metal,0,183760,False,0.583000,0.915000,0,-4.007000,1,0.081100,0.002140,0.000000e+00,0.395000,0.880000,172.097000,0
3,Lyn Lapid,songwriter,58,190243,False,0.492000,0.227000,2,-10.679000,1,0.027700,0.772000,6.570000e-05,0.106000,0.229000,137.697000,0
4,Alphaxone,iranian,11,434000,False,0.144000,0.167000,1,-24.642000,0,0.041900,0.877000,9.670000e-01,0.114000,0.039300,103.660000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11339,Eagles,rock,76,393640,False,0.533000,0.360000,1,-12.633000,1,0.028200,0.080500,1.030000e-05,0.063100,0.299000,131.229000,1
11340,Clean Bandit;Zara Larsson,dance,76,212733,False,0.715000,0.605000,0,-5.128000,0,0.042800,0.239000,1.410000e-05,0.189000,0.454000,122.956000,1
11341,Martin Garrix;Bebe Rexha,house,80,195854,False,0.660916,0.433850,4,-8.280813,0,0.032385,0.091762,2.478347e-05,0.212396,0.431065,103.553402,1
11342,Fall Out Boy,rock,82,228458,False,0.607713,0.475443,8,-6.713909,1,0.040751,0.199800,6.372013e-07,0.350642,0.522158,170.416324,1


## Split Train and Test

We will use the ratio of training/validation/test = **60/20/20**

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

df_full_train, df_test = train_test_split(df_ready_to_used, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.target.values
y_val = df_val.target.values
y_test = df_test.target.values

del df_train['target']
del df_val['target']
del df_test['target']

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

## Training Untuned Version of SGD Classifier

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

model = SGDClassifier(loss='log_loss', random_state=42)
model.fit(X_train, y_train)


In [4]:
model.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'log_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [5]:
y_pred = model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.5


In [6]:
y_pred = model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.5


In [7]:
y_pred = model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.5


## Train a Tuned Version of SGDClassifier

In [9]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'tol': [0.0001, 0.001, 0.01, 0.1],
    'penalty' : ['l2', 'l1', 'elasticnet', None],
    'eta0': [0.0001, 0.001, 0.01],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'max_iter': [100, 500],
}
grid_search = RandomizedSearchCV(
    SGDClassifier(loss='log_loss', random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=5, verbose=3, n_jobs=-1, scoring='roc_auc', random_state=42
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 3/5] END alpha=0.1, eta0=0.01, learning_rate=optimal, max_iter=500, penalty=l2, tol=0.1;, score=0.533 total time=   8.0s
[CV 2/5] END alpha=0.1, eta0=0.01, learning_rate=optimal, max_iter=500, penalty=l2, tol=0.1;, score=0.534 total time=  11.8s
[CV 3/5] END alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.01;, score=0.864 total time=  13.3s
[CV 2/5] END alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.01;, score=0.897 total time=  14.0s
[CV 1/5] END alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.01;, score=0.898 total time=  15.8s
[CV 5/5] END alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.01;, score=0.850 total time=  16.5s
[CV 4/5] END alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.01;, score=0.861 total time=  16.3s
[CV 1/5] END alpha=0.1, eta0=0.01, learning_rate=optimal, max_iter=500, penalty=l2, tol=0.1;, score=0.522 tota

In [10]:
final_model = grid_search.best_estimator_
final_model

We get the best model of `SGDClassfier` with parameter alpha=0.01, eta0=0.01, learning_rate=adaptive, max_iter=100 and penalty=l1

In [11]:
y_pred = final_model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.8091939881637943


In [12]:
y_pred = final_model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.8289620691581475


In [13]:
y_pred = final_model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.7998814604077762


## Trying the second model: LightGBM

In [14]:
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.2.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Downloading lightgbm-4.2.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.2.0


In [22]:
from lightgbm import LGBMClassifier

lgb_params = {
    'min_child_weight': [1, 5, 10],
    'num_leaves': [10, 20, 30, 40],
    'boosting_type': ['gbdt', 'dart'],
    'learning_rate': [0.5, 0.1, 0.05, 0.01, 0.005],
    'n_estimators': [10, 50, 100, 500]
}


lgb = LGBMClassifier(objective='binary', nthread=8, seed=1, verbosity=1)
random_search = RandomizedSearchCV(
    lgb,
    param_distributions=lgb_params,
    n_iter=30,
    scoring='roc_auc',
    n_jobs=4, cv=5, verbose=3, random_state=42
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 2932, number of negative: 2876
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2990
[LightGBM] [Info] Number of data points in the train set: 5808, number of used features: 179
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504821 -> initscore=0.019284
[LightGBM] [Info] Start training from score 0.019284
[LightGBM] [Info] Number of positive: 2932, number of negative: 2876
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2980
[LightGBM] [Info] Number of data points in the train set: 5808, number of used features: 174
[LightGBM] [Info] [binary:

In [24]:
final_model = random_search.best_estimator_
{
    key: final_model.get_params()[key]
    for key in final_model.get_params()
    if key in lgb_params
}

{'boosting_type': 'gbdt',
 'learning_rate': 0.5,
 'min_child_weight': 1,
 'n_estimators': 100,
 'num_leaves': 20}

In [25]:
y_pred = final_model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 1.0


In [28]:
y_pred = final_model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.997280938883667


In [29]:
y_pred = final_model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.9961274475510886


## Result of Experiment

In [30]:
res = pd.DataFrame({
    "Training Approach": [
        "Untuned SGDClassifier",
        "Tuned SGDClassifier",
        "Tuned LGBM",
    ],
    "ROC-AUC on Training Set": [
        0.5,
        0.8091939881637943,
        1.0,
    ],
    "ROC-AUC on Validation Set": [
        0.5,
        0.8289620691581475,
        0.997280938883667,
    ],
    "ROC-AUC on Test Set": [
        0.5,
        0.7998814604077762,
        0.9961274475510886,
    ],
})
res

Unnamed: 0,Training Approach,ROC-AUC on Training Set,ROC-AUC on Validation Set,ROC-AUC on Test Set
0,Untuned SGDClassifier,0.5,0.5,0.5
1,Tuned SGDClassifier,0.809194,0.828962,0.799881
2,Tuned LGBM,1.0,0.997281,0.996127


> Summary: Tuned LGBM gives the ROC-AUC boost both in training, validation, and test set. It also gives the best ROC-AUC compared to SGDClassifier both tuned and untuned version. In this experiment, we can also see that SGDClassifier performing well when it's tuned rather than plain SGDClassifier

> So, we'll take the Tuned LGBM as a chosen model that will be deployed to production grade code

Choosen model is:

```python
model = LGBMClassifier(
  boosting_type='gbdt',
  learning_rate=0.5,
  min_child_weight=1,
  n_estimators=100,
  num_leaves=20,
  random_state=42
)
```