# ITMAL Search Quest for Spotity Track popularity regression model

Using the great analysis code by CEF we will first perform a RandomGridSearch on diffent models and parameters. Based on the results the best model will be selected and we will perform a more extensive GridSearch for the optimal hyperpameters to define the best model for predicting the popularity feature of our data set.

In [1]:
import sys
import os
# sys.path.append("../include/")
basePath = os.getcwd()
basePath = os.path.split(basePath)[0]
basePath = os.path.split(basePath)[0]
sys.path.append(os.path.join(basePath, "include"))

### Helper functions

In [2]:
# credit to CEF with modification

from time import time
import numpy as np

from sklearn import svm
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn import datasets

from libitmal import dataloaders_v3 as itmaldataloaders


import pandas as pd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

data_csv_path = basePath + "/Slutprojekt/csv/spotifyDBData_preprocessed_onehotenc.csv"
spotifyDBData = pd.read_csv(data_csv_path, sep=',', header=0)
input_features = [
#     "popularity",
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
    "genre_alternative",
    "genre_anime",
    "genre_blues",
    "genre_childrens-music",
    "genre_classical",
    "genre_comedy",
    "genre_country",
    "genre_dance",
    "genre_electronic",
    "genre_folk",
    "genre_hip-hop",
    "genre_indie",
    "genre_jazz",
    "genre_movie",
    "genre_opera",
    "genre_pop",
    "genre_rnb",
    "genre_rap",
    "genre_reggae",
    "genre_reggaeton",
    "genre_rock",
    "genre_ska",
    "genre_soul",
    "genre_soundtrack",
    "genre_world",
    "mode_major",
    "mode_minor",
    "time_signature_3-4",
    "time_signature_4-4",
    "time_signature_5-4"
]

def getSpotifyBinarizedXY():
    X = np.array(spotifyDBData[input_features])

    lb = LabelBinarizer()
    # lb.fit(np.array(spotifyDBData['genre']))
    y = lb.fit_transform(np.array(spotifyDBData['genre']))

    print("Binarized Label to the following classes")

    return X, y

def getSpotifyIntLabeledXY():
    X = np.array(spotifyDBData[input_features])

    label_encoder = LabelEncoder() 
    y = spotifyDBData['popularity']
    # print(y.unique())

    return X, y


#%%
currmode="N/A" # GLOBAL var!

def SearchReport(model): 
    
    def GetBestModelCTOR(model, best_params):
        def GetParams(best_params):
            r=""          
            for key in sorted(best_params):
                value = best_params[key]
                t = "'" if str(type(value))=="<class 'str'>" else ""
                if len(r)>0:
                    r += ','
                r += f'{key}={t}{value}{t}'  
            return r            
        try:
            p = GetParams(best_params)
            return type(model).__name__ + '(' + p + ')' 
        except:
            return "N/A(1)"
        
    print("\nBest model set found on train set:")
    print()
    print(f"\tbest parameters={model.best_params_}")
    print(f"\tbest '{model.scoring}' score={model.best_score_}")
    print(f"\tbest index={model.best_index_}")
    print()
    print(f"Best estimator CTOR:")
    print(f"\t{model.best_estimator_}")
    print()
    try:
        print(f"Grid scores ('{model.scoring}') on development set:")
        means = model.cv_results_['mean_test_score']
        stds  = model.cv_results_['std_test_score']
        i=0
        for mean, std, params in zip(means, stds, model.cv_results_['params']):
            print("\t[%2d]: %0.3f (+/-%0.03f) for %r" % (i, mean, std * 2, params))
            i += 1
    except:
        print("WARNING: the random search do not provide means/stds")
    
    global currmode                
#     assert "f1_micro"==str(model.scoring), f"come on, we need to fix the scoring to be able to compare model-fits! Your scoreing={str(model.scoring)}...remember to add scoring='f1_micro' to the search"   
    return f"best: dat={currmode}, score={model.best_score_:0.5f}, model={GetBestModelCTOR(model.estimator,model.best_params_)}", model.best_estimator_ 

def ClassificationReport(model, X_test, y_test, target_names=None):
    assert X_test.shape[0]==y_test.shape[0]
    print("\nDetailed classification report:")
    print("\tThe model is trained on the full development set.")
    print("\tThe scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, model.predict(X_test)                 
    print(classification_report(y_true, y_pred, target_names))
    print()
    
def FullReport(model, X_test, y_test, t):
    print(f"SEARCH TIME: {t:0.2f} sec")
    beststr, bestmodel = SearchReport(model)
    #ClassificationReport(model, X_test, y_test)    
    print(f"CTOR for best model: {bestmodel}\n")
    print(f"{beststr}\n")
    return beststr, bestmodel
    
def LoadAndSetupData(mode, test_size=0.3):
    assert test_size>=0.0 and test_size<=1.0
    
    def ShapeToString(Z):
        n = Z.ndim
        s = "("
        for i in range(n):
            s += f"{Z.shape[i]:5d}"
            if i+1!=n:
                s += ";"
        return s+")"

    global currmode
    currmode=mode
    print(f"DATA: {currmode}..")
    
    if mode=='moon':
        X, y = itmaldataloaders.MOON_GetDataSet(n_samples=5000, noise=0.2)
        itmaldataloaders.MOON_Plot(X, y)
    elif mode=='mnist':
        X, y = itmaldataloaders.MNIST_GetDataSet(fetchmode=False)
        if X.ndim==3:
            X=np.reshape(X, (X.shape[0], -1))
    elif mode=='iris':
        X, y = itmaldataloaders.IRIS_GetDataSet()
    elif mode=='spotify_binarized':
        X, y = getSpotifyBinarizedXY()
    elif mode=='spotify_intlabels':
        X, y = getSpotifyIntLabeledXY()
    else:
        raise ValueError(f"could not load data for that particular mode='{mode}'")
        
    print(f'  org. data:  X.shape      ={ShapeToString(X)}, y.shape      ={ShapeToString(y)}')

    assert X.ndim==2
    assert X.shape[0]==y.shape[0]
    # assert y.ndim==1 or (y.ndim==2 and y.shape[1]==0)    
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0, shuffle=True
    )
    
    print(f'  train data: X_train.shape={ShapeToString(X_train)}, y_train.shape={ShapeToString(y_train)}')
    print(f'  test data:  X_test.shape ={ShapeToString(X_test)}, y_test.shape ={ShapeToString(y_test)}')
    print()
    
    return X_train, X_test, y_train, y_test

print('OK')

OK


In [3]:
from sklearn.linear_model import Ridge, Lasso, LogisticRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor

# TODO: Qd..(in code and text..)
# Setup data
X_train, X_test, y_train, y_test = LoadAndSetupData('spotify_intlabels') # 'iris', 'moon', or 'mnist'

search_space = [ 
    (Ridge(), {
        'fit_intercept': (True, False),
        'normalize':(True, False),
        'solver' : ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga')
    }),
#     (Lasso(), {
#         'fit_intercept': (True, False),
#         'normalize':(True, False),
#         'precompute': (True, False)    
#     }),
    (SGDRegressor(), {
        'loss': ('squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
        'penalty': ('none', 'l2', 'l1', 'elasticnet'),
        'alpha':[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.0011, 0.0012, 0.0013, 0.0014, 0.0015],
        'fit_intercept': (True, False),
    }),
#     (SVR(), {
#         'kernel': ('rbf', 'linear', 'poly', 'sigmoid', 'precomputed'),
#         'degree': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
#                   21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39),
#         'epsilon': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
#         'shrinking': (True, False)
#     }),
#     (
#     KNeighborsRegressor(n_jobs=-1), {
#     'n_neighbors':[3,4,5],
#     'weights':('uniform', 'distance'),
#     'algorithm':('ball_tree', 'kd_tree', 'brute'),
#     'p':[2,3,4],    
#     }),
#     (GaussianProcessRegressor(), {
#         'kernal': ('rbf', 'linear', 'poly', 'sigmoid', 'precomputed')
#         'degree': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
#                   21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39),
#         'epsilon': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
#         'shrinking': (True, False)
#     }),
#     (DecisionTreeRegressor(), {
#     'criterion' : ('mse', 'friedman_mse', 'mae'),
#     'splitter' : ('best', 'random'),  
#     })
]


CV=5
VERBOSE=0

for model, tuning_parameters in search_space:
# Run Randomized Search - RandomizedSearchCV for the model
    start = time()
    random_tuned = RandomizedSearchCV(
        model, 
        tuning_parameters, 
        random_state=42, 
        n_iter=8, 
        cv=CV, 
        scoring='r2', 
        verbose=10, 
        n_jobs=-1, 
        iid=True)
    random_tuned.fit(X_train, y_train)
    t = time()-start

    # Report result
    b0, m0= FullReport(random_tuned , X_test, y_test, t)
    print(b0)




DATA: spotify_intlabels..
  org. data:  X.shape      =(222593;   39), y.shape      =(222593)
  train data: X_train.shape=(155815;   39), y_train.shape=(155815)
  test data:  X_test.shape =(66778;   39), y_test.shape =(66778)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:    3.8s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:    4.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.1s finished


SEARCH TIME: 6.35 sec

Best model set found on train set:

	best parameters={'solver': 'svd', 'normalize': False, 'fit_intercept': True}
	best 'r2' score=0.6628795040449617
	best index=2

Best estimator CTOR:
	Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='svd', tol=0.001)

Grid scores ('r2') on development set:
	[ 0]: 0.663 (+/-0.005) for {'solver': 'cholesky', 'normalize': False, 'fit_intercept': True}
	[ 1]: 0.662 (+/-0.005) for {'solver': 'sparse_cg', 'normalize': False, 'fit_intercept': False}
	[ 2]: 0.663 (+/-0.005) for {'solver': 'svd', 'normalize': False, 'fit_intercept': True}
	[ 3]: 0.663 (+/-0.005) for {'solver': 'auto', 'normalize': False, 'fit_intercept': False}
	[ 4]: 0.535 (+/-0.003) for {'solver': 'auto', 'normalize': True, 'fit_intercept': True}
	[ 5]: 0.663 (+/-0.005) for {'solver': 'sag', 'normalize': False, 'fit_intercept': True}
	[ 6]: 0.663 (+/-0.005) for {'solver': 'lsqr', 'normalize': True, 'fit_in

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:    1.7s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:    2.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


SEARCH TIME: 2.15 sec

Best model set found on train set:

	best parameters={'precompute': True, 'normalize': True, 'fit_intercept': True}
	best 'r2' score=-2.5560023697490665e-05
	best index=0

Best estimator CTOR:
	Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=True, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

Grid scores ('r2') on development set:
	[ 0]: -0.000 (+/-0.000) for {'precompute': True, 'normalize': True, 'fit_intercept': True}
	[ 1]: -0.000 (+/-0.000) for {'precompute': False, 'normalize': True, 'fit_intercept': True}
	[ 2]: -0.000 (+/-0.000) for {'precompute': True, 'normalize': False, 'fit_intercept': True}
	[ 3]: -0.000 (+/-0.000) for {'precompute': False, 'normalize': False, 'fit_intercept': True}
	[ 4]: -6.458 (+/-0.114) for {'precompute': True, 'normalize': True, 'fit_intercept': False}
	[ 5]: -6.458 (+/-0.114) for {'precompute': False, 'normalize': True, 'fit_intercept': Fa

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:    1.8s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:    1.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.2s finished


SEARCH TIME: 2.45 sec

Best model set found on train set:

	best parameters={'penalty': 'l1', 'loss': 'epsilon_insensitive', 'fit_intercept': True, 'alpha': 0.0013}
	best 'r2' score=0.658702545001534
	best index=2

Best estimator CTOR:
	SGDRegressor(alpha=0.0013, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='epsilon_insensitive',
       max_iter=None, n_iter=None, n_iter_no_change=5, penalty='l1',
       power_t=0.25, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

Grid scores ('r2') on development set:
	[ 0]: 0.655 (+/-0.008) for {'penalty': 'l2', 'loss': 'epsilon_insensitive', 'fit_intercept': True, 'alpha': 0.0003}
	[ 1]: 0.546 (+/-0.005) for {'penalty': 'l1', 'loss': 'squared_epsilon_insensitive', 'fit_intercept': False, 'alpha': 0.0013}
	[ 2]: 0.659 (+/-0.006) for {'penalty': 'l1', 'loss': 'epsilon_insensitive', 'fit_intercept':

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  30 out of  40 | elapsed:    6.1s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  35 out of  40 | elapsed:    6.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 42.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 42.3min finished


SEARCH TIME: 2538.98 sec

Best model set found on train set:

	best parameters={'shrinking': True, 'kernel': 'rbf', 'epsilon': 0.4, 'degree': 11}
	best 'r2' score=0.20627145276483255
	best index=4

Best estimator CTOR:
	SVR(C=1.0, cache_size=200, coef0=0.0, degree=11, epsilon=0.4,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

Grid scores ('r2') on development set:
	[ 0]: -0.218 (+/-0.007) for {'shrinking': True, 'kernel': 'poly', 'epsilon': 0.8, 'degree': 31}
	[ 1]: -47.775 (+/-1.193) for {'shrinking': False, 'kernel': 'sigmoid', 'epsilon': 0.1, 'degree': 35}
	[ 2]: -0.218 (+/-0.007) for {'shrinking': True, 'kernel': 'rbf', 'epsilon': 0.7, 'degree': 8}
	[ 3]: -0.218 (+/-0.007) for {'shrinking': True, 'kernel': 'poly', 'epsilon': 1.0, 'degree': 12}
	[ 4]: 0.206 (+/-0.024) for {'shrinking': True, 'kernel': 'rbf', 'epsilon': 0.4, 'degree': 11}
	[ 5]: -0.218 (+/-0.007) for {'shrinking': False, 'kernel': 'poly', 'epsilon': 1.0, 'degree': 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:    8.6s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 64.1min remaining: 19.5min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 75.8min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 92.5min finished


SEARCH TIME: 5551.12 sec

Best model set found on train set:

	best parameters={'splitter': 'best', 'criterion': 'friedman_mse'}
	best 'r2' score=0.37571718673599874
	best index=2

Best estimator CTOR:
	DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

Grid scores ('r2') on development set:
	[ 0]: 0.374 (+/-0.020) for {'splitter': 'best', 'criterion': 'mse'}
	[ 1]: 0.369 (+/-0.017) for {'splitter': 'random', 'criterion': 'mse'}
	[ 2]: 0.376 (+/-0.019) for {'splitter': 'best', 'criterion': 'friedman_mse'}
	[ 3]: 0.369 (+/-0.014) for {'splitter': 'random', 'criterion': 'friedman_mse'}
	[ 4]: 0.372 (+/-0.012) for {'splitter': 'best', 'criterion': 'mae'}
	[ 5]: 0.365 (+/-0.014) for {'splitter': 'random', 'crite

In [4]:
model = LinearRegression()
model = model.fit(X_train, y_train)

NameError: name 'LinearRegression' is not defined

In [None]:
print(model.predict(X_test[200].reshape(1, -1)))
print(y_test[200])

In [None]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())