# Model Building

This notebook is the third of five notebooks containing the central work for the project.

It covers the model-building process: the builders for the grid searches, the pipeline that runs them on every preprocessed dataset, and the code for storing and extracting the results.

In [None]:
# Google Drive connection to your compute instance
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Imports
import pandas as pd
import os

# classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import sklearn.svm as svm
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# result storage
import pickle
from pathlib import Path


### GridSearch Builders

In [None]:
""" 
Decision Tree 
"""
def get_dtree_grid():
  """Build the grid search for the decision tree classifier.

  Returns:
    An unfitted GridSearchCV around a DecisionTreeClassifier (random_state=42),
    scored by accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # Search space; the trailing comments keep the alternative (wider) value
  # lists that were noted next to the original grid.
  search_space = {
      'criterion': ['gini', 'entropy'],
      'splitter': ['best', 'random'],
      # alternative: 4..20 plus 30, 40, 50, 70, 90, 120, 150
      'max_depth': list(range(4, 21)),
      # alternative: [0.005, 0.01, 0.02, 0.05, 0.1]
      'min_samples_leaf': [0.005, 0.01],
      # alternative: [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
      'min_impurity_decrease': [0.005, 0.01],
      # alternative: [0, 0.001, 0.005, 0.01, 0.05, 0.1]
      'ccp_alpha': [0.01, 0.05, 0.1],
  }

  tree = DecisionTreeClassifier(random_state=42)
  return GridSearchCV(
      tree,
      param_grid=search_space,
      scoring="accuracy",
      n_jobs=-1,
      verbose=1,
      cv=10,
  )


""" 
Random Forrest
"""
def get_rforrest_grid():
  """Build the grid search for the random forest classifier.

  Returns:
    An unfitted GridSearchCV around a RandomForestClassifier (random_state=42),
    scored by accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # classifier
  clf = RandomForestClassifier(random_state=42)

  # parameter for grid search
  parameters = {
      'n_estimators': [100, 200, 300, 400, 800, 1200], # the bigger the better is the generalization error
      'max_depth': [5, 10, 15, 20, 25],
      # NOTE(review): an earlier note said the smallest value was replaced by 20
      # to limit overfitting, but 2 is still part of the list - confirm intent.
      'min_samples_split': [2, 20, 50, 100, 200],
      'min_impurity_decrease': [0, 0.002, 0.008, 0.016, 0.032],
      # "auto" was dropped: for classifiers it was only an alias of "sqrt"
      # (so it duplicated a third of the grid) and current scikit-learn
      # releases reject it.
      'max_features': ["sqrt", "log2"],
      'min_samples_leaf': [1, 10, 50, 100, 200],
      'ccp_alpha': [0, 0.1] # pruning
  }
  # GridSearch Object
  return GridSearchCV(clf, param_grid=parameters, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)


""" 
K-Neighbors 
"""
def get_knn_grid():
  """Build the grid search for the k-nearest-neighbors classifier.

  Returns:
    An unfitted GridSearchCV around a KNeighborsClassifier, scored by accuracy
    with 10-fold cross-validation and n_jobs=-1.
  """
  # classifier
  # KNeighborsClassifier takes no random_state argument; passing one raises a
  # TypeError, so the estimator is created with its defaults.
  knn = KNeighborsClassifier()

  # parameter for grid search
  parameters = {
      'n_neighbors': [2, 3, 5, 7, 11, 15, 19, 23, 27, 31, 35, 39], # start at higher values than before to avoid overfitting
      'weights': ['uniform', 'distance'], # weight options
      'leaf_size': [1, 3, 5, 30, 50], # default leaf size is included & much lower & higher value
      'algorithm': ['auto', 'kd_tree']
  }

  # GridSearch Object
  return GridSearchCV(knn, param_grid=parameters, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)


""" 
XGBoost 
"""
def get_xgboost_grid():
  """Build the grid search for the XGBoost classifier.

  Returns:
    An unfitted GridSearchCV around an XGBClassifier (random_state=42),
    scored by accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # Search space for the boosted trees
  search_space = dict(
      min_child_weight=[3, 10],
      max_depth=[3, 10],
      n_estimators=[400, 800, 1200],
  )

  booster = XGBClassifier(random_state=42)
  return GridSearchCV(
      booster,
      param_grid=search_space,
      scoring="accuracy",
      n_jobs=-1,
      verbose=1,
      cv=10,
  )


""" 
Support Vector Machine
"""
def get_svc_grid():
  """Build the grid search for the support vector classifier.

  Returns:
    An unfitted GridSearchCV around an SVC (random_state=42), scored by
    accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # classifier
  sv = svm.SVC(random_state=42)

  # parameter for grid search
  c_values = [0.01, 0.1, 1, 10]
  # gamma is a coefficient of the rbf kernel and is ignored by the linear
  # kernel. Crossing it with 'linear' refits identical models once per gamma
  # value, so each kernel gets its own sub-grid.
  parameters = [
      {'kernel': ['linear'], 'C': c_values},
      {'kernel': ['rbf'], 'C': c_values, 'gamma': [0.01, 0.1, 1, 10]},
  ]

  # GridSearch Object
  return GridSearchCV(estimator=sv, param_grid=parameters, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)


""" 
Multilayer Perceptron
"""
def get_mlp_grid():
  """Build the grid search for the multilayer perceptron classifier.

  Returns:
    An unfitted GridSearchCV around an MLPClassifier (random_state=42),
    scored by accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # Narrowed search space; each comment records the values tried earlier and
  # the one that performed best.
  search_space = {
      # tried [(128), (64, 128), (128, 64), (64, 128, 32)] -> best was (64, 128)
      "hidden_layer_sizes": [(64, 128)],
      # tried [0.001, 0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5] -> best was 0.35
      "alpha": [0.3, 0.35, 0.4],
      # tried [10, 15, 20] -> best was 15
      "n_iter_no_change": [15],
      # tried [150, 200, 250, 300] -> best was 200
      "max_iter": [200],
  }

  perceptron = MLPClassifier(random_state=42)
  return GridSearchCV(
      estimator=perceptron,
      param_grid=search_space,
      scoring="accuracy",
      n_jobs=-1,
      verbose=1,
      cv=10,
  )


"""
Logistic Regression
"""
def get_logreg_grid():
  """Build the grid search for the logistic regression classifier.

  The search space is a list of sub-grids so that only solver/penalty pairs
  that LogisticRegression can actually fit are tried.

  Returns:
    An unfitted GridSearchCV around a LogisticRegression (random_state=42),
    scored by accuracy with 10-fold cross-validation and n_jobs=-1.
  """
  # classifier
  lr = LogisticRegression(random_state=42)

  # Values shared by every sub-grid.
  shared = {
      "tol": [1e-4, 0, 1],
      # Valid settings are "balanced" or None (no weighting). The strings
      # "None" and "dict" are rejected by scikit-learn, so they were replaced.
      # Even though only accuracy is of interest, "balanced" is tried to
      # better handle imbalanced classes.
      "class_weight": ["balanced", None],
  }
  c_values = [0.001, 0.01, 0.1, 1, 10]
  solvers = ["sag", "saga", "newton-cg"] # for bigger & multi class datasets

  # parameter for grid search (dual is not needed)
  param_grid = [
      # l2 is supported by all three solvers
      {**shared, "penalty": ["l2"], "C": c_values, "solver": solvers},
      # l1 is only supported by saga
      {**shared, "penalty": ["l1"], "C": c_values, "solver": ["saga"]},
      # elasticnet is only supported by saga and needs an l1_ratio;
      # 0.5 weights the l1 and l2 parts equally
      {**shared, "penalty": ["elasticnet"], "C": c_values, "solver": ["saga"], "l1_ratio": [0.5]},
      # No regularization: C has no effect here, so it is not varied.
      # NOTE: None is the spelling for scikit-learn >= 1.2; older releases
      # expect the string "none" instead.
      {**shared, "penalty": [None], "solver": solvers},
  ]

  # GridSearch Object
  return GridSearchCV(estimator=lr, param_grid=param_grid, scoring="accuracy", n_jobs=-1, verbose=1, cv=10)



In [None]:
# Locations on the mounted Google Drive
base_dir = "/content/drive/My Drive/Spotify Song Classification/data/"
preprocessed_dir = base_dir+"preprocessed/"
estimator_dir = base_dir+"results/estimators/"

# Every sub-folder of preprocessed_dir is treated as one dataset variant.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so the names are sorted (reproducible run order) and limited to directories
# (the training loop reads CSV files from inside each entry).
datasets = sorted(
    entry for entry in os.listdir(preprocessed_dir)
    if os.path.isdir(os.path.join(preprocessed_dir, entry))
)

# Display name -> function returning an unfitted GridSearchCV.
# The names become part of the result folder and file names, so their
# spelling must stay as it is.
classifiers = {
    "Decision Tree": get_dtree_grid,
    "Random Forrest": get_rforrest_grid,
    "K-Neighbors": get_knn_grid,
    "Support Vector Machine": get_svc_grid,
    "Multilayer Perceptron": get_mlp_grid,
    "Logistic Regression": get_logreg_grid,
    "XGBoost": get_xgboost_grid
}

'\n    "Decision Tree": get_dtree_grid,\n    "Random Forrest": get_rforrest_grid,\n    "K-Neighbors": get_knn_grid,\n    "Support Vector Machine": get_svc_grid,\n    "Multilayer Perceptron": get_mlp_grid,\n    "Logistic Regression": get_logreg_grid,\n    "XGBoost": get_xgboost_grid\n'

In [None]:
def _load_split(dataset_location, split_name):
  """Read one CSV split of a dataset and drop its "Unnamed: 0" column.

  The dropped column is presumably the index written when the split was
  exported - verify against the preprocessing notebook.
  """
  return pd.read_csv(dataset_location + "/" + split_name + ".csv").drop("Unnamed: 0", axis=1)


# Fit every grid search on every dataset variant and store the best estimator
# together with its test accuracy.
for dataset in datasets:

  dataset_location = preprocessed_dir + dataset
  x_train = _load_split(dataset_location, "x_train")
  y_train = _load_split(dataset_location, "y_train")
  x_test = _load_split(dataset_location, "x_test")
  y_test = _load_split(dataset_location, "y_test")

  for approach, grid_getter in classifiers.items():
    print("Fitting {} for dataset {}.".format(approach, dataset))

    # Get gridsearch object
    gs = grid_getter()

    # Fitting (labels are flattened from a one-column frame to a 1-d array)
    gs.fit(x_train, y_train.values.ravel())

    # Prediction with test data; accuracy_score expects (y_true, y_pred)
    y_pred = gs.predict(x_test)
    accuracy = accuracy_score(y_test.values.ravel(), y_pred)

    print("Accuracy was {}.".format(accuracy))

    # Only the refitted best estimator is stored, not the whole grid search
    result = {
        "dataset": dataset,
        "approach": approach,
        "grid": gs.best_estimator_,
        "accuracy": accuracy
    }

    # One folder per approach; Path joins the parts without doubling the
    # separator that estimator_dir already ends with.
    approach_dir = Path(estimator_dir) / str(approach)
    approach_dir.mkdir(parents=True, exist_ok=True)
    result_path = approach_dir / (str(round(accuracy,5))+"_"+approach+"_"+dataset+".result")

    with open(result_path, "wb") as f:
      pickle.dump(result, f, protocol=4)

    print("Saved result to disk.\n")
  print("============================")

Fitting Decision Tree for dataset NoOutlierHandling_NoScaling_Binning10_FeatureSelectionFalse.
Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   30.5s


Accuracy was 0.6022744736437683.
Saved result to disk.



[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   41.0s finished


In [None]:
# Compare train and test score of the most recently stored estimator
# (uses result, x_train/y_train and x_test/y_test left over from the last
# iteration of the training loop).
for features, labels in ((x_train, y_train), (x_test, y_test)):
  print(result.get("grid").score(features, labels))

In [None]:
# DEMO: load a single stored result from Drive

# NOTE(review): the folder is spelled "Random Forest" while the training loop
# creates "Random Forrest", and the loop's round() would not emit the trailing
# zero of "0.67450" - confirm this file actually exists under this name.
res_path = (
    "/content/drive/My Drive/Spotify Song Classification/data/results/estimators/"
    "Random Forest/"
    "0.67450_Random Forrest_NoOutlierHandling_RobustScaling_BinningFalse_FeatureSelectionFalse.result"
)

# pickle.load can execute arbitrary code contained in the file; only load
# result files that were written by this project.
with Path(res_path).open("rb") as result_file:
  result = pickle.load(result_file)

# The stored dictionary keeps the fitted estimator under "grid"; it can be
# used directly, e.g. estimator.predict(x_test).
estimator = result["grid"]

In [None]:
# Show the loaded result dictionary (dataset, approach, fitted estimator and accuracy)
display(result)

{'accuracy': 0.6745043798985708,
 'approach': 'Random Forrest',
 'dataset': 'NoOutlierHandling_RobustScaling_BinningFalse_FeatureSelectionFalse',
 'grid': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=25, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=300,
                        n_jobs=None, oob_score=False, random_state=0, verbose=0,
                        warm_start=False)}