# 05 - Model and Prediction

Model selection and tuning of the model

## Data files needed to run this notebook:
- X_train.pkl.gz
- X_test.pkl.gz
- y_test_.pkl.gz
- y_train.pkl.gz

all the results from notebook 04

## Settings:
- set `COLAB = True` if you run this on Colab. Data can be placed in the root directory

In [11]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re
from numpy import log, mean, matmul


required = {'spacy', 'scikit-learn', 'numpy', 
            'pandas', 'torch', 'matplotlib',
            'transformers', 'allennlp==0.9.0'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing], stdout=subprocess.DEVNULL)
import spacy
import numpy as np
import pandas as pd

# SciKit Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC


# Spacy
from spacy.lang.en import English
en = English()

# !python -m spacy download en_core_web_md # includes GloVe Vectors
# !python -m spacy download en_core_web_sm
# !python -m spacy download en

# import en_core_web_sm
# import en_core_web_md


# PyTorch
import torch
# import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

# File managment
import os
from os import listdir
from pathlib import Path
import pickle
import gzip

In [12]:
LOAD_DATA = False # read save data or regenerate data
SAVE_DATA = False # overwrite generated data? 

COLAB = False

In [13]:
if COLAB:
  # Google Colab
  path = "./"
  device = torch.device("cuda:0") # use GPU, change 
else:
  # Laptop
  path = "./data/"
  device = torch.device("cpu")
#   !pip install ipywidgets
#   !jupyter nbextension enable --py widgetsnbextension

In [14]:
# df_total = pd.read_pickle(f'{path}df_total_cleaned.pkl.gz')

X_train = pd.read_pickle(f'{path}X_train.pkl.gz')
y_train = pd.read_pickle(f'{path}y_train.pkl.gz')



In [15]:
text_cols = ["SName", "Lyric", "Artist"]
genres = list(pd.DataFrame(y_train)["Genre"].unique())

In [16]:
y_train

19167     Hip Hop
88089     Hip Hop
41285     Hip Hop
12664        Rock
52249     Hip Hop
           ...   
127879      Metal
80491        Rock
66750        Rock
133337      Metal
99994        Rock
Name: Genre, Length: 8400, dtype: object

In [17]:
test_size = 0.3
tmp = X_train.drop(text_cols,axis=1)
tmp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
19167,0.001492,-0.273682,0.150513,-0.246460,-0.050476,-0.072327,0.447510,0.275391,-0.255859,-0.431396,...,0.408203,-1.025391,-0.107178,-0.021561,0.352051,0.488770,-0.528320,-0.062469,0.002211,0.423340
88089,0.037231,0.150635,0.496826,0.196777,-0.322998,-0.077271,0.991211,0.152954,-0.482910,-0.359619,...,0.160156,-0.338623,0.003363,0.037933,0.612305,0.417236,-0.303955,-0.520508,0.493164,0.340820
41285,-0.041718,-0.246704,-0.162720,0.137085,-0.043579,-0.229004,0.499023,0.824219,0.036102,-0.441650,...,0.738770,-0.558594,-0.090698,-0.180176,0.313232,0.296143,-0.604004,-0.382080,0.578613,0.207520
12664,-0.325684,0.143311,-0.226196,-0.002642,-0.749023,0.127930,0.955078,0.399902,-0.119263,-0.517090,...,0.154175,-0.275879,-0.046143,-0.173706,0.242676,0.716797,-0.252197,-0.583008,0.321533,0.259033
52249,0.061066,0.076416,0.814453,-0.269531,-0.344238,0.150757,0.812500,0.367432,-0.025284,-0.302490,...,-0.408936,-0.527344,-0.401855,0.095398,0.742676,0.281250,-0.496582,-0.437256,0.364258,0.228882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127879,-0.169678,0.041229,0.236816,-0.042542,-0.390625,-0.249268,0.797852,0.419189,-0.269531,-0.881348,...,-0.206543,-0.823730,-0.051056,-0.105835,0.450439,0.190796,-0.207031,-0.497314,0.233032,-0.636230
80491,-0.017868,0.076111,0.343506,-0.726562,-0.599609,0.000035,0.419678,0.360107,0.517578,-0.833008,...,0.453857,-0.279053,-0.021866,-0.483154,0.136719,0.191284,-0.330811,-0.092896,0.070190,0.769043
66750,-0.179810,-0.208984,-0.071289,0.151611,-0.094727,-0.214478,0.788574,0.704102,-0.184082,-0.365723,...,0.017578,-0.797852,0.142456,-0.049011,0.182617,0.599609,-0.137939,-0.099487,-0.040466,0.257568
133337,-0.553223,0.139282,0.312256,0.208862,-0.581055,-0.247070,0.706543,0.520508,0.349121,-0.726074,...,-0.300049,-0.753906,-0.207153,-0.024521,0.374512,0.519043,0.003078,-0.461670,0.013908,-0.375977


In [18]:
X_train_set, X_val_set, y_train_set, y_val_set = train_test_split(tmp, y_train, test_size=test_size, random_state=0, shuffle = True, stratify = y_train)

In [19]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

def train_SVC(x,y,x_val, y_val):
  model = LinearSVC(max_iter=2000)
  model.fit(x, y)
  val_preds = model.predict(x_val)
  acc = accuracy_score(y_val, val_preds)
  print(f"Accuracy: {round(acc,2)}")
  return (acc, model, val_preds)

In [21]:
(acc, model, predictions) = train_SVC(X_train_set, y_train_set, X_val_set, y_val_set)

# running rock,hiphop, metal: -> 92%
# running pop,hiphop, metal: -> 85%
# running 4 genres -> 75%
# running pop, rock, hiphop _> 0.67

Accuracy: 0.92




In [22]:
# confusiont matrix
from sklearn.metrics import confusion_matrix

In [23]:
def print_confustion_matrix(model, y_val_set, predictions):
  cm = confusion_matrix(y_val_set, predictions)
  df = pd.DataFrame(cm, columns = model.classes_, index= model.classes_)
  print(df)
  
  
  

In [25]:
print_confustion_matrix(model, y_val_set, predictions)
  

         Hip Hop  Metal  Rock
Hip Hop      747      0    93
Metal          0    840     0
Rock          99      4   737


In [26]:
def wrong_classifications(X_train, y, predictions, genres):
  print("Truth - predicted")
  predictions_df = pd.DataFrame(predictions, columns = ["Genre_Predicted"])
  truth_df = pd.DataFrame(y)
  truth_df.columns = ["Genre_Truth"]
  combined_df = pd.concat([truth_df.reset_index(drop=True), predictions_df.reset_index(drop=True)], axis=1)
  for i in genres:
    for j in genres:
      if i!=j:
        idx = combined_df.query(f"Genre_Truth =='{i}' != Genre_Predicted=='{j}'").index
        if len(idx)>0:
          print("------------------------------")
          print(f"{i} - {j}")
          print("------------------------------")
          print(X_train.iloc[idx]["Lyric"])
 

In [28]:
 
wrong_classifications(X_train, y_val_set,predictions  , genres)

Truth - predicted
------------------------------
Hip Hop - Rock
------------------------------
6121      previously on ashanti "always there when you c...
19946     Before I get started. polo this beat is retart...
126819    Patrz   w siebie i widz   coraz mniej Powoli p...
33512     Fatjoe: TS. Thalía: Hey baby. Fatjoe: Yeah. Th...
126051    I felt the ground start to shake  Oh God  oh G...
                                ...                        
30991     Its 2002, everything was totally new. We were ...
21676     It's time to make a difference. I know he's ba...
112568    Cross me once more fool me cross me twice and ...
6192       why dont you take me tonight. take me away wh...
66616     she said that she'd take it off right here. ta...
Name: Lyric, Length: 93, dtype: object
------------------------------
Rock - Hip Hop
------------------------------
26589     I didn't hear you leave,. I wonder how am I st...
59383     Your breath is sweet. Your eyes are like two j...
49296    

In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_val_set, predictions, target_names=genres))

              precision    recall  f1-score   support

     Hip Hop       0.88      0.89      0.89       840
        Rock       1.00      1.00      1.00       840
       Metal       0.89      0.88      0.88       840

    accuracy                           0.92      2520
   macro avg       0.92      0.92      0.92      2520
weighted avg       0.92      0.92      0.92      2520



In [30]:
# Try different kernels and values of hyper parameters to see if we can improve the score
# this method uses cross validation so we could use the whole data set (we do ot so we can use the val to get the test score of the fitted model, with the parameters. )

In [31]:
%%time
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_set, y_train_set)


CPU times: user 3min 38s, sys: 1.36 s, total: 3min 39s
Wall time: 3min 40s


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10],
                         'kernel': ('linear', 'rbf', 'poly', 'sigmoid')})

In [40]:
print(clf.best_params_)

{'C': 1, 'kernel': 'poly'}


In [41]:
list(zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']))

[({'C': 1, 'kernel': 'linear'}, 0.6551020408163264),
 ({'C': 1, 'kernel': 'rbf'}, 0.6914965986394558),
 ({'C': 1, 'kernel': 'poly'}, 0.6954081632653061),
 ({'C': 1, 'kernel': 'sigmoid'}, 0.6741496598639456),
 ({'C': 10, 'kernel': 'linear'}, 0.6341836734693878),
 ({'C': 10, 'kernel': 'rbf'}, 0.6952380952380952),
 ({'C': 10, 'kernel': 'poly'}, 0.6948979591836735),
 ({'C': 10, 'kernel': 'sigmoid'}, 0.6066326530612245)]

In [42]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,19.877298,0.402755,3.374521,0.407636,1,linear,"{'C': 1, 'kernel': 'linear'}",0.677721,0.643707,0.643707,0.657313,0.653061,0.655102,0.012493,6
1,17.557154,0.28668,4.539178,0.157519,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.703231,0.671769,0.702381,0.697279,0.682823,0.691497,0.01228,4
2,16.346417,0.287686,4.132863,0.193988,1,poly,"{'C': 1, 'kernel': 'poly'}",0.709184,0.679422,0.701531,0.69898,0.687925,0.695408,0.010503,1
3,18.480275,0.319307,4.496841,0.170986,1,sigmoid,"{'C': 1, 'kernel': 'sigmoid'}",0.673469,0.665816,0.676871,0.685374,0.669218,0.67415,0.006747,5
4,49.943227,4.006421,3.071111,0.224241,10,linear,"{'C': 10, 'kernel': 'linear'}",0.651361,0.62585,0.621599,0.643707,0.628401,0.634184,0.011376,7
5,16.324641,0.397191,4.291165,0.102518,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.695578,0.696429,0.685374,0.69898,0.69983,0.695238,0.005175,2
6,16.098225,0.767679,4.077977,0.116028,10,poly,"{'C': 10, 'kernel': 'poly'}",0.69898,0.701531,0.688776,0.693878,0.691327,0.694898,0.004731,3
7,13.405732,0.453057,3.527203,0.207801,10,sigmoid,"{'C': 10, 'kernel': 'sigmoid'}",0.62415,0.589286,0.606293,0.617347,0.596088,0.606633,0.012914,8


Sigmoid is doing the worst
- Linear betwee 0.91 and 0.92
- rbf: 0.93, 0.94
- Poly: 0.94 (may be overfitting) but ar the best scores
- Sigmoid, 0.93, 0.90

# SVC: 
Choose the best kernel and optimize that one.
We have vectors in a many dimensional space, so we don't really know what is the valid choice. We just try to optimize the problem, not trying to explain what happens under the hood. 

## Radial Kernel - RBF

In [43]:
%%time
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':(['rbf']), 'C':[0.5, 1, 5, 10,20]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_set, y_train_set)


CPU times: user 9min 15s, sys: 1.62 s, total: 9min 17s
Wall time: 9min 19s


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10, 20], 'kernel': ['rbf']})

In [32]:
print(clf.best_params_)

{'C': 10, 'kernel': 'poly'}


In [33]:
list(zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']))

[({'C': 1, 'kernel': 'linear'}, 0.9285714285714285),
 ({'C': 1, 'kernel': 'rbf'}, 0.9396258503401361),
 ({'C': 1, 'kernel': 'poly'}, 0.9430272108843537),
 ({'C': 1, 'kernel': 'sigmoid'}, 0.9333333333333333),
 ({'C': 10, 'kernel': 'linear'}, 0.9171768707482993),
 ({'C': 10, 'kernel': 'rbf'}, 0.9430272108843537),
 ({'C': 10, 'kernel': 'poly'}, 0.94421768707483),
 ({'C': 10, 'kernel': 'sigmoid'}, 0.9047619047619048)]

- For 3 genres: Best value for RBF: {'C': 5, 'kernel': 'rbf'} 0.94
- For 4 genres: ({'C': 5, 'kernel': 'rbf'}, 0.7687074829931972),

## Polynomial Kernel
Optimize _C_ value and degrees of the polynomial approximation.

In [34]:
%%time
parameters = {'kernel':(['poly']), 'C':[0.5, 1, 5, 10,20], 'degree':[2,3,4]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train_set, y_train_set)

CPU times: user 6min 44s, sys: 1.8 s, total: 6min 46s
Wall time: 6min 47s


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10, 20], 'degree': [2, 3, 4],
                         'kernel': ['poly']})

In [35]:
print(clf.best_params_)

{'C': 5, 'degree': 4, 'kernel': 'poly'}


In [36]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.745979,0.055409,1.436245,0.015884,0.5,2,poly,"{'C': 0.5, 'degree': 2, 'kernel': 'poly'}",0.941327,0.937925,0.937075,0.92602,0.948129,0.938095,0.007183,15
1,4.645759,0.034395,1.396418,0.010783,0.5,3,poly,"{'C': 0.5, 'degree': 3, 'kernel': 'poly'}",0.938776,0.940476,0.937925,0.92602,0.948129,0.938265,0.007106,14
2,4.715809,0.077743,1.41259,0.020062,0.5,4,poly,"{'C': 0.5, 'degree': 4, 'kernel': 'poly'}",0.936224,0.940476,0.937925,0.929422,0.95068,0.938946,0.006917,13
3,4.152552,0.06465,1.227479,0.037168,1.0,2,poly,"{'C': 1, 'degree': 2, 'kernel': 'poly'}",0.938776,0.944728,0.940476,0.927721,0.94898,0.940136,0.007147,11
4,4.157955,0.080079,1.234929,0.037848,1.0,3,poly,"{'C': 1, 'degree': 3, 'kernel': 'poly'}",0.943878,0.946429,0.942177,0.932823,0.94983,0.943027,0.005717,6
5,4.287799,0.078636,1.266656,0.031609,1.0,4,poly,"{'C': 1, 'degree': 4, 'kernel': 'poly'}",0.943878,0.948129,0.943878,0.934524,0.953231,0.944728,0.006155,2
6,3.611723,0.021955,1.027986,0.020077,5.0,2,poly,"{'C': 5, 'degree': 2, 'kernel': 'poly'}",0.943878,0.947279,0.940476,0.930272,0.947279,0.941837,0.006309,8
7,3.823354,0.063456,1.103924,0.033731,5.0,3,poly,"{'C': 5, 'degree': 3, 'kernel': 'poly'}",0.948129,0.945578,0.942177,0.932823,0.94983,0.943707,0.006022,4
8,4.06864,0.06154,1.197467,0.03381,5.0,4,poly,"{'C': 5, 'degree': 4, 'kernel': 'poly'}",0.94983,0.948129,0.943027,0.936224,0.94898,0.945238,0.005091,1
9,3.61497,0.191798,1.060812,0.018255,10.0,2,poly,"{'C': 10, 'degree': 2, 'kernel': 'poly'}",0.94983,0.94898,0.940476,0.931973,0.942177,0.942687,0.006489,7


A lot of calculations, but the differences are very small. So the poly nomial with 2 degrees and C=5 will do just fine. Better to have less degrees, than more to prevent overfitting.

For 4 categories:
- {'C': 1, 'degree': 3, 'kernel': 'poly'}


# Stochastic gradient descent

In [49]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_train_num = le.fit_transform(y_train_set)
# y_val_num = le.transform(y_val_set)


In [50]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Always scale the input. The most convenient way is to use a pipeline.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))

clf.fit(X_train_set, y_train_set)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier())])

In [51]:
predictions = clf.predict(X_val_set)

In [52]:
accuracy_score(y_val_set, predictions)

0.6484126984126984

The performance here is worse.

# KNeigbors

In [42]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_set, y_train_set)

predictions = neigh.predict(X_val_set)
accuracy_score(y_val_set, predictions)
# print(neigh.predict_proba([[0.9]]))

0.888095238095238

So this one in less accurate. But let's see if we can improve the results. 

In [None]:
# Tuning of the results

In [43]:
%%time 
parameters = {'n_neighbors':[3,4,5,6,7]}
clf = GridSearchCV(neigh, parameters)
clf.fit(X_train_set, y_train_set)

CPU times: user 3min 5s, sys: 1.05 s, total: 3min 6s
Wall time: 3min 7s


GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=3),
             param_grid={'n_neighbors': [3, 4, 5, 6, 7]})

In [44]:
print(clf.best_params_)

{'n_neighbors': 5}


In [45]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.208835,0.006383,7.194835,0.013695,3,{'n_neighbors': 3},0.911565,0.897109,0.914116,0.893707,0.909014,0.905102,0.008149,3
1,0.202581,0.002646,7.242447,0.019047,4,{'n_neighbors': 4},0.903912,0.880952,0.90051,0.887755,0.891156,0.892857,0.008384,5
2,0.221947,0.028751,7.263931,0.022776,5,{'n_neighbors': 5},0.92517,0.904762,0.912415,0.897959,0.920918,0.912245,0.010024,1
3,0.203689,0.003995,7.293332,0.010813,6,{'n_neighbors': 6},0.911565,0.893707,0.903912,0.894558,0.917517,0.904252,0.009324,4
4,0.20239,0.004232,7.298517,0.043988,7,{'n_neighbors': 7},0.917517,0.903061,0.907313,0.905612,0.92517,0.911735,0.008321,2


In [46]:
neigh = KNeighborsClassifier(n_neighbors=clf.best_params_["n_neighbors"])
neigh.fit(X_train_set, y_train_set)

predictions = neigh.predict(X_val_set)
accuracy_score(y_val_set, predictions)
# print(neigh.predict_proba([[0.9]]))

0.8952380952380953


A little improvement over the 3 neighbors, but not so much

# Naive Bayes

In [47]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train_set, y_train_set)

predictions = clf.predict(X_val_set)
accuracy_score(y_val_set, predictions)

0.9087301587301587

So this option also performs less. The assumption for Guassian distribution is probably als not valid

# Neural Network

In [48]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

clf = MLPClassifier(random_state=1, hidden_layer_sizes=(500,500), max_iter=500).fit(X_train_set, y_train_set)

predictions = clf.predict(X_val_set)
accuracy_score(y_val_set, predictions)

0.9444444444444444

In [49]:
clf.predict_proba(X_val_set[0:10])

array([[1.06823831e-18, 1.00000000e+00, 4.39361605e-23],
       [1.00000000e+00, 5.75843456e-16, 1.89994343e-15],
       [8.51008301e-06, 3.02647639e-08, 9.99991460e-01],
       [8.09631690e-20, 1.00000000e+00, 2.84926628e-15],
       [2.19418652e-11, 7.23780273e-13, 1.00000000e+00],
       [1.00000000e+00, 1.21130592e-12, 1.50231330e-10],
       [5.89613746e-08, 2.35962731e-09, 9.99999939e-01],
       [1.00000000e+00, 2.32063741e-16, 1.82761602e-15],
       [9.74750658e-01, 2.44990193e-08, 2.52493172e-02],
       [2.53563641e-09, 1.10371443e-11, 9.99999997e-01]])

In [50]:
predictions = clf.predict(X_val_set)
accuracy_score(y_val_set, predictions)

0.9444444444444444

In [51]:
clf.score(X_val_set, y_val_set)

0.9444444444444444

In [52]:
# All the results are in the range of 92%-94%
print_confustion_matrix(clf, y_val_set, predictions)

         Hip Hop  Metal  Rock
Hip Hop      762      0    78
Metal          1    839     0
Rock          58      3   779


- different classifier have marginal effect
- There must be something in the dataset in the Metal, it's classified correctly almost too much compared to the other genres. But is not clear what its. 
    - Some different languages. 

Best model: RBC for SVC
({'C': 5, 'kernel': 'rbf'}, 0.7687074829931972) for 4 category classifier.

Neural net: 500x500 hidden layers.
 - Pop and rock are the genres most closely related.

# Result of the chosen model

- For the front end we'll set up the system with SVC, Radial kernel, with C=5
- We fit the model on the complete training set
- We evaluatoin on the test set

In [53]:
X_train.head()

Unnamed: 0,SName,Lyric,Artist,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
19167,Back To Sleep,"I know it's late, I know it's late. And baby I...",Chris Brown,0.001492,-0.273682,0.150513,-0.24646,-0.050476,-0.072327,0.44751,...,0.408203,-1.025391,-0.107178,-0.021561,0.352051,0.48877,-0.52832,-0.062469,0.002211,0.42334
88089,The Best,The Best. Soulja Boy. Soulja! Soulja! Soulja! ...,Soulja Boy,0.037231,0.150635,0.496826,0.196777,-0.322998,-0.077271,0.991211,...,0.160156,-0.338623,0.003363,0.037933,0.612305,0.417236,-0.303955,-0.520508,0.493164,0.34082
41285,Just Askin',"Wassup, in your world?. And are you still cool...",Iggy Azalea,-0.041718,-0.246704,-0.16272,0.137085,-0.043579,-0.229004,0.499023,...,0.73877,-0.558594,-0.090698,-0.180176,0.313232,0.296143,-0.604004,-0.38208,0.578613,0.20752
12664,You Wear A Crown But You're No King,You'll never stop 'til you get what you want. ...,Blessthefall,-0.325684,0.143311,-0.226196,-0.002642,-0.749023,0.12793,0.955078,...,0.154175,-0.275879,-0.046143,-0.173706,0.242676,0.716797,-0.252197,-0.583008,0.321533,0.259033
52249,Kevin Gates,Workout. Tell. Workout. Tell. Gates. Gates. Ga...,Kevin Gates,0.061066,0.076416,0.814453,-0.269531,-0.344238,0.150757,0.8125,...,-0.408936,-0.527344,-0.401855,0.095398,0.742676,0.28125,-0.496582,-0.437256,0.364258,0.228882


In [54]:
y_train

19167     Hip Hop
88089     Hip Hop
41285     Hip Hop
12664        Rock
52249     Hip Hop
           ...   
127879      Metal
80491        Rock
66750        Rock
133337      Metal
99994        Rock
Name: Genre, Length: 8400, dtype: object

In [55]:
X_train_final = X_train.drop(text_cols, axis=1)
X_train_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
19167,0.001492,-0.273682,0.150513,-0.246460,-0.050476,-0.072327,0.447510,0.275391,-0.255859,-0.431396,...,0.408203,-1.025391,-0.107178,-0.021561,0.352051,0.488770,-0.528320,-0.062469,0.002211,0.423340
88089,0.037231,0.150635,0.496826,0.196777,-0.322998,-0.077271,0.991211,0.152954,-0.482910,-0.359619,...,0.160156,-0.338623,0.003363,0.037933,0.612305,0.417236,-0.303955,-0.520508,0.493164,0.340820
41285,-0.041718,-0.246704,-0.162720,0.137085,-0.043579,-0.229004,0.499023,0.824219,0.036102,-0.441650,...,0.738770,-0.558594,-0.090698,-0.180176,0.313232,0.296143,-0.604004,-0.382080,0.578613,0.207520
12664,-0.325684,0.143311,-0.226196,-0.002642,-0.749023,0.127930,0.955078,0.399902,-0.119263,-0.517090,...,0.154175,-0.275879,-0.046143,-0.173706,0.242676,0.716797,-0.252197,-0.583008,0.321533,0.259033
52249,0.061066,0.076416,0.814453,-0.269531,-0.344238,0.150757,0.812500,0.367432,-0.025284,-0.302490,...,-0.408936,-0.527344,-0.401855,0.095398,0.742676,0.281250,-0.496582,-0.437256,0.364258,0.228882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127879,-0.169678,0.041229,0.236816,-0.042542,-0.390625,-0.249268,0.797852,0.419189,-0.269531,-0.881348,...,-0.206543,-0.823730,-0.051056,-0.105835,0.450439,0.190796,-0.207031,-0.497314,0.233032,-0.636230
80491,-0.017868,0.076111,0.343506,-0.726562,-0.599609,0.000035,0.419678,0.360107,0.517578,-0.833008,...,0.453857,-0.279053,-0.021866,-0.483154,0.136719,0.191284,-0.330811,-0.092896,0.070190,0.769043
66750,-0.179810,-0.208984,-0.071289,0.151611,-0.094727,-0.214478,0.788574,0.704102,-0.184082,-0.365723,...,0.017578,-0.797852,0.142456,-0.049011,0.182617,0.599609,-0.137939,-0.099487,-0.040466,0.257568
133337,-0.553223,0.139282,0.312256,0.208862,-0.581055,-0.247070,0.706543,0.520508,0.349121,-0.726074,...,-0.300049,-0.753906,-0.207153,-0.024521,0.374512,0.519043,0.003078,-0.461670,0.013908,-0.375977


In [56]:
model = SVC(max_iter=4000, kernel='rbf', C=5)

In [57]:
model.fit(X_train_final, y_train)

SVC(C=5, max_iter=4000)

In [58]:
pickle.dump(model, gzip.open(f'{path}{"final_lyrics_model.pkl.gz"}', 'wb'))

In [59]:
# load the test set
X_test = pd.read_pickle(f'{path}X_test.pkl.gz')
y_test = pd.read_pickle(f'{path}y_test.pkl.gz')

print(X_test.shape, y_test.shape)

(3600, 771) (3600,)


In [60]:
X_test_final = X_test.drop(text_cols, axis=1)
X_test_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
89956,-0.244995,-0.127075,0.097412,-0.187256,-0.259521,-0.194092,0.568359,0.562012,-0.143677,-0.538086,...,0.078186,-0.501465,-0.123047,-0.450195,0.400635,0.800293,-0.675293,0.069397,0.398193,0.586426
33634,-0.034454,-0.149658,0.664062,-0.213867,-0.482178,-0.253418,0.167114,0.550293,-0.196533,-0.573242,...,-0.312744,-0.751465,0.074219,-0.265137,-0.074219,0.669434,-0.548828,-0.318848,0.070435,0.294678
16861,0.141235,-0.208374,0.282715,-0.068359,-0.137085,-0.065552,0.525391,0.498535,-0.515137,-0.383545,...,0.154785,-0.604492,-0.254639,-0.252930,0.430420,0.540527,-0.599121,-0.480957,0.131470,0.092529
92314,0.345215,0.182495,0.065552,-0.339355,-0.525391,0.010788,0.584473,0.826660,-0.712402,-0.538574,...,-0.051819,-0.761719,-0.501953,-0.466064,0.818848,0.695801,-0.710449,-0.107666,0.366455,0.580566
95289,-0.081360,0.009384,0.453613,-0.090454,-0.411133,0.203369,0.451904,0.871094,-0.225098,-1.244141,...,-0.079285,-0.294922,-0.215332,-0.411865,0.302002,0.394043,-0.223267,-0.057770,0.276611,0.663574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118197,-0.474121,0.379395,-0.052002,0.240479,-0.168701,0.211792,0.625488,0.508789,-0.714844,-0.219116,...,0.605469,-0.621094,-0.462158,-0.062866,0.413330,0.089294,-0.097717,-0.487793,0.321045,0.369385
152448,-0.527832,0.535156,0.091675,-0.161011,-0.460449,-0.021591,0.848633,0.563477,-0.372070,-0.105286,...,0.512207,-0.675781,-0.113892,0.237061,-0.147217,0.264648,-0.349609,-0.316895,0.336182,0.521973
66130,-0.265137,-0.281738,0.449219,-0.094727,-0.453857,-0.122620,0.360352,0.526367,0.146240,-0.671875,...,-0.164673,-0.479248,0.071411,-0.120239,-0.066345,0.391846,-0.296387,-0.319336,0.227051,0.458984
104573,-0.182373,-0.352783,0.146484,-0.426514,-0.583984,0.259277,0.880859,0.958008,-0.179932,-0.843750,...,0.082947,-0.488770,-0.828125,-0.507324,0.489746,0.223267,-0.271973,-0.354004,0.679199,0.349854


In [61]:
test_predictions = model.predict(X_test_final)

In [62]:
test_predictions.shape

(3600,)

In [63]:
y_test.shape

(3600,)

In [64]:
acc = accuracy_score(y_test, test_predictions)
print(f"Accuracy: {round(acc,2)}")

Accuracy: 0.95


In [65]:
print_confustion_matrix(model, y_test, test_predictions)

         Hip Hop  Metal  Rock
Hip Hop     1063      0   137
Metal          0   1200     0
Rock          57      1  1142


In [66]:
print(classification_report(y_test, test_predictions, target_names=genres))

              precision    recall  f1-score   support

     Hip Hop       0.95      0.89      0.92      1200
        Rock       1.00      1.00      1.00      1200
       Metal       0.89      0.95      0.92      1200

    accuracy                           0.95      3600
   macro avg       0.95      0.95      0.95      3600
weighted avg       0.95      0.95      0.95      3600

