# SVM Classifier for Spotify Tracks

### Imports

In [1]:
# Other
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Sklearn
import sklearn.datasets as ds
import sklearn.neighbors as nb

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import model_selection as ms
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

### Functions and Utils

In [2]:
# To evaluate SVM
def evaluate_classifier(clf, test_data, test_answers, parval=None):
    '''
    Separated from printer because maybe we need this return values
    '''
    test_predicted = clf.predict(test_data)
    
    accuracy = accuracy_score(test_answers, test_predicted)
    recall = recall_score(test_answers, test_predicted, average="macro")
    f_measure = 2 * accuracy * recall / (accuracy + recall)
    conf_matrix = confusion_matrix(test_answers, test_predicted)
    
    if parval:
        num_supports = np.sum(clf.n_support_), np.sum(np.abs(clf.dual_coef_) == parval['C'])
        prop_supports = np.sum(clf.n_support_) / test_data.shape[0]                                          
        return accuracy, recall, f_measure, conf_matrix, num_supports, prop_supports, parval
    return accuracy, recall, f_measure, conf_matrix

                                                  
def result_printer(accuracy, recall, f_measure, conf_matrix, num_supports=None, prop_supports=None, parval=None):
    print(conf_matrix)
    print("Accuracy: {}".format(accuracy))
    print("Recall/Sensitivity:", recall)
    print("F - Measure:", f_measure)
    if parval:
        print("Best value of parameter C found: ", parval)
        print("Number of supports:", num_supports[0], "(", num_supports[1], "of them have slacks)")
        print("Prop. of supports:", prop_supports)
    else:
        print("Default value of C parameter: 1.0")


### Open and Preparating Data

In [3]:
PATH = "../datasets/SpotifyDataset.csv"
df = pd.read_csv(PATH, header=0)

# Delete first column
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)], axis = 1, inplace = True)

# Pop popularity
x = df.drop(['popularity'], axis=1).values
y = df['popularity'].values

# Normalize data
min_max_scaler = preprocessing.MinMaxScaler()
x_norm = min_max_scaler.fit_transform(x)

# 30% of data for testing
(x_train, x_test, y_train, y_test) = ms.train_test_split(x_norm, y, test_size=0.3, stratify=y, random_state=1)

### Train and Test Classifier

Shared variables

In [4]:
K = 5
CS = np.logspace(-3, 11, num=15)

##### Code for Linear Classifier:
1) No Cross - Validation + C = 1.0

In [5]:
# No Cross - Validation + Default C = 1.0
SVM = SVC(kernel='linear')
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test))

[[1134   89  153]
 [ 297  141  170]
 [ 173   63  388]]
Accuracy: 0.6376533742331288
Recall/Sensitivity: 0.559276891169
F - Measure: 0.595899037886
Default value of C parameter: 1.0


2) Cross - Validation + Best C value

In [141]:
# Cross - Validaiton
SVC_ = SVC(kernel="linear")
param_grid = {'C': CS}

# K-fold Cross-Validation
grid_search = GridSearchCV(SVC_, param_grid, cv=K)
grid_search.fit(x_train, y_train)

# Plot the K-fold Cross-Validation accuracy depending on C
scores = grid_search.cv_results_["mean_test_score"]
plt.semilogx(CS, scores)
plt.show()

# Best C
parval = grid_search.best_params_
cvacc = cross_val_score(SVC(kernel="linear", C=parval['C']), X=x_train, y=y_train, cv=K, scoring="accuracy")
print("Acc. {}-fold cross on train data:".format(K), cvacc.mean())

# Train
SVM = SVC(kernel="linear", C=parval['C'])
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test, parval))

KeyboardInterrupt: 

##### Code for Polynomial Classifier:
1) No Cross - Validation + C = 1.0 + Degree 2

In [7]:
SVM = SVC(kernel='poly', degree=2)
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test))



[[1219    5  152]
 [ 432   13  163]
 [ 239    5  380]]
Accuracy: 0.6180981595092024
Recall/Sensitivity: 0.505419033571
F - Measure: 0.556108222206
Default value of C parameter: 1.0


2) No Cross - Validation + C = 1.0 + Degree 3

In [9]:
SVM = SVC(kernel='poly', degree=3)
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test))



[[1243    0  133]
 [ 461    0  147]
 [ 269    0  355]]
Accuracy: 0.6127300613496932
Recall/Sensitivity: 0.490751093222
F - Measure: 0.544998790802
Default value of C parameter: 1.0


3) Cross - Validation + Best C value + Best degree (2)

In [None]:
SVC_ = SVC(kernel="poly", degree=2)
param_grid = {'C': CS}

# K-fold Cross-Validation
grid_search = GridSearchCV(SVC_, param_grid, cv=K)
grid_search.fit(x_train, y_train)

# Plot the K-fold Cross-Validation accuracy depending on C
scores = grid_search.cv_results_["mean_test_score"]
plt.semilogx(CS, scores)
plt.show()

# Best C
parval = grid_search.best_params_
cvacc = cross_val_score(SVC(kernel="poly", degree=2, C=parval['C']), X=x_train, y=y_train, cv=K, scoring="accuracy")
print("Acc. {}-fold cross on train data:".format(K), cvacc.mean())

# Train
SVM = SVC(kernel="poly", degree=2, C=parval['C'])
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test, parval))





##### Code for RBF Classifier:
1) No Cross - Validation + C = 1.0

In [10]:
SVM = SVC() 
SVM.fit(x_train, y_train)
result_printer(*evaluate_classifier(SVM, x_test, y_test))



[[1192   32  152]
 [ 332  107  169]
 [ 213   23  388]]
Accuracy: 0.6468558282208589
Recall/Sensitivity: 0.554686927889
F - Measure: 0.597236295285
Default value of C parameter: 1.0


2) Cross - Validation + Best C value

In [None]:
SVC_ = SVC()
param_gris = {'C': CS, "gamma": [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}

# K-fold Cross-Validation
grid_search = GridSearchCV(SVC_, PARAM_GRID, cv=K)
grid_search.fit(x_train, y_train)
parval=grid_search.best_params_

# We'll show in a grid, the accuracy for each combination of parameters tester
scores = grid_search.cv_results_["mean_test_score"]
scores = np.array(scores).reshape(len(param_grid['C']), len(param_grid["gamma"]))

plt.matshow(scores)
plt.xlabel("gamma")
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(param_grid["gamma"])), param_grid["gamma"], rotation="vertical")
plt.yticks(np.arange(len(param_grid['C'])), param_grid['C'])
plt.show()
print("\nBest combination of parameters found:", parval)

cvacc = cross_val_score(SVC(C=parval['C'], gamma=parval["gamma"]) , X=x_train,  y=y_train, cv=10, scoring="accuracy")
print("\nAcc. {}-fold cross on train data:".format(K), cvacc.mean())

# Train
SVM = SVC(C=parval['C'], gamma=parval["gamma"]) 
SVM.fit(X_train, y_train)
result_printer(*evaluate_classifier(SVM, x_text, y_test, parval))

# Conclusions
First of all we started by just training our support vector machines without cross-validation and with a default C value equal to 1. In this scenarios, the best classifier was the one with the RBF kernel, followed by the linear, and the polynomial with different degrees. However, as cross-validation and the best C value were omitted, we proceeded to consider them.

Given we do not know much about our data, we started by assuming the simplest hypothesis space with a 10-fold cross-validation. However, several problems appeared because of these assumptions. At the beggining, we tried to train a <strong>Linear Support Vector Machine</strong>, but it consumed a lot of computational resourses in cross-validation no matter if k was 10 or 5. In both cases the training was interrupted because it was lasting over an hour. This is why we decided that this type of kernel was not appropriate for the problem, which indicates that our data cannot be linearly separated.
<br><br>
Afterwards, we trained a <strong>Polynomial Support Vector Machine</strong> with a 5-fold cross-validation. This SVM was trained in less than 10 minutes and outputs the following:
<ul>
    <li><strong>Conf Matrix</strong>: [[1205, 53, 118], [327, 137, 144], [210, 50, 364]]</li>
    <li><strong>Accuracy</strong>: 0.6541</li>
    <li><strong>Recall/Sensitivity</strong>: 0.5615</li>
    <li><strong>F - Measure</strong>: 0.6043</li>
    <li><strong>Best C</strong>: 1000.0</li>
    <li><strong>N° Supports</strong>: 3758 (5887 have slacks)</li>
    <li><strong>Prop of Supports</strong>: 1.44095</li>
</ul>
Despite this result is better than those without cross-validation, we did not get a good classifier so we tried to improve it by increasing k to 10. However, the program passed the one-hour threshold we defined for training so we decided that K = 10 is too much. Finally, we proceeded to train an SVM with <strong>RBF kernel</strong>, and got the following results:
<ul>
    <li><strong>Conf Matrix</strong>: [[, , ], [, , ], [, , ]]</li>
    <li><strong>Accuracy</strong>: </li>
    <li><strong>Recall/Sensitivity</strong>: </li>
    <li><strong>F - Measure</strong>: </li>
    <li><strong>Best C</strong>: </li>
    <li><strong>N° Supports</strong>: ( have slacks)</li>
    <li><strong>Prop of Supports</strong>: </li>
</ul>