
# Support Vector Machine


## Initialization: courtesy of Tove

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import logit
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, SequentialFeatureSelector  # RFE only works for linear SVM
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay


#plt.style.use("seaborn-v0_8-colorblind")
# NOTE: cleanup.py (in SF295_G007) must already be importable from this notebook;
# nothing in this cell adds SF295_G007 to the Python system path.

from cleanup import removeRows


In [2]:
# Load the training CSV through the project-local `removeRows` helper (imported from cleanup.py).
data_processor = removeRows("../project_train.csv")
# Run the helper's row processing. In the recorded run it printed that rows 85 and 95
# will be removed. NOTE(review): the removal criteria live in cleanup.py and are not visible here.
data_processor.process_rows();
# The helper exposes its frame as `.df`; the rest of the notebook works on it as `data`.
data = data_processor.df


Row 85 will be removed:

Row 95 will be removed:


In [3]:
# Create the pairplot with scatter plots for off-diagonal and probability distributions (KDE) on diagonal
#sns.pairplot(data, hue = "Label", diag_kws={'fill': False}, corner=True)

## Transforming data - courtesy of Tove

In [4]:
# Columns that receive a logit transform before modelling.
vars_to_transform = [
    "danceability", "energy", "speechiness",
    "acousticness", "liveness", "valence",
]

# Build a new frame with the listed columns replaced by their logit;
# `data` itself is left untouched and column order is preserved.
transformed_data = data.assign(
    **{column: logit(data[column]) for column in vars_to_transform}
)

In [5]:
#sns.pairplot(transformed_data[vars_to_transform + ["Label"]], hue = "Label", diag_kws={'fill': False})

In [6]:
# Standardise the logit-transformed columns together with loudness and tempo.
# The remaining columns (e.g. key, mode, Label) are copied through unchanged.
columns_to_scale = [*vars_to_transform, "loudness", "tempo"]
scaled_data = transformed_data.copy()
scaled_data[columns_to_scale] = preprocessing.scale(scaled_data[columns_to_scale])
#sns.pairplot(scaled_data[vars_to_transform + ["loudness", "tempo", "Label"]], hue = "Label", diag_kws={'fill': False})

In [8]:
# Separate the target column from the predictor columns.
response = scaled_data["Label"]
all_predictors = scaled_data.drop(columns=["Label"])

# Dummy-encode the categorical columns. drop_first=True removes one level per
# variable, because that level is fully determined by the remaining ones
# (for example mode != 0 implies mode = 1).
encoded_data = pd.get_dummies(all_predictors, columns=["key", "mode"], drop_first=True)

# Hold out 20% of the rows as a test split; the seed is fixed for reproducibility.
(
    predictors_train,
    predictors_test,
    response_train,
    response_test,
) = train_test_split(encoded_data, response, test_size=0.2, random_state=0)

## Training the SVM
First, a linear SVM is tried

In [9]:
# Baseline classifier: SVM with a linear kernel, fitted on the training split.
# random_state is fixed at 0, as for the other SVC models in this notebook.
linear_svm = SVC(kernel='linear', random_state=0)
linear_svm.fit(predictors_train, response_train)

In [10]:
# Evaluate the linear SVM on the held-out test split.
response_pred_linear = linear_svm.predict(predictors_test)
accuracy_linear = accuracy_score(response_test, response_pred_linear)
# ROC AUC is a ranking metric: it needs continuous scores, not hard class labels.
# Feeding predicted labels collapses the curve to a single threshold, so the signed
# distance to the separating hyperplane (decision_function) is used instead.
roc_auc_linear = roc_auc_score(response_test, linear_svm.decision_function(predictors_test))
print(f"Linear SVM Accuracy: {accuracy_linear * 100:.2f}%")
# Last expression: displayed as the cell output (rows = true label, columns = predicted label).
confusion_matrix(response_test, response_pred_linear)

Linear SVM Accuracy: 76.24%


array([[42, 11],
       [13, 35]], dtype=int64)

### <span style="color:red">We see similar results as with linear regression, which is sort of expected.</span>

Now, a non-linear SVM with rbf (radial basis function) kernel:

In [11]:
# Non-linear classifier: SVM with an RBF kernel and otherwise default settings,
# fitted on the same training split as the linear model.
rbf_svm = SVC(kernel='rbf', random_state=0)
rbf_svm.fit(predictors_train, response_train)

In [12]:
# Evaluate the RBF-kernel SVM on the held-out test split.
response_pred_rbf = rbf_svm.predict(predictors_test)
accuracy_rbf = accuracy_score(response_test, response_pred_rbf)
print(f"Nonlinear SVM (RBF) Accuracy: {accuracy_rbf * 100:.2f}%")
# Last expression: displayed as the cell output (rows = true label, columns = predicted label).
confusion_matrix(response_test, response_pred_rbf)

Nonlinear SVM (RBF) Accuracy: 83.17%


array([[43, 10],
       [ 7, 41]], dtype=int64)

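### <span style="color:red">The results show improvement in the 'true negatives' (correctly classified likes, 35 → 41) by significantly reducing the 'false positives' (instances where likes were incorrectly classified as dislikes, 13 → 7). Note that this wording treats "dislike" as the positive class; the counts quoted are the second row of the confusion matrices above, where rows are the true label and columns the predicted label.</span>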

## An attempt at feature optimization

In [13]:
# RFE (recursive feature elimination) with a linear SVM, tried for every subset size
# from 1 up to the full feature count.
# NOTE(review): every subset size is scored on the test split, so the "best" size
# reported below is selected on test data and its accuracy is an optimistic estimate.
accuracies = {}  # subset size -> test accuracy
features = {}    # subset size -> selected column names
for i in range(1, predictors_train.shape[1]+1):
    rfe = RFE(estimator=SVC(kernel='linear'), n_features_to_select=i)
    rfe.fit(predictors_train, response_train)

    # Boolean mask of the features kept by RFE
    selected_features = rfe.support_
    predictors_train_rfe = predictors_train.loc[:, selected_features]
    predictors_test_rfe = predictors_test.loc[:, selected_features]

    # Train again with the reduced feature set
    linear_svm_rfe = SVC(kernel='linear', random_state=0)
    linear_svm_rfe.fit(predictors_train_rfe, response_train)
    response_pred_linear_rfe = linear_svm_rfe.predict(predictors_test_rfe)

    # Evaluate the linear SVM with the selected features
    accuracy_linear_rfe = accuracy_score(response_test, response_pred_linear_rfe)
    print(f"Accuracy with {i} features: {accuracy_linear_rfe * 100:.2f}%")
    accuracies[i] = accuracy_linear_rfe
    features[i] = predictors_train_rfe.columns

# Look up the winning subset size once. On ties max() keeps the first key it sees,
# i.e. the smallest subset size, because the dict was filled in increasing order.
best_n_features = max(accuracies, key=accuracies.get)
print(f"The best accuracy is {accuracies[best_n_features]} with {best_n_features} features.")
print(f'The selected features are {features[best_n_features]}')

Accuracy with 1 features: 73.27%
Accuracy with 2 features: 69.31%
Accuracy with 3 features: 70.30%
Accuracy with 4 features: 74.26%
Accuracy with 5 features: 71.29%
Accuracy with 6 features: 72.28%
Accuracy with 7 features: 72.28%
Accuracy with 8 features: 72.28%
Accuracy with 9 features: 72.28%
Accuracy with 10 features: 74.26%
Accuracy with 11 features: 77.23%
Accuracy with 12 features: 77.23%
Accuracy with 13 features: 75.25%
Accuracy with 14 features: 75.25%
Accuracy with 15 features: 76.24%
Accuracy with 16 features: 75.25%
Accuracy with 17 features: 74.26%
Accuracy with 18 features: 75.25%
Accuracy with 19 features: 74.26%
Accuracy with 20 features: 75.25%
Accuracy with 21 features: 76.24%
The best accuracy is 0.7722772277227723 with 11 features.
The selected features are Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'tempo', 'key_5', 'key_6', 'key_8', 'key_10'],
      dtype='object')


In [14]:
# Sequential feature selection with an RBF SVM, tried for every subset size
# from 1 up to one less than the full feature count.
# NOTE(review): every subset size is scored on the test split, so the "best" size
# reported below is selected on test data and its accuracy is an optimistic estimate.
accuracies = {}  # subset size -> test accuracy
features = {}    # subset size -> selected column names
for i in range(1, predictors_train.shape[1]):
    sfs = SequentialFeatureSelector(estimator=SVC(kernel='rbf'), n_features_to_select=i)  # Forward selection process
    sfs.fit(predictors_train, response_train)

    selected_features = sfs.get_support()  # Boolean mask of selected features
    predictors_train_sfs = predictors_train.loc[:, selected_features]
    predictors_test_sfs = predictors_test.loc[:, selected_features]

    # Train again with the reduced feature set
    rbf_svm_sfs = SVC(kernel='rbf', random_state=0)
    rbf_svm_sfs.fit(predictors_train_sfs, response_train)
    response_pred_rbf_sfs = rbf_svm_sfs.predict(predictors_test_sfs)

    # Evaluate the RBF SVM with the selected features
    accuracy_rbf_sfs = accuracy_score(response_test, response_pred_rbf_sfs)
    print(f"Accuracy with {i} features: {accuracy_rbf_sfs * 100:.2f}%")

    accuracies[i] = accuracy_rbf_sfs
    features[i] = predictors_train_sfs.columns

# Look up the winning subset size once. On ties max() keeps the first key it sees,
# i.e. the smallest subset size, because the dict was filled in increasing order.
best_n_features = max(accuracies, key=accuracies.get)
print(f"The best accuracy is {accuracies[best_n_features]} with {best_n_features} features.")
print(f'The selected features are {features[best_n_features]}')


Accuracy with 1 features: 69.31%
Accuracy with 2 features: 76.24%
Accuracy with 3 features: 81.19%
Accuracy with 4 features: 80.20%
Accuracy with 5 features: 80.20%
Accuracy with 6 features: 80.20%
Accuracy with 7 features: 80.20%
Accuracy with 8 features: 80.20%
Accuracy with 9 features: 82.18%
Accuracy with 10 features: 81.19%
Accuracy with 11 features: 81.19%
Accuracy with 12 features: 82.18%
Accuracy with 13 features: 82.18%
Accuracy with 14 features: 82.18%
Accuracy with 15 features: 81.19%
Accuracy with 16 features: 81.19%
Accuracy with 17 features: 82.18%
Accuracy with 18 features: 82.18%
Accuracy with 19 features: 83.17%
Accuracy with 20 features: 83.17%
The best accuracy is 0.8316831683168316 with 19 features.
The selected features are Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11'],
   

### <span style="color:red">We can see that the improvement in accuracy by adding more features after i=4 is marginal. However, the specificity score - TN/(TN+FP) - improves with higher number of features, see below.</span>
Let's look at the model with 4 features

In [15]:
# Forward sequential selection of 4 features, scored with an RBF-kernel SVM.
sfs = SequentialFeatureSelector(
    estimator=SVC(kernel='rbf'),
    n_features_to_select=4,
).fit(predictors_train, response_train)

# Boolean column mask -> reduced train and test frames.
selected_features = sfs.get_support()
predictors_train_sfs, predictors_test_sfs = (
    frame.loc[:, selected_features] for frame in (predictors_train, predictors_test)
)
print(predictors_test_sfs.columns)

# Refit an RBF SVM on the reduced feature set and predict the test split.
rbf_svm_optimized = SVC(kernel='rbf', random_state=0).fit(predictors_train_sfs, response_train)
response_pred_rbf_optimized = rbf_svm_optimized.predict(predictors_test_sfs)

# Report the test accuracy; the confusion matrix is the cell's displayed output.
accuracy_rbf_sfs = accuracy_score(response_test, response_pred_rbf_optimized)
print(f"Accuracy: {accuracy_rbf_sfs * 100:.2f}%")
confusion_matrix(response_test, response_pred_rbf_optimized)

Index(['danceability', 'energy', 'speechiness', 'liveness'], dtype='object')
Accuracy: 80.20%


array([[45,  8],
       [12, 36]], dtype=int64)



Now a model with 9 features (cashing out the additional 2% accuracy):

In [16]:
# Forward sequential selection of 9 features, scored with an RBF-kernel SVM.
sfs = SequentialFeatureSelector(
    estimator=SVC(kernel='rbf'),
    n_features_to_select=9,
).fit(predictors_train, response_train)

# Boolean column mask -> reduced train and test frames.
selected_features = sfs.get_support()
predictors_train_sfs, predictors_test_sfs = (
    frame.loc[:, selected_features] for frame in (predictors_train, predictors_test)
)
print(predictors_test_sfs.columns)

# Refit an RBF SVM on the reduced feature set and predict the test split.
rbf_svm_optimized = SVC(kernel='rbf', random_state=0).fit(predictors_train_sfs, response_train)
response_pred_rbf_optimized = rbf_svm_optimized.predict(predictors_test_sfs)

# Report the test accuracy; the confusion matrix is the cell's displayed output.
accuracy_rbf_sfs = accuracy_score(response_test, response_pred_rbf_optimized)
print(f"Accuracy: {accuracy_rbf_sfs * 100:.2f}%")
confusion_matrix(response_test, response_pred_rbf_optimized)

Index(['danceability', 'energy', 'speechiness', 'liveness', 'key_1', 'key_5',
       'key_8', 'key_10', 'key_11'],
      dtype='object')
Accuracy: 82.18%


array([[45,  8],
       [10, 38]], dtype=int64)

And finally, the model with 19 features, which gives the highest score:

In [17]:
# Forward sequential selection of 19 features, scored with an RBF-kernel SVM.
sfs = SequentialFeatureSelector(
    estimator=SVC(kernel='rbf'),
    n_features_to_select=19,
).fit(predictors_train, response_train)

# Boolean column mask -> reduced train and test frames.
selected_features = sfs.get_support()
predictors_train_sfs, predictors_test_sfs = (
    frame.loc[:, selected_features] for frame in (predictors_train, predictors_test)
)
print(predictors_test_sfs.columns)

# Refit an RBF SVM on the reduced feature set and predict the test split.
rbf_svm_optimized = SVC(kernel='rbf', random_state=0).fit(predictors_train_sfs, response_train)
response_pred_rbf_optimized = rbf_svm_optimized.predict(predictors_test_sfs)

# Report the test accuracy; the confusion matrix is the cell's displayed output.
accuracy_rbf_sfs = accuracy_score(response_test, response_pred_rbf_optimized)
print(f"Accuracy: {accuracy_rbf_sfs * 100:.2f}%")
confusion_matrix(response_test, response_pred_rbf_optimized)

Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11'],
      dtype='object')
Accuracy: 83.17%


array([[44,  9],
       [ 8, 40]], dtype=int64)

### We can see that the different "key" values play an important role in improving the specificity score of the predictions. To cross-check the impact of the logit transform and of the dummy (one-hot) encoding of `key` and `mode`:

1. Without any data processing

In [32]:
# Baseline experiment: RBF SVM on the predictors exactly as they come out of `data`
# (no logit transform, no scaling, no dummy encoding). `response` is reused from the
# earlier split cell; only the predictors differ.
# NOTE(review): this cell overwrites `rbf_svm`, `response_pred_rbf` and `accuracy_rbf`
# from the earlier RBF cells, and picks the subset size on the test split.
all_predictors_orig = data.drop(columns=["Label"])

predictors_train_orig, predictors_test_orig, response_train_orig, response_test_orig = train_test_split(
    all_predictors_orig, response, test_size=0.2, random_state=0
)

accuracies = {}  # subset size -> test accuracy
features = {}    # subset size -> selected column names
for i in range(1, predictors_train_orig.shape[1]):
    sfs = SequentialFeatureSelector(estimator=SVC(kernel='rbf'), n_features_to_select=i)  # Forward selection process
    sfs.fit(predictors_train_orig, response_train_orig)

    selected_features = sfs.get_support()  # Boolean mask of selected features
    predictors_train_sfs = predictors_train_orig.loc[:, selected_features]
    predictors_test_sfs = predictors_test_orig.loc[:, selected_features]

    # Train again with the reduced feature set
    rbf_svm_sfs = SVC(kernel='rbf', random_state=0)
    rbf_svm_sfs.fit(predictors_train_sfs, response_train_orig)
    response_pred_rbf_sfs = rbf_svm_sfs.predict(predictors_test_sfs)

    # Evaluate the RBF SVM with the selected features
    accuracy_rbf_sfs = accuracy_score(response_test_orig, response_pred_rbf_sfs)
    print(f"Accuracy with {i} features: {accuracy_rbf_sfs * 100:.2f}%")

    accuracies[i] = accuracy_rbf_sfs
    features[i] = predictors_train_sfs.columns

# Look up the winning subset size once; on ties max() keeps the smallest size.
best_n_features = max(accuracies, key=accuracies.get)
print(f"The best accuracy is {accuracies[best_n_features]} with {best_n_features} features.")
print(f'The selected features are {features[best_n_features]}')

# With all features (the count in the message is read from the data, not hard-coded)
n_all_features = predictors_train_orig.shape[1]
rbf_svm = SVC(kernel='rbf', random_state=0)
rbf_svm.fit(predictors_train_orig, response_train_orig)
response_pred_rbf = rbf_svm.predict(predictors_test_orig)
accuracy_rbf = accuracy_score(response_test_orig, response_pred_rbf)
print(f"Nonlinear SVM (RBF) Accuracy ({n_all_features} features): {accuracy_rbf * 100:.2f}%")
print(confusion_matrix(response_test_orig, response_pred_rbf))


# With optimum features: rerun the selection at the winning size so that `sfs` and
# the *_sfs variables describe the best model when the cell finishes.
sfs = SequentialFeatureSelector(estimator=SVC(kernel='rbf'), n_features_to_select=best_n_features)  # Forward selection process
sfs.fit(predictors_train_orig, response_train_orig)

selected_features = sfs.get_support()  # Boolean mask of selected features
predictors_train_sfs = predictors_train_orig.loc[:, selected_features]
predictors_test_sfs = predictors_test_orig.loc[:, selected_features]

# Train again with the reduced feature set
rbf_svm_sfs = SVC(kernel='rbf', random_state=0)
rbf_svm_sfs.fit(predictors_train_sfs, response_train_orig)
response_pred_rbf_sfs = rbf_svm_sfs.predict(predictors_test_sfs)

# Evaluate the RBF SVM with the selected features
accuracy_rbf_sfs = accuracy_score(response_test_orig, response_pred_rbf_sfs)
print(f"Accuracy with {best_n_features} features: {accuracy_rbf_sfs * 100:.2f}%")
print(confusion_matrix(response_test_orig, response_pred_rbf_sfs))

Accuracy with 1 features: 70.30%
Accuracy with 2 features: 77.23%
Accuracy with 3 features: 81.19%
Accuracy with 4 features: 81.19%
Accuracy with 5 features: 79.21%
Accuracy with 6 features: 78.22%
Accuracy with 7 features: 79.21%
Accuracy with 8 features: 80.20%
Accuracy with 9 features: 59.41%
Accuracy with 10 features: 75.25%
The best accuracy is 0.8118811881188119 with 3 features.
The selected features are Index(['danceability', 'speechiness', 'liveness'], dtype='object')
Nonlinear SVM (RBF) Accuracy (11 features): 71.29%
[[45  8]
 [21 27]]
Accuracy with 3 features: 81.19%
[[47  6]
 [13 35]]


2. With scaling introduced:

In [33]:
# Scaling-only experiment: standardise every predictor of the raw data (no logit
# transform, no dummy encoding), then repeat the RBF-SVM feature sweep.
# NOTE(review): this cell overwrites `rbf_svm`, `response_pred_rbf` and `accuracy_rbf`
# from the earlier RBF cells, and picks the subset size on the test split.
scaled_data_orig = data.copy()
# Select the predictors by name instead of by position, so the target is never
# scaled even if "Label" is not the last column.
headers = scaled_data_orig.columns.drop("Label")

# All non-Label columns are standardised here, including key and mode.
scaled_data_orig[headers] = preprocessing.scale(scaled_data_orig[headers])

all_predictors_orig_scaled = scaled_data_orig.drop(columns=["Label"])

predictors_train_orig_scaled, predictors_test_orig_scaled, response_train_orig_scaled, response_test_orig_scaled = train_test_split(
    all_predictors_orig_scaled, response, test_size=0.2, random_state=0
)

accuracies = {}  # subset size -> test accuracy
features = {}    # subset size -> selected column names
for i in range(1, predictors_train_orig_scaled.shape[1]):
    sfs = SequentialFeatureSelector(estimator=SVC(kernel='rbf'), n_features_to_select=i)  # Forward selection process
    sfs.fit(predictors_train_orig_scaled, response_train_orig_scaled)

    selected_features = sfs.get_support()  # Boolean mask of selected features
    predictors_train_sfs = predictors_train_orig_scaled.loc[:, selected_features]
    predictors_test_sfs = predictors_test_orig_scaled.loc[:, selected_features]

    # Train again with the reduced feature set
    rbf_svm_sfs = SVC(kernel='rbf', random_state=0)
    rbf_svm_sfs.fit(predictors_train_sfs, response_train_orig_scaled)
    response_pred_rbf_sfs = rbf_svm_sfs.predict(predictors_test_sfs)

    # Evaluate the RBF SVM with the selected features
    accuracy_rbf_sfs = accuracy_score(response_test_orig_scaled, response_pred_rbf_sfs)
    print(f"Accuracy with {i} features: {accuracy_rbf_sfs * 100:.2f}%")

    accuracies[i] = accuracy_rbf_sfs
    features[i] = predictors_train_sfs.columns

# Look up the winning subset size once; on ties max() keeps the smallest size.
best_n_features = max(accuracies, key=accuracies.get)
print(f"The best accuracy is {accuracies[best_n_features]} with {best_n_features} features.")
print(f'The selected features are {features[best_n_features]}')

# With all features (the count in the message is read from the data, not hard-coded)
n_all_features = predictors_train_orig_scaled.shape[1]
rbf_svm = SVC(kernel='rbf', random_state=0)
rbf_svm.fit(predictors_train_orig_scaled, response_train_orig_scaled)
response_pred_rbf = rbf_svm.predict(predictors_test_orig_scaled)
accuracy_rbf = accuracy_score(response_test_orig_scaled, response_pred_rbf)
print(f"Nonlinear SVM (RBF) Accuracy ({n_all_features} features): {accuracy_rbf * 100:.2f}%")
print(confusion_matrix(response_test_orig_scaled, response_pred_rbf))

# With the optimum features: rerun the selection at the winning size so that `sfs`
# and the *_sfs variables describe the best model when the cell finishes.
sfs = SequentialFeatureSelector(estimator=SVC(kernel='rbf'), n_features_to_select=best_n_features)  # Forward selection process
sfs.fit(predictors_train_orig_scaled, response_train_orig_scaled)

selected_features = sfs.get_support()  # Boolean mask of selected features
predictors_train_sfs = predictors_train_orig_scaled.loc[:, selected_features]
predictors_test_sfs = predictors_test_orig_scaled.loc[:, selected_features]

rbf_svm_sfs = SVC(kernel='rbf', random_state=0)
rbf_svm_sfs.fit(predictors_train_sfs, response_train_orig_scaled)
response_pred_rbf_sfs = rbf_svm_sfs.predict(predictors_test_sfs)

accuracy_rbf_sfs = accuracy_score(response_test_orig_scaled, response_pred_rbf_sfs)
print(f"Accuracy with {best_n_features} features: {accuracy_rbf_sfs * 100:.2f}%")
print(confusion_matrix(response_test_orig_scaled, response_pred_rbf_sfs))


Accuracy with 1 features: 70.30%
Accuracy with 2 features: 76.24%
Accuracy with 3 features: 75.25%
Accuracy with 4 features: 74.26%
Accuracy with 5 features: 79.21%
Accuracy with 6 features: 79.21%
Accuracy with 7 features: 79.21%
Accuracy with 8 features: 79.21%
Accuracy with 9 features: 78.22%
Accuracy with 10 features: 78.22%
The best accuracy is 0.7920792079207921 with 5 features.
The selected features are Index(['danceability', 'energy', 'speechiness', 'acousticness', 'liveness'], dtype='object')
Nonlinear SVM (RBF) Accuracy (11 features): 78.22%
[[46  7]
 [15 33]]
Accuracy with 5 features: 79.21%
[[44  9]
 [12 36]]


# <span style="color:red">Intermediate conclusions</span>

### At this point, I have the following interpretation:
1. Non-linear SVM is good at classifying music that the person doesn't like. Even without any scaling or pre-processing, the true negatives are just as high.
2. The model has trouble recognizing music that is liked, hence the relatively high number of false positives, i.e. a low specificity score.
3. Introducing scaling improves the specificity, as expected from the literature.
4. Reducing the number of features has only a marginal effect on accuracy, so it is desirable to reach the same accuracy with fewer features in order to improve generalization. 
5. Introducing dummy (one-hot) variables for key and mode improves accuracy and specificity. 'danceability', 'energy', 'speechiness', 'liveness', 'key_1', 'key_5', 'key_8', 'key_10', 'key_11' are the most important features.
6. To improve the specificity score, literature suggests:
    1. adjusting the class weight, i.e. adjusting the weights inversely proportional to class frequency --> class_weight='balanced'
    2. tuning hyperparameters. Increasing the regularization parameter `C` (which weakens the regularization) might help, but may also lead to overfitting. Lowering gamma makes the decision boundary simpler, which might reduce false positives --> GridSearchCV or RandomizedSearchCV
    3. Adjusting the decision threshold with decision_function() to make the model more conservative in predicting positives.


## Hyperparameter tuning