# What's in this notebook?
Developing and testing models.
- Models:
    - Logistic regression
    - KNN
    - SVM
    - Random Forest
    - XGBoost
    
- Evaluation Metrics:
    - ROC-AUC
    - Accuracy score
    - F1 score
    
- Other things to note:
    - PCA used in conjunction with models

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import pickle
# I'll import sklearn models as needed

In [31]:
with open('model_data.pickle', 'rb') as f:
    data = pickle.load(f)

In [69]:
pd.set_option('display.max_columns', 500)

In [71]:
len(data.columns)

26

In [32]:
data[data.isnull().any(axis=1)]

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,...,sec_duration_range,seg_duration_range,sec_loudness_range,sec_key_range,sec_tempo_range,sec_mode_range,sec_time_signature_range,no_unique_pitches,no_unique_timbres,mean_pitch
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4cCio6f3kmmufjWVsEfMu0,0.964,0.294,16027,0.0783,0.923,0,0.597,-23.044,1,0.04,...,,,,,,,,328,130,0.319427
6AyIvQ7Npap06gZzsHU3Hy,0.927,0.179,15427,0.141,0.916,7,0.189,-21.892,0,0.0555,...,,,,,,,,347,160,0.387112


In [33]:
data.drop(['4cCio6f3kmmufjWVsEfMu0', '6AyIvQ7Npap06gZzsHU3Hy'], inplace=True)

In [34]:
X = data.drop('ballet', axis=1)
y = data['ballet']

In [35]:
from sklearn.preprocessing import StandardScaler
# scaling data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X))
columns = set(data.columns) - {'ballet'}
X_scaled.columns = columns
X_scaled


  return self.partial_fit(X, y)
  """


Unnamed: 0,no_unique_timbres,tempo,valence,danceability,instrumentalness,duration_ms,acousticness,speechiness,sec_time_signature_range,seg_duration_range,...,sec_key_range,liveness,time_signature,sec_mode_range,sec_duration_range,no_unique_pitches,no_segments,mean_pitch,mode,no_sections
0,0.381652,-0.868784,-0.647385,-1.020334,0.961208,-0.597610,-0.345203,-3.066970,0.661342,-0.399516,...,-0.114207,-0.114207,-0.751827,-0.206105,-0.293466,0.466921,-0.695168,-0.311422,-0.049848,-0.230527
1,-3.916531,-0.868784,-0.816355,-0.581445,0.636082,-1.174172,-0.418860,-2.066163,0.661342,-0.250176,...,-0.616202,-0.616202,-0.902028,-0.585274,-0.550939,0.466921,-0.695168,-0.477337,-1.275272,-0.303023
2,0.566415,-1.271590,-0.650258,-0.380097,0.569852,1.708639,3.735404,-0.018884,-1.512077,-0.360216,...,-0.614479,-0.614479,0.348017,0.552233,-0.621966,0.466921,-0.695168,0.131017,-0.301217,0.188947
3,0.391376,0.828753,-0.828020,1.725991,-1.462189,0.843796,4.663484,0.347440,-1.512077,-0.175506,...,-1.294138,-1.294138,-0.875000,0.173064,-0.641776,0.466921,0.617859,-0.138594,-0.254085,0.222484
4,0.673384,0.713665,-0.918203,1.793956,0.726394,1.132077,-0.433591,-0.151547,0.661342,0.614424,...,-0.860099,-0.860099,-0.646890,-1.343613,-0.608379,-2.141691,-0.695168,-0.581033,-0.615428,-0.555043
5,0.371927,-1.134924,-0.072567,-0.671500,0.774561,1.132077,0.796483,-0.731865,0.661342,-0.301266,...,-0.300452,-0.300452,-0.499783,-0.206105,-0.515867,0.466921,1.274372,0.587282,0.484312,0.517400
6,0.595588,-1.142117,-0.640317,-0.900884,-1.170177,1.132077,1.555152,-1.407847,-1.512077,0.048504,...,-1.064762,-1.064762,-0.968621,0.931402,-0.635773,0.466921,0.617859,-1.085690,0.500022,-1.069639
7,0.498344,0.116650,-0.646282,0.715001,0.503623,-0.885891,0.752289,-0.325042,0.661342,-0.037956,...,-0.798268,-0.798268,-0.822735,-0.585274,-0.567937,0.466921,-1.351682,0.310758,-0.364059,0.349508
8,0.780352,-0.127910,-0.913520,-1.040214,-0.122547,1.420358,0.258786,-2.457153,0.661342,-0.049746,...,-1.318813,-1.318813,-1.190055,-0.585274,-0.793056,-2.141691,-0.695168,-3.083578,-0.772534,-3.736436
9,0.391376,-1.490256,-0.324250,-0.888140,0.792624,1.708639,-0.455688,-1.476179,0.661342,-0.085116,...,0.187176,0.187176,-0.600975,0.552233,1.006568,0.466921,0.617859,0.213974,0.028705,0.239879


In [100]:
data

Unnamed: 0_level_0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,ballet,no_segments,no_sections,sec_duration_range,seg_duration_range,sec_loudness_range,sec_key_range,sec_tempo_range,sec_mode_range,sec_time_signature_range,no_unique_pitches,no_unique_timbres,mean_pitch
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
20BHajwSZPTbGbsFSxurtU,0.948,0.1610,142160,0.00474,0.946000,3,0.1220,-41.235,1,0.0356,100.496,4,0.0321,1,338,7,36.51099,36.51099,12.996,8.0,17.155,1.0,1.0,806,288,0.441292
36Jo9Y3bVip2mLvBNbgNaN,0.506,0.1610,91173,0.05640,0.838000,1,0.1120,-35.230,1,0.0394,132.687,3,0.0365,1,251,6,24.73630,24.73630,11.151,7.0,9.006,1.0,1.0,782,210,0.437831
4PJDfvW1Bw3a8fZy0C6Bjq,0.967,0.1050,141293,0.08010,0.816000,11,0.6760,-22.946,0,0.0366,66.158,3,0.0347,1,406,6,24.77672,24.77672,26.506,10.0,6.758,1.0,1.0,870,272,0.461314
1Bi0JfOAavN55nKBYBoONe,0.949,0.3970,87653,0.32800,0.141000,8,0.8020,-20.748,0,0.0413,103.014,1,0.2310,1,325,6,8.83480,8.83480,11.483,9.0,6.131,1.0,3.0,831,275,0.462915
1lxgxl9HWcFbx7FYjH0yFl,0.978,0.3810,60440,0.33600,0.868000,9,0.1100,-23.742,1,0.0614,106.060,4,0.0797,1,249,4,19.01551,19.01551,14.285,5.0,7.188,0.0,1.0,767,252,0.425802
69ftg7xbry9efOJuA2rr0a,0.947,0.1240,315613,0.04580,0.884000,9,0.2770,-27.224,1,0.0381,88.513,4,0.0379,1,1024,14,32.14247,32.14247,16.092,8.0,10.116,1.0,4.0,936,322,0.476991
4bUmsjPY6B6UnrJWiqXjaX,0.970,0.1230,144293,0.01880,0.238000,9,0.3800,-31.280,0,0.0470,78.477,4,0.0375,1,494,7,14.21498,14.21498,10.333,11.0,6.321,1.0,3.0,694,323,0.401239
7plfKTRZxPXE05euh0wQmQ,0.960,0.2980,142493,0.20900,0.794000,2,0.2710,-24.783,1,0.0448,175.134,4,0.2930,1,579,8,20.46582,20.46582,12.125,7.0,8.468,1.0,0.0,896,268,0.468978
0QKbNkXMmZ7xgNHGqdkEnL,0.989,0.2640,61853,0.00240,0.586000,10,0.2040,-37.576,1,0.0445,53.377,4,0.3950,1,136,4,8.25603,8.25603,7.613,7.0,1.343,0.0,1.0,405,242,0.273948
5ytXZyg6GzZQWkBvAsjkHN,0.949,0.0746,239667,0.02030,0.890000,11,0.1070,-31.690,1,0.0436,79.035,4,0.0359,1,766,10,43.58014,43.58014,14.849,10.0,58.301,1.0,3.0,882,293,0.463745


## Functions Used All Over

In [86]:
# function to evaluate models

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

def evaluation(y_test, y_pred):
    print('classification report: \n')
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
    print("Accuracy score:", accuracy_score(y_test, y_pred))

## Basic Models
All data thrown into the basic model, just to see what happens.

### Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
# use scaled data for interpretability
# have random state for reproducability
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [81]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [82]:
coefs = {X_scaled.columns[i]: lr.coef_[0][i] for i in range (len(lr.coef_[0]))}
coefs

{'no_unique_timbres': -0.0809215629759104,
 'tempo': 0.1298113896062884,
 'valence': -1.5970014888784207,
 'danceability': -0.06548472702465596,
 'instrumentalness': 0.07591734104412191,
 'duration_ms': 0.09373171212513265,
 'acousticness': 0.3981428317384476,
 'speechiness': -0.03295304071578723,
 'sec_time_signature_range': 0.16578279981599361,
 'seg_duration_range': 0.04261170554647233,
 'sec_tempo_range': 0.0780097898424133,
 'sec_loudness_range': 0.031085584769035186,
 'energy': -0.1802578782587597,
 'key': 0.5490815555359821,
 'loudness': -1.422808074514123,
 'sec_key_range': -0.24980193734109948,
 'liveness': -0.24980193734109948,
 'time_signature': -0.18187726723275513,
 'sec_mode_range': 0.06124123946409796,
 'sec_duration_range': 0.16561093760674808,
 'no_unique_pitches': -0.10238250404269321,
 'no_segments': 0.045884099709049685,
 'mean_pitch': 0.7870807881945402,
 'mode': -0.6302631460934892,
 'no_sections': -0.49703779968968487}

In [83]:
evaluation(y_test, y_pred)

classification report: 

              precision    recall  f1-score   support

           0       0.80      0.79      0.80       215
           1       0.79      0.80      0.80       210

   micro avg       0.80      0.80      0.80       425
   macro avg       0.80      0.80      0.80       425
weighted avg       0.80      0.80      0.80       425

ROC AUC: 0.7954042081949059
Accuracy score: 0.7952941176470588


Well that was unexpected... It's surprisingly high without being so eerily accurate that I suspect overfitting... Let's see how all the other models fair!

### KNN

In [84]:
# using non-scaled data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
y_pred = knc.predict(X_test)

In [85]:
evaluation(y_test, y_pred)

classification report: 

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       215
           1       0.80      0.75      0.78       210

   micro avg       0.79      0.79      0.79       425
   macro avg       0.79      0.79      0.79       425
weighted avg       0.79      0.79      0.79       425

ROC AUC: 0.7854928017718714
Accuracy score: 0.7858823529411765


### SVM

In [87]:
# need to use scaled data for SVMs
# have random state for reproducability
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [88]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)



In [89]:
evaluation(y_test, y_pred)

classification report: 

              precision    recall  f1-score   support

           0       0.82      0.81      0.82       215
           1       0.81      0.82      0.82       210

   micro avg       0.82      0.82      0.82       425
   macro avg       0.82      0.82      0.82       425
weighted avg       0.82      0.82      0.82       425

ROC AUC: 0.8165559246954597
Accuracy score: 0.8164705882352942


### Random Forest

In [91]:
# using non-scaled data
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [92]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)



In [96]:
evaluation(y_test, y_pred)

classification report: 

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       215
           1       0.81      0.77      0.79       210

   micro avg       0.80      0.80      0.80       425
   macro avg       0.80      0.80      0.80       425
weighted avg       0.80      0.80      0.80       425

ROC AUC: 0.7996677740863787
Accuracy score: 0.8


### XGBoost

In [95]:
!pip install xgboost
import xgboost as xgb

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz (676kB)
[K    100% |████████████████████████████████| 686kB 6.6MB/s ta 0:00:01
Building wheels for collected packages: xgboost
  Building wheel for xgboost (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/hannah/Library/Caches/pip/wheels/e9/48/4d/de4187b5270dff71d3697c5a7857a1e2d9a0c63a28b3462eeb
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [97]:
# can use the train and test from Random Forest
xgbc = xgb.XGBRFClassifier()
xgbc.fit(X_train, y_train)
y_pred = xgbc.predict(X_test)

In [98]:
evaluation(y_test, y_pred)

classification report: 

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       215
           1       0.80      0.82      0.81       210

   micro avg       0.81      0.81      0.81       425
   macro avg       0.81      0.81      0.81       425
weighted avg       0.81      0.81      0.81       425

ROC AUC: 0.8095238095238095
Accuracy score: 0.8094117647058824


## Parameter Tuning & Visualizations
- tuning parameters for each model
- visualizing ROC-AUC, feature importance (for interpretable models - logistic regression & random forest)

## Feature Selection
- PCA
- Regularisation (for logistic regression)
- New visuals if better models are found

## 