In [1]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

%aimport features
%aimport models
from features.generate_features import access_spotify, get_song_ids, get_features
from models.predict_model import generate_param_grid, model_report, train_model
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from xgboost import XGBClassifier
from xgboost import DMatrix

# Generate train and test sets

In [11]:
# load features CSV
filepath = os.path.join(os.getcwd(), os.pardir, 'data', 'features.csv')
data = np.loadtxt(filepath, delimiter=',', skiprows=1)

# split data into labels and features
X = data[:, 1:]
y = data[:, 0]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Benchmark model performance

### Decision tree classifier

In [10]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=5)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.754761904762 
ROC: 0.761717673008 
Average precision score: 0.750119311033 
F1: 0.761020881671 
Hamming loss: 0.245238095238 

             precision    recall  f1-score   support

        0.0       0.69      0.82      0.75       186
        1.0       0.83      0.70      0.76       234

avg / total       0.77      0.75      0.76       420



Confusion Matrix
[[153  33]
 [ 70 164]]


### Random forest classifier

In [9]:
rfc = RandomForestClassifier(n_estimators=800, max_depth=8, min_samples_split=5, n_jobs=-1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.811904761905 
ROC: 0.818513923353 
Average precision score: 0.806973678615 
F1: 0.818390804598 
Hamming loss: 0.188095238095 

             precision    recall  f1-score   support

        0.0       0.74      0.88      0.80       186
        1.0       0.89      0.76      0.82       234

avg / total       0.82      0.81      0.81       420



Confusion Matrix
[[163  23]
 [ 56 178]]


### Gradient boosting classifer

In [8]:
gbc = GradientBoostingClassifier(n_estimators=800, max_depth=8, min_samples_split=5, learning_rate=0.1)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.830952380952 
ROC: 0.835607940447 
Average precision score: 0.821683579578 
F1: 0.839729119639 
Hamming loss: 0.169047619048 

             precision    recall  f1-score   support

        0.0       0.77      0.88      0.82       186
        1.0       0.89      0.79      0.84       234

avg / total       0.84      0.83      0.83       420



Confusion Matrix
[[163  23]
 [ 48 186]]


### XGBoost classifier

In [3]:
xbc = XGBClassifier(n_jobs=1)
xbc.fit(X_train, y_train)
y_pred = xbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.802380952381 
ROC: 0.806658395368 
Average precision score: 0.791066827909 
F1: 0.812641083521 
Hamming loss: 0.197619047619 

             precision    recall  f1-score   support

        0.0       0.74      0.84      0.79       186
        1.0       0.86      0.77      0.81       234

avg / total       0.81      0.80      0.80       420



Confusion Matrix
[[157  29]
 [ 54 180]]


# Get best parameters via hyper-parameter grid search

In [4]:
clf_initial = XGBClassifier(n_jobs=1)
param_grid = generate_param_grid()

In [5]:
best_model = train_model(X_train, y_train, n_jobs=1, cv=3)

In [6]:
best_clf = best_model['best_estimator']

y_pred = best_clf.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.835714285714 
ROC: 0.837675765095 
Average precision score: 0.819353705655 
F1: 0.847682119205 
Hamming loss: 0.164285714286 

             precision    recall  f1-score   support

        0.0       0.79      0.85      0.82       186
        1.0       0.88      0.82      0.85       234

avg / total       0.84      0.84      0.84       420



Confusion Matrix
[[159  27]
 [ 42 192]]


In [7]:
best_model

{'best_estimator': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
        max_depth=12, min_child_weight=1, missing=None, n_estimators=900,
        n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=0.6),
 'best_params': {'learning_rate': 0.01,
  'max_depth': 12,
  'n_estimators': 900,
  'subsample': 0.6}}