In [1]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

%aimport features
%aimport models
from features.generate_features import access_spotify, get_song_ids, get_features
from models.predict_model import generate_param_grid, model_report, train_model
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from xgboost import XGBClassifier
from xgboost import DMatrix

# Generate train and test sets

In [2]:
# load features CSV
filepath = os.path.join(os.getcwd(), os.pardir, 'data', 'features.csv')
data = np.loadtxt(filepath, delimiter=',', skiprows=1)

# split data into labels and features
X = data[:, 1:]
y = data[:, 0]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Benchmark model performance

### Decision tree classifier

In [7]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=5)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.72380952381 
ROC: 0.72522481606 
Average precision score: 0.683172485007 
F1: 0.721153846154 
Hamming loss: 0.27619047619 

             precision    recall  f1-score   support

        0.0       0.69      0.76      0.73       202
        1.0       0.76      0.69      0.72       218

avg / total       0.73      0.72      0.72       420



Confusion Matrix
[[154  48]
 [ 68 150]]


### Random forest classifier

In [6]:
rfc = RandomForestClassifier(n_estimators=800, max_depth=8, min_samples_split=5, n_jobs=-1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.795238095238 
ROC: 0.794758833682 
Average precision score: 0.745871559633 
F1: 0.803652968037 
Hamming loss: 0.204761904762 

             precision    recall  f1-score   support

        0.0       0.79      0.78      0.79       202
        1.0       0.80      0.81      0.80       218

avg / total       0.80      0.80      0.80       420



Confusion Matrix
[[158  44]
 [ 42 176]]


### Gradient boosting classifer

In [5]:
gbc = GradientBoostingClassifier(n_estimators=800, max_depth=8, min_samples_split=5, learning_rate=0.1)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.785714285714 
ROC: 0.785039513126 
Average precision score: 0.735181183805 
F1: 0.795454545455 
Hamming loss: 0.214285714286 

             precision    recall  f1-score   support

        0.0       0.78      0.77      0.78       202
        1.0       0.79      0.80      0.80       218

avg / total       0.79      0.79      0.79       420



Confusion Matrix
[[155  47]
 [ 43 175]]


### XGBoost classifier

In [4]:
xbc = XGBClassifier(n_jobs=8)
xbc.fit(X_train, y_train)
y_pred = xbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.795238095238 
ROC: 0.794758833682 
Average precision score: 0.745871559633 
F1: 0.803652968037 
Hamming loss: 0.204761904762 

             precision    recall  f1-score   support

        0.0       0.79      0.78      0.79       202
        1.0       0.80      0.81      0.80       218

avg / total       0.80      0.80      0.80       420



Confusion Matrix
[[158  44]
 [ 42 176]]


# Get best parameters via hyper-parameter grid search

In [3]:
clf_initial = XGBClassifier(n_jobs=8)
param_grid = generate_param_grid()

In [None]:
best_model = train_model(X_train, y_train, n_jobs=8, cv=5)

In [None]:
best_clf = best_model['best_estimator']

y_pred = best_clf.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm