In [8]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

%aimport features
from features.generate_features import access_spotify, get_song_ids, get_features
from models.predict_model import model_report
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from xgboost import DMatrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Generate train and test sets

In [2]:
# load features CSV
filepath = os.path.join(os.getcwd(), os.pardir, 'data', 'features.csv')
data = np.loadtxt(filepath, delimiter=',', skiprows=1)

# split data into labels and features
X = data[:, 1:]
y = data[:, 0]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Benchmark model performance

### Decision tree classifier

In [3]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=5)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.739682539683 
ROC: 0.739490632023 
Average precision score: 0.803125996522 
F1: 0.730263157895 
Hamming loss: 0.260317460317 

             precision    recall  f1-score   support

        0.0       0.73      0.77      0.75       317
        1.0       0.75      0.71      0.73       313

avg / total       0.74      0.74      0.74       630



Confusion Matrix
[[244  73]
 [ 91 222]]


### Random forest classifier

In [4]:
rfc = RandomForestClassifier(n_estimators=800, max_depth=8, min_samples_split=5, n_jobs=-1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.807936507937 
ROC: 0.807959000615 
Average precision score: 0.854474929725 
F1: 0.807631160572 
Hamming loss: 0.192063492063 

             precision    recall  f1-score   support

        0.0       0.81      0.80      0.81       317
        1.0       0.80      0.81      0.81       313

avg / total       0.81      0.81      0.81       630



Confusion Matrix
[[255  62]
 [ 59 254]]


### Gradient boosting classifer

In [5]:
gbc = GradientBoostingClassifier(n_estimators=800, max_depth=8, min_samples_split=5, learning_rate=0.1)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.811111111111 
ROC: 0.811153888794 
Average precision score: 0.856699505409 
F1: 0.811410459588 
Hamming loss: 0.188888888889 

             precision    recall  f1-score   support

        0.0       0.82      0.80      0.81       317
        1.0       0.81      0.82      0.81       313

avg / total       0.81      0.81      0.81       630



Confusion Matrix
[[255  62]
 [ 57 256]]


### XGBoost classifier

In [7]:
xbc = XGBClassifier(n_estimators=800, n_jobs=-1, max_depth=8, learning_rate=0.01, subsample=0.8, colsample_bytree=0.9)
xbc.fit(X_train, y_train)
y_pred = xbc.predict(X_test)
report = model_report(y_pred=y_pred, y_true=y_test)
cm = confusion_matrix(y_test, y_pred)
print report
print '\n'
print 'Confusion Matrix'
print cm

Accuracy: 0.814285714286 
ROC: 0.814328619949 
Average precision score: 0.859075625749 
F1: 0.814580031696 
Hamming loss: 0.185714285714 

             precision    recall  f1-score   support

        0.0       0.82      0.81      0.81       317
        1.0       0.81      0.82      0.81       313

avg / total       0.81      0.81      0.81       630



Confusion Matrix
[[256  61]
 [ 56 257]]


# Get best parameters via hyper-parameter grid search