In [None]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys

# add the 'src' directory as one where we can import modules
# (normalised to an absolute path so the '..' segment is resolved, and only
# appended when missing so that re-running this cell does not keep growing
# sys.path with duplicate entries)
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

%aimport features
%aimport models
from features.generate_features import access_spotify, get_song_ids, get_features
from models.train_model import generate_param_grid, train_model
from models.predict_model import model_report
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from xgboost import DMatrix

# Generate train and test sets

In [None]:
# Fixed seed so the train/test partition (and therefore every metric reported
# further down) is reproducible after Restart Kernel -> Run All.
RANDOM_SEED = 42

# load features CSV (first row is skipped as a header; all remaining values
# must be numeric for np.loadtxt to parse them)
filepath = os.path.join(os.getcwd(), os.pardir, 'data', 'features.csv')
data = np.loadtxt(filepath, delimiter=',', skiprows=1)

# split data into labels and features:
# column 0 holds the label, every other column is a feature
X = data[:, 1:]
y = data[:, 0]

# split into train and test sets (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED)

# Benchmark model performance

### Decision tree classifier

In [None]:
# Benchmark 1: a single entropy-criterion decision tree with hand-picked
# hyper-parameters. random_state pins the feature permutation used when
# candidate splits tie, so the fitted tree is the same on every run.
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_split=5,
                             random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# model_report is the project helper from models.predict_model; its result is
# printed as-is (presumably a text summary of metrics -- confirm in src/models).
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

### Random forest classifier

In [None]:
# Benchmark 2: random forest of 800 depth-limited trees, trained on all cores.
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported metrics are reproducible.
rfc = RandomForestClassifier(n_estimators=800, max_depth=8, min_samples_split=5,
                             n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# model_report is the project helper from models.predict_model; its result is
# printed as-is (presumably a text summary of metrics -- confirm in src/models).
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

### Gradient boosting classifier

In [None]:
# Benchmark 3: scikit-learn gradient boosting with 800 boosting stages.
# random_state makes the per-split feature permutation deterministic so the
# fitted ensemble is the same on every run.
gbc = GradientBoostingClassifier(n_estimators=800, max_depth=8, min_samples_split=5,
                                 learning_rate=0.1, random_state=42)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

# model_report is the project helper from models.predict_model; its result is
# printed as-is (presumably a text summary of metrics -- confirm in src/models).
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

### XGBoost classifier

In [3]:
# Benchmark 4: XGBoost with library-default hyper-parameters, single-threaded.
# This is the untuned baseline that the searches further down try to beat.
xbc = XGBClassifier(n_jobs=1)
xbc.fit(X_train, y_train)
y_pred = xbc.predict(X_test)

# model_report is the project helper from models.predict_model; the recorded
# output of this cell shows it yields a printable metrics summary.
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

Accuracy: 0.797619047619 
ROC: 0.797052411663 
Average precision score: 0.747895107231 
F1: 0.806378132118 
Hamming loss: 0.202380952381 

             precision    recall  f1-score   support

        0.0       0.79      0.78      0.79       202
        1.0       0.80      0.81      0.81       218

avg / total       0.80      0.80      0.80       420



Confusion Matrix
[[158  44]
 [ 41 177]]


# Get best parameters via hyper-parameter search

## GridSearchCV

In [4]:
# Run the project's hyper-parameter search helper (models.train_model) on the
# training split. n_jobs=-1 and cv=3 are forwarded to the helper -- presumably
# "use all cores" and "3-fold cross-validation"; confirm in src/models/train_model.py.
# The result is indexed by key afterwards (e.g. 'best_estimator').
best_model = train_model(
    X_train,
    y_train,
    n_jobs=-1,
    cv=3
)

In [5]:
# Pull the winning estimator out of the search result returned by train_model
best_clf = best_model['best_estimator']

# Score the tuned model on the held-out test split
y_pred = best_clf.predict(X_test)
# model_report is the project helper from models.predict_model; the recorded
# output of this cell shows it yields a printable metrics summary.
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

Accuracy: 0.809523809524 
ROC: 0.808520301571 
Average precision score: 0.758037091592 
F1: 0.81981981982 
Hamming loss: 0.190476190476 

             precision    recall  f1-score   support

        0.0       0.81      0.78      0.80       202
        1.0       0.81      0.83      0.82       218

avg / total       0.81      0.81      0.81       420



Confusion Matrix
[[158  44]
 [ 36 182]]


## BayesSearchCV

In [None]:
# Bayesian hyper-parameter optimisation (scikit-optimize) over an XGBoost
# classifier that starts from library-default settings.
clf_initial = XGBClassifier()

# Search space for BayesSearchCV: each (low, high) integer tuple is treated as
# an inclusive integer range. It is defined inline here rather than taken from
# generate_param_grid(), whose grid is not used by this search.
param_grid = {
    'n_estimators': (500, 700),
    'max_depth': (4, 12)
}

# 100 optimisation steps, each scored by ROC AUC under 3-fold cross-validation,
# evaluated on all cores. random_state fixes the optimiser's sampling so the
# selected hyper-parameters are reproducible.
bayes_model = BayesSearchCV(clf_initial, search_spaces=param_grid, n_iter=100,
                            n_jobs=-1, cv=3, scoring='roc_auc', random_state=42)
bayes_model.fit(X_train, y_train)

# get best estimator (refit on the full training split by the search)
best_clf = bayes_model.best_estimator_

In [None]:
# Score the current best_clf on the held-out test split
y_pred = best_clf.predict(X_test)
# model_report is the project helper from models.predict_model; its result is
# printed as-is (presumably a text summary of metrics -- confirm in src/models).
report = model_report(y_pred=y_pred, y_true=y_test)
# rows = true labels, columns = predicted labels
cm = confusion_matrix(y_test, y_pred)

# Single-argument print(...) produces the same output under Python 2 and is
# valid syntax under Python 3 (the bare `print x` statement is not).
print(report)
print('\n')
print('Confusion Matrix')
print(cm)

In [None]:
# Display the repr of the baseline XGBClassifier (created with n_jobs=1 and
# fitted in the XGBoost benchmark cell) so its parameters can be inspected.
xbc