**program:** x_base_models<br>
**author:** chris chan<br>
**date:** jan 27,2021<br>
**desc:** Create baseline models using clean DF. No crossvalidation from these models are performed <br>

**datasources:**<br>
- sb_analytic (balanced df thru 2010)
- billboard analytic (hot 100 thru 2019)
- spotify random (random thru 2020)

- decision trees : https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-understanding-decision-trees-adb2165ccab7
- gridsearch cv: https://towardsdatascience.com/understanding-decision-tree-classification-with-scikit-learn-2ddf272731bd
- random forest : https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76
- model eval : https://towardsdatascience.com/how-to-best-evaluate-a-classification-model-2edb12bcc587

In [1]:
from importlib import reload
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, accuracy_score

# roc curve and auc
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [2]:
import matplotlib.cm as cm
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sqlalchemy import create_engine
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split

**1. Bring in data**

In [5]:
sbdf=pd.read_csv(r'../data/clean/sbdf_clean.csv')
sbdf.head(3)

Unnamed: 0,SpotifyID,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,loudness,is_hit,year
0,285pBltuF7vW8TeWk8hdRR,0.511,0.566,6,0,0.2,0.349,0.0,0.34,0.218,83.903,239836,-7.23,1,2018.0
1,7dt6x5M1jzdTEt8oCbisTK,0.68,0.578,10,1,0.04,0.331,0.0,0.135,0.341,145.038,231267,-5.804,1,2018.0
2,78QR3Wp35dqAhFEc2qAGjE,0.897,0.662,1,0,0.292,0.0852,0.0,0.534,0.389,112.511,145543,-6.903,1,2019.0


In [6]:
sbdf = sbdf[sbdf['year'] >= 1960] 

In [7]:
sbdf.columns = map(str.lower, sbdf.columns)

In [8]:
sbdf['decade'] = (sbdf.year//10*10).astype(int)

In [9]:
sbdf.decade.value_counts()

2000    6239
1990    3864
2010    3829
1980      91
1960       8
1970       4
Name: decade, dtype: int64

In [10]:
sbdf = sbdf[sbdf['decade'] >= 1990] 

In [11]:
sbdf['track_seconds'] = sbdf['duration_ms'] / 1000

In [12]:
sbdf.columns

Index(['spotifyid', 'danceability', 'energy', 'key', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'loudness', 'is_hit', 'year', 'decade', 'track_seconds'],
      dtype='object')

**Build models**

In [16]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import pylab
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

# Quickly plot ROC Curve and calculate AUC score for several algorithms to determine the best model


# Establishing X and y
y = sbdf['is_hit']
X = sbdf.drop(columns = ['is_hit','spotifyid','decade','year','duration_ms'])

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

X2_train, X2_test,  y2_train,y2_test = train_test_split(X_train,y_train, test_size = 0.25, random_state = 42)


# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X2_train, y2_train)
knn_ypred = knn.predict(X2_test)
knn_proba = knn.predict_proba(X2_test)[:,1]
fpr_knn, tpr_knn, _ = roc_curve(y2_test, knn_proba)
knn_auc = auc(fpr_knn, tpr_knn)
knn_f1 = f1_score(y2_test, knn_ypred)
knn_prec = precision_score(y2_test, knn_ypred)
knn_recall = recall_score(y2_test, knn_ypred)
knn_accuracy = accuracy_score(y2_test, knn_ypred)

#LR
lr = LogisticRegression(C = 0.5  )
lr.fit(X2_train,y2_train)
lr_ypred = lr.predict(X2_test)
lr_proba = lr.predict_proba(X2_test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(y2_test, lr_proba)
lr_auc = auc(fpr_lr, tpr_lr)
lr_f1 = f1_score(y2_test, lr_ypred)
lr_prec = precision_score(y2_test, lr_ypred)
lr_recall = recall_score(y2_test, lr_ypred)
lr_accuracy = accuracy_score(y2_test, lr_ypred)

# Decision Tree
tree = DecisionTreeClassifier()
tree.fit(X2_train, y2_train)
tree_ypred = tree.predict(X2_test)
tree_proba = tree.predict_proba(X2_test)[:,1]
fpr_tree, tpr_tree, _ = roc_curve(y2_test, tree_proba)
tree_auc = auc(fpr_tree, tpr_tree)
tree_f1 = f1_score(y2_test, tree_ypred)
tree_prec = precision_score(y2_test, tree_ypred)
tree_recall = recall_score(y2_test, tree_ypred)
tree_accuracy = accuracy_score(y2_test, tree_ypred)


# Random Forest
forest = RandomForestClassifier(n_estimators=100, max_features=10)
forest.fit(X2_train, y2_train)
forest_ypred = forest.predict(X2_test)
forest_proba = forest.predict_proba(X2_test)[:,1]
fpr_rf, tpr_rf, _ = roc_curve(y2_test, forest_proba)
forest_auc = auc(fpr_rf, tpr_rf)
forest_f1 = f1_score(y2_test, forest_ypred)
forest_prec = precision_score(y2_test, forest_ypred)
forest_recall = recall_score(y2_test, forest_ypred)
forest_accuracy = accuracy_score(y2_test, forest_ypred)

# Gradient Boosting
grad = GradientBoostingClassifier()
grad.fit(X2_train, y2_train)
grad_ypred = grad.predict(X2_test)
grad_proba = grad.predict_proba(X2_test)[:,1]
fpr_gb, tpr_gb, _ = roc_curve(y2_test, grad_proba)
grad_auc = auc(fpr_gb, tpr_gb)
grad_f1 = f1_score(y2_test, grad_ypred)
grad_prec = precision_score(y2_test, grad_ypred)
grad_recall = recall_score(y2_test, grad_ypred)
grad_accuracy = accuracy_score(y2_test, grad_ypred)


print('Precision SCORES',
'\n',
'KNN:',knn_prec,
'\n',
'LR:', lr_prec,
'\n',
'TREE:',tree_prec,
'\n',
'FOREST:',forest_prec,
'\n',
'GRAD:',grad_prec)

print('Recall SCORES',
'\n',
'KNN:',knn_recall,
'\n',
'LR:', lr_recall,
'\n',
'TREE:',tree_recall,
'\n',
'FOREST:',forest_recall,
'\n',
'GRAD:',grad_recall)


print('F1 SCORES',
'\n',
'KNN:',knn_f1,
'\n',
'LR:', lr_f1,
'\n',
'TREE:',tree_f1,
'\n',
'FOREST:',forest_f1,
'\n',
'GRAD:',grad_f1)

print('ACCURACY SCORES',
'\n',
'KNN:',knn_accuracy,
'\n',
'LR:', lr_accuracy,
'\n',
'TREE:',tree_accuracy,
'\n',
'FOREST:',forest_accuracy,
'\n',
'GRAD:',grad_accuracy)

print('AUC SCORES',
'\n',
'KNN:',knn_auc,
'\n',
'LR:',lr_auc,
'\n',
'TREE:',tree_auc,
'\n',
'FOREST:',forest_auc,
'\n',
'GRAD:',grad_auc)


# Gradient Boosting seems to work significantly better, so we'll tune the model using Gradient Boosting going forward

pylab.figure(figsize=(10,10))
pylab.plot(fpr_knn, tpr_knn, label='knn')
pylab.plot(fpr_lr, tpr_lr, label='lr')
pylab.plot(fpr_tree, tpr_tree, label='decision tree')
pylab.plot(fpr_rf, tpr_rf, label='random forest')
pylab.plot(fpr_gb, tpr_gb, label='gradient boosting')
pylab.plot([0,1],[0,1], linestyle='dashed')
pylab.xlabel('FPR', labelpad=10)
pylab.ylabel('TPR',rotation=0, labelpad=15)
pylab.legend(loc='upper left')
pylab.title('ROC Curves')
pylab.savefig('all_aucroc.png')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision SCORES 
 KNN: 0.7159940209267563 
 LR: 0.7572254335260116 
 TREE: 0.7648827420324714 
 FOREST: 0.7936425221469515 
 GRAD: 0.7975522692503825
Recall SCORES 
 KNN: 0.8403508771929824 
 LR: 0.9192982456140351 
 TREE: 0.743859649122807 
 FOREST: 0.8906432748538011 
 GRAD: 0.9146198830409357
F1 SCORES 
 KNN: 0.7732041969330105 
 LR: 0.8304278922345484 
 TREE: 0.7542247257634154 
 FOREST: 0.8393496831082944 
 GRAD: 0.8520839008444566
ACCURACY SCORES 
 KNN: 0.697524219590958 
 LR: 0.7696447793326158 
 TREE: 0.7025475421600287 
 FOREST: 0.7908144958736993 
 GRAD: 0.8051668460710442
AUC SCORES 
 KNN: 0.6849693484717675 
 LR: 0.818562500339366 
 TREE: 0.6904070761862875 
 FOREST: 0.8388753142528249 
 GRAD: 0.8530800849229233
