# Spotify Hit Predictor

## Christopher El Khouri

### 625.740

### 17/11/2021

Importing necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

  import pandas.util.testing as tm


Function backwardElimination performs backward elimination on a Logistic Regression model

The method take the following input parameters:
* X1: The x values
* sl: The level of significance upon which to compare p-values
* y: The y values
* X_test (optional): The test set X values

The method returns:
* X1: The X values with the eliminated features removed
* regressor_OLS: The final Logit model
* X_test: The X_test values with the eliminated features removed
* True/False: An indicator whether or not backward elimination took place. True indicates that it did, false otherwise.

In [2]:
def backwardElimination(X1, sl,y,X_test=pd.DataFrame()):
    numVars = len(X1.iloc[0])
    if(numVars<3):
        return X1,np.nan,X_test,False
    else:
        regressor_OLS = sm.Logit(y, sm.add_constant(X1)).fit(disp=0,maxiter=1000)
        maxVar = max(regressor_OLS.pvalues[1:])
        if maxVar > sl:
            for j in range(0, numVars):
                if (regressor_OLS.pvalues[j+1].astype(float) == maxVar):
                    X1 = X1.drop([X1.iloc[:,j].name],axis=1)
                    if(X_test.empty==False):
                        X_test=X_test.drop([X_test.iloc[:,j].name],axis=1)
                    break
            return backwardElimination(X1,sl,y,X_test)
        else:
            print(regressor_OLS.summary())
            return X1,regressor_OLS,X_test,True

Importing the data

In [3]:
df=pd.read_csv('data_final.csv')
df=df.iloc[:,1:]
df.head()


Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,genres_1,genres_2,genres_3,genres_4,genres,Decade,key_name,mode_name,key_full,hit_flop
0,Wild Things,Alessia Cara,spotify:track:2ZyuwVvV6Z3XJaXIFbspeE,0.741,0.626,1,-4.826,0,0.0886,0.02,...,canadian contemporary r&b,canadian pop,dance pop,electropop,Pop,10s,C#,Minor,C# Minor,Hit
1,Love Someone,Lukas Graham,spotify:track:2JqnpexlO9dmvjUMCaLCLJ,0.55,0.415,9,-6.557,0,0.052,0.161,...,danish pop,pop,scandipop,0,Pop,10s,A,Minor,A Minor,Hit
2,Here's To Never Growing Up,Avril Lavigne,spotify:track:0qwcGscxUHGZTgq0zcaqk1,0.482,0.873,0,-3.145,1,0.0853,0.0111,...,canadian pop,candy pop,dance pop,pop,Pop,10s,C,Major,C Major,Hit
3,Crawling Back To You,Daughtry,spotify:track:6BDtTzjbJ5kKKSWcJT8MlX,0.438,0.919,0,-2.91,0,0.0495,0.00674,...,alternative metal,neo mellow,pop rock,post-grunge,Pop,10s,C,Minor,C Minor,Hit
4,Faster,Matt Nathanson,spotify:track:6plKFdrBnKF0y3CRuceTDh,0.742,0.853,9,-4.147,1,0.0393,0.00743,...,acoustic pop,indiecoustica,neo mellow,pop rock,Pop,10s,A,Major,A Major,Hit


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33354 entries, 0 to 33353
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             33354 non-null  object 
 1   artist            33354 non-null  object 
 2   uri               33354 non-null  object 
 3   danceability      33354 non-null  float64
 4   energy            33354 non-null  float64
 5   key               33354 non-null  int64  
 6   loudness          33354 non-null  float64
 7   mode              33354 non-null  int64  
 8   speechiness       33354 non-null  float64
 9   acousticness      33354 non-null  float64
 10  instrumentalness  33354 non-null  float64
 11  liveness          33354 non-null  float64
 12  valence           33354 non-null  float64
 13  tempo             33354 non-null  float64
 14  duration_ms       33354 non-null  int64  
 15  time_signature    33354 non-null  int64  
 16  chorus_hit        33354 non-null  float6

Removing the following features as they will not be needed for modeling:

* track
* artist
* uri
* key: key and mode are combined in another feature called 'key_full'
* mode
* track_id
* artist_id
* genres_1: genres are combined and processed into the feature 'genres'
* genres_2
* genres_3
* genres_4
* key_name
* mode_name
* hit_flop: The target variable, this has been converted under the feature 'target'

In [5]:
df_us=df.iloc[:,[3,4,6,8,9,10,11,12,13,14,15,16,17,18,25,26,29]]
df_us.head()


Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,genres,Decade,key_full
0,0.741,0.626,-4.826,0.0886,0.02,0.0,0.0828,0.706,108.029,188493,4,41.18681,10,1,Pop,10s,C# Minor
1,0.55,0.415,-6.557,0.052,0.161,0.0,0.108,0.274,172.065,205463,4,44.89147,9,1,Pop,10s,A Minor
2,0.482,0.873,-3.145,0.0853,0.0111,0.0,0.409,0.737,165.084,214320,4,32.17301,12,1,Pop,10s,C Major
3,0.438,0.919,-2.91,0.0495,0.00674,0.0,0.158,0.195,151.026,225813,4,34.01444,8,1,Pop,10s,C Minor
4,0.742,0.853,-4.147,0.0393,0.00743,5e-06,0.332,0.95,107.03,208280,4,43.42073,10,1,Pop,10s,A Major


In [6]:
df_us.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33354 entries, 0 to 33353
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      33354 non-null  float64
 1   energy            33354 non-null  float64
 2   loudness          33354 non-null  float64
 3   speechiness       33354 non-null  float64
 4   acousticness      33354 non-null  float64
 5   instrumentalness  33354 non-null  float64
 6   liveness          33354 non-null  float64
 7   valence           33354 non-null  float64
 8   tempo             33354 non-null  float64
 9   duration_ms       33354 non-null  int64  
 10  time_signature    33354 non-null  int64  
 11  chorus_hit        33354 non-null  float64
 12  sections          33354 non-null  int64  
 13  target            33354 non-null  int64  
 14  genres            33354 non-null  object 
 15  Decade            33354 non-null  object 
 16  key_full          33354 non-null  object

In [7]:
df_us.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
count,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0
mean,0.54162,0.5879,-9.989044,0.066563,0.35032,0.141713,0.192361,0.544328,119.142088,232196.3,3.899652,39.224273,10.4362,0.494363
std,0.176678,0.251042,5.262882,0.066292,0.335579,0.294732,0.15421,0.264872,28.45556,96037.21,0.406641,16.878675,4.046238,0.499976
min,0.0588,0.000251,-49.253,0.022,0.0,0.0,0.0136,0.0,31.988,15168.0,0.0,0.0,0.0,0.0
25%,0.424,0.407,-12.5455,0.0334,0.0347,0.0,0.0935,0.335,97.507,175310.2,4.0,27.553977,8.0,0.0
50%,0.554,0.613,-8.992,0.0428,0.236,7.7e-05,0.131,0.56,117.348,218747.0,4.0,35.66539,10.0,0.0
75%,0.67,0.794,-6.183,0.067,0.648,0.0301,0.253,0.766,136.2315,265840.0,4.0,47.06538,12.0,1.0
max,0.988,1.0,3.744,0.957,0.996,1.0,0.989,0.993,214.848,2223827.0,5.0,433.182,73.0,1.0


Scaling so as to ensure that all features are between 0 and 1:

In [8]:
df_us_norm=df_us.copy()
df_us_norm.iloc[:,2]=(df_us_norm.iloc[:,2]-np.min(df_us_norm.iloc[:,2]))/(np.max(df_us_norm.iloc[:,2])-np.min(df_us_norm.iloc[:,2]))
df_us_norm.iloc[:,8]=(df_us_norm.iloc[:,8]-np.min(df_us_norm.iloc[:,8]))/(np.max(df_us_norm.iloc[:,8])-np.min(df_us_norm.iloc[:,8]))
df_us_norm.iloc[:,9]=(df_us_norm.iloc[:,9]-np.min(df_us_norm.iloc[:,9]))/(np.max(df_us_norm.iloc[:,9])-np.min(df_us_norm.iloc[:,9]))
df_us_norm.iloc[:,11]=(df_us_norm.iloc[:,11]-np.min(df_us_norm.iloc[:,11]))/(np.max(df_us_norm.iloc[:,11])-np.min(df_us_norm.iloc[:,11]))
df_us_norm.iloc[:,12]=(df_us_norm.iloc[:,12]-np.min(df_us_norm.iloc[:,12]))/(np.max(df_us_norm.iloc[:,12])-np.min(df_us_norm.iloc[:,12]))


In [9]:
df_us_norm.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
count,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0,33354.0
mean,0.54162,0.5879,0.740871,0.066563,0.35032,0.141713,0.192361,0.544328,0.476616,0.098262,3.899652,0.090549,0.142962,0.494363
std,0.176678,0.251042,0.099305,0.066292,0.335579,0.294732,0.15421,0.264872,0.155614,0.043482,0.406641,0.038964,0.055428,0.499976
min,0.0588,0.000251,0.0,0.022,0.0,0.0,0.0136,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.424,0.407,0.692634,0.0334,0.0347,0.0,0.0935,0.335,0.358301,0.072507,4.0,0.063608,0.109589,0.0
50%,0.554,0.613,0.759685,0.0428,0.236,7.7e-05,0.131,0.56,0.466805,0.092173,4.0,0.082333,0.136986,0.0
75%,0.67,0.794,0.812688,0.067,0.648,0.0301,0.253,0.766,0.570073,0.113495,4.0,0.10865,0.164384,1.0
max,0.988,1.0,1.0,0.957,0.996,1.0,0.989,0.993,1.0,1.0,5.0,1.0,1.0,1.0


Creating dummy variables for the categoricals:

In [10]:
# DUMIZ
df_us_norm.iloc[:,10]=df_us_norm.iloc[:,10].astype(str)
dumiz = pd.get_dummies(df_us_norm.iloc[:,[10,14,15,16]],drop_first=True)
df_fin=pd.concat([df_us_norm, dumiz], axis=1)
df_fin=df_fin.drop(['time_signature','genres','Decade','key_full'],axis=1)
df_fin.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_full_Eb Major,key_full_Eb Minor,key_full_F Major,key_full_F Minor,key_full_F# Major,key_full_F# Minor,key_full_G Major,key_full_G Minor,key_full_G# Major,key_full_G# Minor
0,0.741,0.626,0.838293,0.0886,0.02,0.0,0.0828,0.706,0.415843,0.078475,...,0,0,0,0,0,0,0,0,0,0
1,0.55,0.415,0.805631,0.052,0.161,0.0,0.108,0.274,0.766034,0.086159,...,0,0,0,0,0,0,0,0,0,0
2,0.482,0.873,0.870012,0.0853,0.0111,0.0,0.409,0.737,0.727857,0.090169,...,0,0,0,0,0,0,0,0,0,0
3,0.438,0.919,0.874446,0.0495,0.00674,0.0,0.158,0.195,0.650979,0.095372,...,0,0,0,0,0,0,0,0,0,0
4,0.742,0.853,0.851105,0.0393,0.00743,5e-06,0.332,0.95,0.41038,0.087434,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33354 entries, 0 to 33353
Data columns (total 58 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   danceability           33354 non-null  float64
 1   energy                 33354 non-null  float64
 2   loudness               33354 non-null  float64
 3   speechiness            33354 non-null  float64
 4   acousticness           33354 non-null  float64
 5   instrumentalness       33354 non-null  float64
 6   liveness               33354 non-null  float64
 7   valence                33354 non-null  float64
 8   tempo                  33354 non-null  float64
 9   duration_ms            33354 non-null  float64
 10  chorus_hit             33354 non-null  float64
 11  sections               33354 non-null  float64
 12  target                 33354 non-null  int64  
 13  time_signature_1       33354 non-null  uint8  
 14  time_signature_3       33354 non-null  uint8  
 15  ti

Re-organizing and shuffling the data:

In [12]:
X=df_fin.copy()
X=X.drop('target',axis=1)

y=df_fin['target']

df_fin_1=pd.concat([X,y],axis=1)

df_fin_1=df_fin_1.sample(frac=1)

df_fin_1.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_full_Eb Minor,key_full_F Major,key_full_F Minor,key_full_F# Major,key_full_F# Minor,key_full_G Major,key_full_G Minor,key_full_G# Major,key_full_G# Minor,target
8930,0.578,0.87,0.808178,0.0323,0.00268,0.412,0.0346,0.667,0.590638,0.09597,...,0,0,0,0,0,0,0,0,0,1
13826,0.53,0.0645,0.554182,0.0451,0.974,0.31,0.116,0.574,0.277179,0.054908,...,0,0,0,0,0,1,0,0,0,0
2797,0.301,0.97,0.862407,0.142,5.2e-05,0.715,0.595,0.211,0.480603,0.117322,...,0,0,0,0,0,0,0,1,0,0
5131,0.483,0.186,0.517558,0.0409,0.939,0.873,0.115,0.498,0.650126,0.138901,...,0,0,0,0,0,0,1,0,0,0
30312,0.138,0.712,0.829896,0.069,0.746,0.773,0.0714,0.133,0.371262,0.085074,...,0,0,0,0,0,0,0,0,0,0


Method fivefoldcv divides a dataframe df into 5 folds for cross validation 
The method take the following input parameters:
*  df: The dataset

The method returns:
* folddfs: A 5x2 array containing with each row containing a column for
* the training dataset and a column for the test dataset

In [13]:
def fivefoldcv(df):
        
        folds=[]
        tl=len(df)
        start=0
        for i in range(5):
            end=np.round((tl*(i+1)/5),0).astype(int)
            df_1=df.iloc[start:end,:].copy()
            folds.append(df_1)
            start=end
        folddfs=[]
        for i in range(5):
            folddf=[]
            df_2=pd.DataFrame(columns=df.columns)
            for j in range(5):
                if(j==i):
                    df_3=folds[j]
                else:
                    
                    df_2=pd.concat([df_2,folds[j]])
            folddf.append(df_2.reset_index(drop=True))
            folddf.append(df_3.reset_index(drop=True))
            folddfs.append(folddf)
        return folddfs

In [14]:
mds=['Logistic Regression','Logistic Regression-BE','Decision Tree','K-NN','Neural Network']

errs=[]

fp_all=[]
fn_all=[]

In [15]:
xx=fivefoldcv(df_fin_1)

Modelling our data with logisitic regression:

In [16]:
err_lr=[]
fp_lr=[]
fn_lr=[]
for i in range(len(xx)):
    df_train=xx[i][0]
    df_test=xx[i][1]
    X_train=df_train.iloc[:,:-1]
    y_train=df_train.iloc[:,-1]
    X_test=df_test.iloc[:,:-1]
    y_test=df_test.iloc[:,-1]
    sm_model = sm.Logit(y_train.astype(float), sm.add_constant(X_train).astype(float)).fit(disp=0,maxiter=3000)
    print(sm_model.summary())
    y_pred=np.around(sm_model.predict(sm.add_constant(X_test)))
    count=0
    fps=0
    fns=0
    for i in range(len(y_pred)):
        if(y_pred.iloc[i]!=y_test.iloc[i]):
            count+=1
            if(y_pred.iloc[i]==0):
                fns+=1
            else:
                fps+=1

    
    err=count/len(y_pred)
    fpp=fps/len(y_test[y_test==0])
    fnn=fns/len(y_test[y_test==1])
    err_lr.append(err)
    fp_lr.append(fpp)
    fn_lr.append(fnn)

errs.append(err_lr)
fp_all.append(fp_lr)
fn_all.append(fn_lr)




                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                26683
Model:                          Logit   Df Residuals:                    26625
Method:                           MLE   Df Model:                           57
Date:                Tue, 16 Nov 2021   Pseudo R-squ.:                  0.4344
Time:                        08:33:37   Log-Likelihood:                -10460.
converged:                      False   LL-Null:                       -18493.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -21.4173   1.73e+04     -0.001      0.999   -3.39e+04    3.39e+04
danceability              4.0341      0.155     25.983      0.000       3.730       4.338
energy  

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                26683
Model:                          Logit   Df Residuals:                    26625
Method:                           MLE   Df Model:                           57
Date:                Tue, 16 Nov 2021   Pseudo R-squ.:                    -inf
Time:                        08:34:16   Log-Likelihood:                   -inf
converged:                      False   LL-Null:                       -18494.
Covariance Type:            nonrobust   LLR p-value:                     1.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  7.196e+06   1.87e+06      3.855      0.000    3.54e+06    1.09e+07
danceability              4.0297      0.154     26.101      0.000       3.727       4.332
energy  



                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                26684
Model:                          Logit   Df Residuals:                    26626
Method:                           MLE   Df Model:                           57
Date:                Tue, 16 Nov 2021   Pseudo R-squ.:                  0.4353
Time:                        08:34:54   Log-Likelihood:                -10444.
converged:                      False   LL-Null:                       -18495.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -23.5880   5.11e+04     -0.000      1.000      -1e+05       1e+05
danceability              4.0350      0.154     26.204      0.000       3.733       4.337
energy  

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                26683
Model:                          Logit   Df Residuals:                    26625
Method:                           MLE   Df Model:                           57
Date:                Tue, 16 Nov 2021   Pseudo R-squ.:                  0.4342
Time:                        08:35:33   Log-Likelihood:                -10464.
converged:                      False   LL-Null:                       -18494.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -21.7796    2.3e+04     -0.001      0.999   -4.52e+04    4.51e+04
danceability              4.0955      0.154     26.597      0.000       3.794       4.397
energy  



Based on the regression results above, time_signature shall be eliminated due to its hgih p-value:

In [17]:
df_fin_2=df_fin_1.drop(['time_signature_1','time_signature_3','time_signature_4','time_signature_5'],axis=1)

The updated dataframe, df_fin_2, proceeds to automated backward elimination:

In [18]:
X=df_fin_2.iloc[:,:-1]
y=df_fin_2.iloc[:,-1]
be=backwardElimination(X, 0.05,y)

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                33354
Model:                          Logit   Df Residuals:                    33314
Method:                           MLE   Df Model:                           39
Date:                Tue, 16 Nov 2021   Pseudo R-squ.:                  0.4329
Time:                        08:35:36   Log-Likelihood:                -13109.
converged:                       True   LL-Null:                       -23117.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -7.6373      0.278    -27.437      0.000      -8.183      -7.092
danceability              4.1272      0.136     30.328      0.000       3.860       4.394
energy  

The updated dataframe, df_fin_3, is set below and prepared for 5 fold cross validation:

In [19]:
df_fin_3=pd.concat([be[0],y],axis=1)
df_fin_3.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,chorus_hit,...,key_full_C Major,key_full_C# Major,key_full_D Minor,key_full_E Major,key_full_E Minor,key_full_Eb Major,key_full_F Major,key_full_F# Major,key_full_G# Major,target
8930,0.578,0.87,0.808178,0.0323,0.00268,0.412,0.0346,0.667,0.590638,0.047565,...,0,0,0,0,0,0,0,0,0,1
13826,0.53,0.0645,0.554182,0.0451,0.974,0.31,0.116,0.574,0.277179,0.10053,...,0,0,0,0,0,0,0,0,0,0
2797,0.301,0.97,0.862407,0.142,5.2e-05,0.715,0.595,0.211,0.480603,0.12663,...,0,0,0,0,0,0,0,0,1,0
5131,0.483,0.186,0.517558,0.0409,0.939,0.873,0.115,0.498,0.650126,0.052901,...,0,0,0,0,0,0,0,0,0,0
30312,0.138,0.712,0.829896,0.069,0.746,0.773,0.0714,0.133,0.371262,0.073774,...,1,0,0,0,0,0,0,0,0,0


In [20]:
xx=fivefoldcv(df_fin_3)

Modeling our modified dataframe with logistic regression:

In [21]:
err_lrt=[]
fp_lrt=[]
fn_lrt=[]
for i in range(len(xx)):
    df_train=xx[i][0]
    df_test=xx[i][1]
    X_train=df_train.iloc[:,:-1]
    y_train=df_train.iloc[:,-1]
    X_test=df_test.iloc[:,:-1]
    y_test=df_test.iloc[:,-1]
    sm_model = sm.Logit(y_train.astype(float), sm.add_constant(X_train).astype(float)).fit(disp=0,maxiter=3000)
    y_pred=np.around(sm_model.predict(sm.add_constant(X_test)))
    count=0
    fps=0
    fns=0
    for i in range(len(y_pred)):
        if(y_pred.iloc[i]!=y_test.iloc[i]):
            count+=1
            if(y_pred.iloc[i]==0):
                fns+=1
            else:
                fps+=1


    err=count/len(y_pred)
    fpp=fps/len(y_test[y_test==0])
    fnn=fns/len(y_test[y_test==1])
    err_lrt.append(err)
    fp_lrt.append(fpp)
    fn_lrt.append(fnn)

errs.append(err_lrt)
fp_all.append(fp_lrt)
fn_all.append(fn_lrt)


For the remaining models, a validation set will be extracted for tuning purposes consiting of 10% of the data. The remaining 90% is split into 5 folds.

In [22]:
df_fin_4=df_fin_3.sample(frac=1).reset_index(drop=True)

# Determining the size of the 10% for our validation set:

tenp=np.round(len(df_fin_4)/10,0).astype(int)

# df_v: validation set
# df_t: test/train sets

df_v=df_fin_4.iloc[:tenp,:]
df_t=df_fin_4.iloc[tenp:,:]
xx=fivefoldcv(df_t)

Tuning the alpha for DecisionTreeClassifier:

In [23]:

mine=1000
bestalpha=0
for alph in np.arange(0,0.1,0.0001):
    Xv=df_v.iloc[:,:-1]
    yv=df_v.iloc[:,-1]
    clf=tree.DecisionTreeClassifier(ccp_alpha=alph)
    clf=clf.fit(Xv.astype(float),yv.astype(float))
    y_pred=clf.predict(Xv)
    count=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=yv.iloc[i]):
            count+=1

    err=count/len(y_pred)
    if err<=mine:
        mine=err
        bestalpha=alph
    else:
        break


In [24]:
bestalpha

0.0001

Modeling our data with the tuned Decision Tree model:

In [25]:
err_t=[]
fp_t=[]
fn_t=[]

for i in range(len(xx)):
    df_train=xx[i][0]
    df_test=xx[i][1]
    X_train=df_train.iloc[:,:-1]
    y_train=df_train.iloc[:,-1]
    X_test=df_test.iloc[:,:-1]
    y_test=df_test.iloc[:,-1]
    clf=tree.DecisionTreeClassifier(ccp_alpha=bestalpha)
    clf=clf.fit(X_train.astype(float),y_train.astype(float))
    y_pred=clf.predict(X_test)
    count=0
    fps=0
    fns=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=y_test.iloc[i]):
            count+=1
            if(y_pred[i]==0):
                fns+=1
            else:
                fps+=1


    err=count/len(y_pred)
    fpp=fps/len(y_test[y_test==0])
    fnn=fns/len(y_test[y_test==1])
    err_t.append(err)
    fp_t.append(fpp)
    fn_t.append(fnn)

errs.append(err_t)
fp_all.append(fp_t)
fn_all.append(fn_t)


Tuning the n for KNeighborsClassifier:

In [26]:

mine=1000
bestn=5
for n in range(5,16,2):
    Xv=df_v.iloc[:,:-1]
    yv=df_v.iloc[:,-1]
    clf=KNeighborsClassifier(n_neighbors=n)
    clf=clf.fit(Xv.astype(float),yv.astype(float))
    y_pred=clf.predict(Xv)
    count=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=yv.iloc[i]):
            count+=1

    err=count/len(y_pred)
    if err<mine:
        mine=err
        bestn=n



In [27]:
bestn

5

Modeling our data with the tuned KNN model:

In [28]:
err_k=[]
fp_k=[]
fn_k=[]
for i in range(len(xx)):
    df_train=xx[i][0]
    df_test=xx[i][1]
    X_train=df_train.iloc[:,:-1]
    y_train=df_train.iloc[:,-1]
    X_test=df_test.iloc[:,:-1]
    y_test=df_test.iloc[:,-1]
    clf=KNeighborsClassifier(n_neighbors=bestn)
    clf=clf.fit(X_train.astype(float),y_train.astype(float))
    y_pred=clf.predict(X_test)
    count=0
    fps=0
    fns=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=y_test.iloc[i]):
            count+=1
            if(y_pred[i]==0):
                fns+=1
            else:
                fps+=1


    err=count/len(y_pred)
    fpp=fps/len(y_test[y_test==0])
    fnn=fns/len(y_test[y_test==1])
    err_k.append(err)
    fp_k.append(fpp)
    fn_k.append(fnn)

errs.append(err_k)
fp_all.append(fp_k)
fn_all.append(fn_k)


Tuning the hidden layer sizes for MLPClassifier:

In [29]:

mine=1000
bestnhd=[1,1]
for n1 in range(1,4):
    for n2 in range(1,5):
        Xv=df_v.iloc[:,:-1]
        yv=df_v.iloc[:,-1]
        clf=MLPClassifier(hidden_layer_sizes=(n1, n2), random_state=1,max_iter=1000)
        clf=clf.fit(Xv.astype(float),yv.astype(float))
        y_pred=clf.predict(Xv)
        count=0
        for i in range(len(y_pred)):
            if(y_pred[i]!=yv.iloc[i]):
                count+=1

        err=count/len(y_pred)
        if err<mine:
            mine=err
            bestnhd=[n1,n2]


In [30]:
bestnhd

[3, 4]

Modeling our data with the tuned MLPClassifier:

In [31]:
err_n=[]
fp_n=[]
fn_n=[]
for i in range(len(xx)):
    df_train=xx[i][0]
    df_test=xx[i][1]
    X_train=df_train.iloc[:,:-1]
    y_train=df_train.iloc[:,-1]
    X_test=df_test.iloc[:,:-1]
    y_test=df_test.iloc[:,-1]
    clf=MLPClassifier(hidden_layer_sizes=(bestnhd[0], bestnhd[1]), random_state=1,max_iter=1000)
    clf=clf.fit(X_train.astype(float),y_train.astype(float))
    y_pred=clf.predict(X_test)
    count=0
    fps=0
    fns=0
    for i in range(len(y_pred)):
        if(y_pred[i]!=y_test.iloc[i]):
            count+=1
            if(y_pred[i]==0):
                fns+=1
            else:
                fps+=1


    err=count/len(y_pred)
    fpp=fps/len(y_test[y_test==0])
    fnn=fns/len(y_test[y_test==1])
    err_n.append(err)
    fp_n.append(fpp)
    fn_n.append(fnn)

errs.append(err_n)
fp_all.append(fp_n)
fn_all.append(fn_n)


Preparing the performance summary sheet:

In [32]:
df_sum=[]
for i in range(len(mds)):
    ent=[]
    ent.append(mds[i])
    ent.append('Error')
    for j in range(len(errs[i])):
        ent.append(errs[i][j])
    ent.append(np.mean(errs[i]))
    df_sum.append(ent)
    ent=[]
    ent.append(mds[i])
    ent.append('False Positives')
    for j in range(len(fp_all[i])):
        ent.append(fp_all[i][j])
    ent.append(np.mean(fp_all[i]))
    df_sum.append(ent)
    ent=[]
    ent.append(mds[i])
    ent.append('False Negatives')
    for j in range(len(fn_all[i])):
        ent.append(fn_all[i][j])
    ent.append(np.mean(fn_all[i]))
    df_sum.append(ent)    


In [33]:
df_summary=pd.DataFrame(data=df_sum,columns=['Model','Measure','Fold 1','Fold 2','Fold 3','Fold 4','Fold 5','Mean'])

In [34]:
df_summary

Unnamed: 0,Model,Measure,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean
0,Logistic Regression,Error,0.172538,0.170139,0.177511,0.166392,0.173287,0.171974
1,Logistic Regression,False Positives,0.229371,0.232407,0.235381,0.225045,0.233942,0.231229
2,Logistic Regression,False Negatives,0.114967,0.106111,0.117844,0.107368,0.110467,0.111351
3,Logistic Regression-BE,Error,0.174037,0.16894,0.17961,0.165492,0.174337,0.172483
4,Logistic Regression-BE,False Positives,0.229371,0.231815,0.237448,0.223252,0.237773,0.231932
5,Logistic Regression-BE,False Negatives,0.117984,0.104287,0.119976,0.107368,0.108636,0.11165
6,Decision Tree,Error,0.18571,0.189041,0.178244,0.189707,0.196536,0.187847
7,Decision Tree,False Positives,0.204545,0.201907,0.195122,0.19377,0.219038,0.202877
8,Decision Tree,False Negatives,0.166999,0.175835,0.160519,0.185511,0.173518,0.172476
9,K-NN,Error,0.177215,0.184877,0.174746,0.174384,0.187542,0.179753
