# Voting Classifiers

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.metrics import classification_report
from tqdm import tqdm
from statistics import mean
import math
from statistics import mean
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score, accuracy_score
from timeit import default_timer as timer
from datetime import timedelta

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC

In [3]:
# Load the three encodings of the dataset. For every frame the last 17 rows
# (selected by index label) and the 'Unnamed: 0' column are removed.
hp_oHe = pd.read_csv('HP_OHE_3class.csv')
hp_oHe = hp_oHe.drop(index=hp_oHe.tail(17).index).drop(columns='Unnamed: 0')

hp_ME = pd.read_csv("harryPotterClean.csv")
hp_ME = hp_ME.drop(index=hp_ME.tail(17).index).drop(columns='Unnamed: 0')

hp_OE = pd.read_csv("harryPotterCleanOE.csv")
hp_OE = hp_OE.drop(index=hp_OE.tail(17).index).drop(columns='Unnamed: 0')

In [4]:
def getXandY(df):
    """Split *df* into a feature frame and the target column.

    The last 20 rows of *df* (selected by index label) are left out of the
    result. The caller's frame is not modified, so calling this function
    repeatedly on the same frame always yields the same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'HP_Forbidden_clean' column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the trimmed frame without the
        'HP_Forbidden_clean' column and ``y`` is that column.
    """
    # Non-inplace drop: an in-place drop would remove 20 more rows from the
    # caller's frame on every call.
    trimmed = df.drop(df.tail(20).index)
    x = trimmed.drop(['HP_Forbidden_clean'], axis=1)
    y = trimmed['HP_Forbidden_clean']
    return (x, y)

def trainTest(x,y):
    """Shuffle and split features and target into 70% train / 30% test.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(x, y, test_size=0.30, shuffle=True)
    X_train, X_test, y_train, y_test = parts
    return (X_train, X_test, y_train, y_test)

# Oversampling helper
def overSampling(X_train, y_train, y_test, method):
    """Resample the training data with *method*.

    *method* is any object exposing ``fit_resample(X, y)`` that returns a
    pair. ``y_test`` is accepted but not used. Returns the resampled
    ``(X_train_os, y_train_os)`` tuple.
    """
    resampled = method.fit_resample(X_train, y_train)
    X_train_os, y_train_os = resampled
    return (X_train_os, y_train_os)

# Module-level SMOTE oversampler (fixed random_state=42) that the functions
# in this file pass to overSampling.
smote = SMOTE(random_state=42)


def testModel(df,var_order,n_vars,n_loops,method):
    """Evaluate a random forest on growing prefixes of a feature ranking.

    For every prefix length ``j`` in ``range(1, n_vars)`` the first ``j``
    columns of ``var_order`` are used to train a ``RandomForestClassifier``
    on a SMOTE-oversampled 70/30 split. This is repeated ``n_loops`` times
    and the mean accuracy, macro precision, macro recall and macro F1 are
    printed, followed by the best mean accuracy seen so far.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``getXandY`` to obtain features and target.
    var_order : list of str
        Column names, in the order in which they are added.
    n_vars : int
        Exclusive upper bound of the prefix length.
    n_loops : int
        Number of random splits averaged per prefix length (at least 1).
    method : str
        Label of the ranking; only used in the printed summary.

    Raises
    ------
    ValueError
        If ``n_loops`` is smaller than 1 (there would be nothing to average).
    """
    if n_loops < 1:
        raise ValueError("n_loops must be at least 1")

    # The feature/target split does not depend on j, so it is done once;
    # every prefix length is then evaluated on the same rows.
    x, y = getXandY(df)

    highest = 0
    # Defined up front so the summary can always be printed, even when no
    # prefix scores above 0 or the loop below does not run.
    best = "no configuration scored above 0 with {}".format(method)
    for j in tqdm(range(1, n_vars)):
        # metrics collected over the n_loops repetitions
        acc = []
        rec = []
        preci = []
        f1 = []
        for _ in range(n_loops):
            # split the DataFrame into test and train
            X_train, X_test, y_train, y_test = trainTest(x, y)
            # oversample the train dataset with SMOTE
            X_train_os, y_train_os = overSampling(X_train, y_train, y_test, smote)
            # order the columns by the ranking and keep the first j of them
            train_subset = X_train_os[var_order].iloc[:, 0:j]
            test_subset = X_test[var_order].iloc[:, 0:j]

            # create and train the random forest
            rnd_clf = RandomForestClassifier(n_jobs=-1)
            rnd_clf.fit(train_subset, y_train_os)

            y_pred = rnd_clf.predict(test_subset)
            acc.append(metrics.accuracy_score(y_test, y_pred))
            preci.append(metrics.precision_score(y_test, y_pred, average='macro'))
            rec.append(metrics.recall_score(y_test, y_pred, average='macro'))
            f1.append(metrics.f1_score(y_test, y_pred, average='macro'))

        mean_acc = mean(acc)
        print(train_subset.columns)
        print("For {} features: \n Accuracy: {} \n Precision: {} \n Recall: {} \n F1 score: {}".format(
            j, mean_acc, mean(preci), mean(rec), mean(f1)))

        if mean_acc > highest:
            highest = mean_acc
            best = "best accuracy = {}, with {} features, with {}".format(mean_acc, j, method)
        print(best)
    print(best)
        
def analizeDF(df,order,n_vars,n_loops):
    """Run testModel once for every feature ranking in *order*.

    The label printed for (and passed along with) each ranking is the entry
    at the same position of the module-level ``method`` list.
    """
    for position, ranking in enumerate(order):
        label = method[position]
        print('------------------------- Analyzing method {} -------------------------'.format(label))
        print('The variable order is: \n {}'.format(ranking))
        testModel(df, ranking, n_vars, n_loops, method[position])
        print('\n \n')

## Creating the Voting Classifier

In [32]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Individual learners. svm_clf is created here but is not one of the
# estimators handed to the voting ensemble below.
rnd_clf = RandomForestClassifier()
dt_clf = DecisionTreeClassifier()
svm_clf = SVC(C=50, coef0=1, degree=5, kernel="poly")

# Hard-voting ensemble over the decision tree and the random forest.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[('dt', dt_clf), ('rf', rnd_clf)],
)

def testVoting(df):
    """Fit the decision tree, the random forest and the voting ensemble on
    one SMOTE-oversampled 70/30 split of *df* and print each test accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``getXandY`` to obtain features and target.
    """
    x, y = getXandY(df)
    # split the DataFrame into test and train
    X_train, X_test, y_train, y_test = trainTest(x, y)
    # oversample the train dataset with SMOTE
    X_train_os, y_train_os = overSampling(X_train, y_train, y_test, smote)

    # Every model, the ensemble included, is fitted exactly once here; a
    # separate fit of voting_clf before the loop would only be overwritten.
    for clf in (dt_clf, rnd_clf, voting_clf):
        clf.fit(X_train_os, y_train_os)
        y_pred = clf.predict(X_test)
        print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

## One hot encoding

In [14]:
# Variable order in one hot encoding: one list of column names per
# feature-ranking method, in the order the features are added by testModel.
pear_corrO = ['temperature',	'holiday',	'day',	'month',	'pressure',	'dayOfTheWeek',	'shower rain',	'broken clouds',	'fog',	'overcast clouds',	'heavy intensity rain',	'minute',	'haze',	'thunderstorm with light rain',	'scattered clouds',	'clear sky',	'mist',	'light intensity drizzle',	'few clouds',	'thunderstorm',	'very heavy rain',	'moderate rain',	'thunderstorm with rain',	'year',	'light rain',	'humidity',	'hour',	'Pandemic']
kend_corrO = ['temperature',	'holiday',	'month',	'day',	'pressure',	'dayOfTheWeek',	'shower rain',	'broken clouds',	'heavy intensity rain',	'fog',	'year',	'overcast clouds',	'minute',	'haze',	'thunderstorm with light rain',	'mist',	'light intensity drizzle',	'scattered clouds',	'clear sky',	'few clouds',	'thunderstorm',	'very heavy rain',	'moderate rain',	'thunderstorm with rain',	'light rain',	'humidity',	'hour',	'Pandemic']
# mutInf_classO and mutInf_regO hold the same names in the same order.
mutInf_classO = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'Pandemic',	'temperature',	'humidity',	'pressure',	'heavy intensity rain',	'light rain',	'broken clouds',	'moderate rain',	'mist',	'overcast clouds',	'clear sky',	'scattered clouds',	'thunderstorm with rain',	'few clouds',	'thunderstorm',	'shower rain',	'very heavy rain',	'fog',	'haze',	'thunderstorm with light rain',	'light intensity drizzle']
mutInf_regO = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'Pandemic',	'temperature',	'humidity',	'pressure',	'heavy intensity rain',	'light rain',	'broken clouds',	'moderate rain',	'mist',	'overcast clouds',	'clear sky',	'scattered clouds',	'thunderstorm with rain',	'few clouds',	'thunderstorm',	'shower rain',	'very heavy rain',	'fog',	'haze',	'thunderstorm with light rain',	'light intensity drizzle']
mutInf_class2O = ['day',	'temperature',	'month',	'humidity',	'hour',	'pressure','dayOfTheWeek',	'year',	'holiday',	'shower rain',	'light rain',	'thunderstorm',	'fog',	'broken clouds',	'Pandemic',	'thunderstorm with rain',	'light intensity drizzle',	'thunderstorm with light rain',	'heavy intensity rain',	'mist',	'scattered clouds',	'very heavy rain',	'overcast clouds',	'moderate rain',	'minute',	'haze',	'few clouds',	'clear sky']
# varThreO only has 18 entries, so prefix lengths above 18 select the same columns.
varThreO = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'temperature',	'humidity',	'pressure',	'heavy intensity rain',	'light rain',	'broken clouds',	'scattered clouds',	'thunderstorm with rain',	'few clouds',	'thunderstorm',	'shower rain',]
# NOTE(review): 'heavy intensity rain', 'scattered clouds', 'minute' and
# 'few clouds' each appear twice in mrmrO - confirm whether that is intended.
mrmrO = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'temperature',	'humidity',	'pressure',	'heavy intensity rain',	'light rain',	'broken clouds',	'scattered clouds',	'thunderstorm with rain',	'few clouds',	'thunderstorm',	'shower rain',	'heavy intensity rain',	'mist',	'scattered clouds',	'very heavy rain',	'overcast clouds',	'moderate rain',	'minute',	'haze',	'few clouds']

# Rankings in the same order as the labels in `method` below.
orderOHE=[pear_corrO,kend_corrO,mutInf_classO,mutInf_regO,mutInf_class2O,varThreO,mrmrO]

# Labels read by analizeDF through the module-level name `method`.
method = ['Pearson_correlation','Kendalls_correlation','mutualInformation_classification',
          'mutualInformation_reggression','mutualInformation_classification2',
          'varianceThreshold','MRMR']

In [33]:
# Print the test accuracy of the tree, the forest and the voting ensemble
# on the one-hot-encoded frame.
testVoting(hp_oHe)

DecisionTreeClassifier 0.8385784628559195
RandomForestClassifier 0.8679083707985442
VotingClassifier 0.850139156497538


In [54]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over decision trees: 300 estimators, learning rate 0.5.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=300,
    learning_rate=0.5,
    algorithm="SAMME.R",
)

# Features/target, then a train/test split, then SMOTE on the train part only.
x, y = getXandY(hp_oHe)
X_train, X_test, y_train, y_test = trainTest(x, y)
X_train_os, y_train_os = overSampling(X_train, y_train, y_test, smote)

ada_clf.fit(X_train_os, y_train_os)
y_pred = ada_clf.predict(X_test)

# Accuracy on the untouched test part.
print(accuracy_score(y_test, y_pred))

0.8512179065174457


In [None]:
# Evaluate every one-hot ranking with 20 repetitions per prefix length.
# NOTE(review): testModel iterates range(1, n_vars), so n_vars=28 tests
# prefixes of 1..27 features; the 28th entry of the 28-name rankings is
# never included - confirm whether 29 was meant.
analizeDF(hp_oHe,orderOHE,28,n_loops=20)

## Manual Encoding

In [23]:
# Bucket the raw 'Harry_Potter_and_the_Forbidden' values into three classes:
# 5..30 -> 1, 35..60 -> 2, 65..180 -> 3 (values not listed are left as they are).
b=hp_ME.Harry_Potter_and_the_Forbidden.replace([
5, 10, 11, 15, 20, 25, 30, 35, 40, 45,50.0,55.0,60.0,65.0,70.0,75.0,80.0,85.0,90.0,95.0,100.0,105.0,110.0,115.0,120.0,125.0,130.0,135.0,145.0,150.0,180.0], 
[1,1,  1,  1,  1,  1,  1,  2,  2,  2, 2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,    3,    3,    3,     3,    3,   3,    3,     3,    3,    3])

# Turn the bucketed series into a one-column frame named 'HP_Forbidden_clean',
# append it to hp_ME and drop the raw column it was derived from.
df3=pd.DataFrame(b)
df3.rename(columns = {'Harry_Potter_and_the_Forbidden':'HP_Forbidden_clean'}, inplace = True)
hp_bis3=pd.concat([hp_ME, df3], axis=1)
hp3 = hp_bis3.drop('Harry_Potter_and_the_Forbidden',axis=1)
hp3.rename(columns = {'day.1':'dayOfTheWeek'}, inplace = True)
hp3 = hp3[hp3.HP_Forbidden_clean != 0] # delete rows whose target is 0 (0 min)
# Show the distinct class labels that remain.
hp3.HP_Forbidden_clean.unique()

array([2., 1., 3.])

In [24]:
# HP_ME is a second name for the same DataFrame object as hp3 (no copy is made).
HP_ME = hp3

# Variable order for the manually encoded data, one list per ranking method.
pear_corr = ['temperature','holiday','day','month','pressure','dayOfTheWeek',
            'report','minute','year','humidity','hour',	'Pandemic']
kend_corr = ['temperature','holiday','month','day','pressure','dayOfTheWeek','report','year',
             'minute','humidity','hour','Pandemic']
mutInf_class = ['month','day','year','hour','minute','holiday','dayOfTheWeek','Pandemic',
                'temperature','humidity','pressure','report']
mutInf_reg = ['month','day','year','hour','minute','holiday','dayOfTheWeek','Pandemic',
                'temperature','humidity','pressure','report']
mutInf_class2 = ['day','temperature','month','humidity','hour','dayOfTheWeek','pressure',
                 'holiday','year','report','minute','Pandemic']
# varThre and mrmr list 11 names ('Pandemic' is absent); the others list 12.
varThre = ['month','day','year','hour','minute','holiday','dayOfTheWeek',
           'temperature','humidity','pressure','report']
mrmr = ['temperature','dayOfTheWeek','hour','holiday','humidity','day',
        'year','month','pressure','minute','report']
# Rebinds the module-level `method` labels read by analizeDF; the sixth label
# here is 'variableThreshold'.
method = ['Pearson_correlation','Kendalls_correlation','mutualInformation_classification',
          'mutualInformation_reggression','mutualInformation_classification2',
          'variableThreshold','MRMR']

# Rankings in the same order as the labels in `method`.
orderME=[pear_corr,kend_corr,mutInf_class,mutInf_reg,mutInf_class2,varThre,mrmr]

In [34]:
# Print the test accuracy of the tree, the forest and the voting ensemble
# on the manually encoded frame.
testVoting(HP_ME)

DecisionTreeClassifier 0.8495443949989404
RandomForestClassifier 0.8789997880906972
VotingClassifier 0.8709472345835982


In [None]:
# Evaluate every manual-encoding ranking; n_vars=13 makes testModel try
# prefixes of 1..12 features, with 20 repetitions each.
analizeDF(HP_ME,orderME,13,n_loops=20)

## Ordinal Encoding

In [26]:
# Rename the 'day.1' column to 'dayOfTheWeek' in place.
hp_OE.rename(columns = {'day.1':'dayOfTheWeek'}, inplace = True)

# Bucket the raw 'Harry_Potter_and_the_Forbidden' values into three classes:
# 5..30 -> 1, 35..60 -> 2, 65..180 -> 3 (values not listed are left as they are).
a=hp_OE.Harry_Potter_and_the_Forbidden.replace([
5, 10, 11, 15, 20, 25, 30, 35, 40, 45,50.0,55.0,60.0,65.0,70.0,75.0,80.0,85.0,90.0,95.0,100.0,105.0,110.0,115.0,120.0,125.0,130.0,135.0,145.0,150.0,180.0], 
[1,1,  1,  1,  1,  1,  1,  2,  2,  2, 2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,    3,    3,    3,     3,    3,   3,    3,     3,    3,    3])

# Turn the bucketed series into a one-column frame named 'HP_Forbidden_clean',
# append it to hp_OE and drop the raw column it was derived from.
df1=pd.DataFrame(a)
df1.rename(columns = {'Harry_Potter_and_the_Forbidden':'HP_Forbidden_clean'}, inplace = True)
hp_bis=pd.concat([hp_OE, df1], axis=1)
hp2 = hp_bis.drop('Harry_Potter_and_the_Forbidden',axis=1)
hp2 = hp2[hp2.HP_Forbidden_clean != 0] # delete rows whose target is 0 (0 min)
# The result of this expression is not assigned or used afterwards.
hp2.HP_Forbidden_clean.unique()

# Variable order in ordinal encoding, one list per ranking method.
pear_corrOE = ['temperature',	'holiday',	'day',	'month',	'pressure',	'dayOfTheWeek',	'report',	'minute',	'year',	'humidity',	'hour',	'Pandemic']
kend_corrOE = ['temperature',	'holiday',	'month',	'day',	'pressure',	'dayOfTheWeek',	'report',	'year',	'minute',	'humidity',	'hour',	'Pandemic',]
mutInf_classOE = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'Pandemic',	'temperature',	'humidity',	'pressure',	'report',]
mutInf_regOE = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'Pandemic',	'temperature',	'humidity',	'pressure',	'report',]
mutInf_class2OE = ['day',	'temperature',	'month',	'humidity',	'hour',	'dayOfTheWeek',	'pressure',	'holiday',	'year',	'report',	'minute',	'Pandemic',]
# varThreOE and mrmrOE list 11 names ('Pandemic' is absent); the others list 12.
varThreOE = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'temperature',	'humidity',	'pressure',	'report']
mrmrOE = ['temperature',	'dayOfTheWeek',	'hour',	'holiday',	'humidity',	'day',	'year',	'month',	'pressure',	'minute',	'report']

# Rankings in the order expected by the module-level `method` labels.
orderOE=[pear_corrOE,kend_corrOE,mutInf_classOE,mutInf_regOE,mutInf_class2OE,varThreOE,mrmrOE]

In [35]:
# Print the test accuracy of the tree, the forest and the voting ensemble
# on the ordinal-encoded frame.
testVoting(hp2)

DecisionTreeClassifier 0.8429752066115702
RandomForestClassifier 0.8751854206399661
VotingClassifier 0.8635304089849545


In [None]:
# Evaluate every ordinal-encoding ranking; n_vars=13 makes testModel try
# prefixes of 1..12 features, with 20 repetitions each.
analizeDF(hp2,orderOE,13,n_loops=20)

## Hyperparameter Tuning

In [None]:
# Hyper-parameter grid explored by tuneRandomF (one list per
# RandomForestClassifier argument).
bootstrap1 = [True, False]
max_depth1 = [10, 20, 50, 80, 100, None]
# 'auto' used to be an alias of 'sqrt' for RandomForestClassifier and is no
# longer accepted by recent scikit-learn releases, so listing both either
# fitted the same model twice or failed. 'log2' gives a distinct second option.
max_features1 = ['sqrt', 'log2']
min_samples_leaf1 = [1, 2, 4, 8, 10]
min_samples_split1 = [2, 5, 10]
n_estimators1 = [200, 500, 1000, 1500]

In [None]:
def tuneRandomF(X,Y,mu,bootstrap1,max_depth1,max_features1,min_samples_leaf1,
                min_samples_split1,n_estimators1,loops):
    """Grid-search RandomForestClassifier settings by repeated hold-out.

    Every combination of the six value lists is trained ``loops`` times on a
    fresh SMOTE-oversampled 70/30 split restricted to the columns ``mu``. The
    mean test accuracy and the elapsed time of each combination are printed,
    and the best combination seen so far is printed after all combinations
    for one ``bootstrap1`` value have been evaluated.

    Parameters
    ----------
    X, Y : features and target, as returned by ``getXandY``.
    mu : list of str
        Column names used for training and prediction.
    bootstrap1, max_depth1, max_features1, min_samples_leaf1,
    min_samples_split1, n_estimators1 : list
        Candidate values for the RandomForestClassifier argument of the
        same name (without the trailing ``1``).
    loops : int
        Number of random splits averaged per combination.
    """
    from itertools import product

    highest = 0
    # Defined up front so the summary below can always be printed, even when
    # no combination was evaluated or none scored above 0.
    best_summary = ("---------------Best values = "
                    "none (no configuration scored above 0)---------")
    for boots in bootstrap1:
        # Same iteration order as five nested loops: the last list varies fastest.
        grid = product(max_depth1, max_features1, min_samples_leaf1,
                       min_samples_split1, n_estimators1)
        for depth, feats, leaf, split, n_est in grid:
            # Started once per combination so the printed duration covers
            # all `loops` repetitions, not only the last one.
            start = timer()
            scores = []
            for _ in range(loops):
                X_train, X_test, y_train, y_test = trainTest(X, Y)
                # oversample the train dataset with SMOTE
                X_train_os, y_train_os = overSampling(X_train, y_train, y_test, smote)
                rnd_clf = RandomForestClassifier(bootstrap=boots,
                                                 max_depth=depth,
                                                 max_features=feats,
                                                 min_samples_leaf=leaf,
                                                 min_samples_split=split,
                                                 n_estimators=n_est)
                # restrict both parts to the selected columns
                rnd_clf.fit(X_train_os[mu], y_train_os)
                y_pred = rnd_clf.predict(X_test[mu])
                scores.append(accuracy_score(y_test, y_pred))
            print("For the parameters boots:{}, mx_depth:{}, mx_fts:{} "
                  "min_leaf:{}, min_splt:{}, n_stim {} accuracy is:".format(
                      boots, depth, feats, leaf, split, n_est))
            mean_acc = mean(scores)
            print(mean_acc)
            print(timedelta(seconds=timer() - start))
            # remember the best configuration
            if mean_acc > highest:
                highest = mean_acc
                best_summary = (
                    "---------------Best values = "
                    "boots:{}, mx_depth:{}, mx_fts:{}, min_leaf:{}, "
                    "min_splt:{}, n_stim{} with accuracy: {}---------"
                ).format(boots, depth, feats, leaf, split, n_est, mean_acc)
        print(best_summary)

In [None]:
# Features/target of the one-hot-encoded frame for the grid search.
X,Y=getXandY(hp_oHe)
# Columns used for tuning; this rebinds the module-level name `mrmr`.
# NOTE(review): 'heavy intensity rain' and 'scattered clouds' each appear
# twice in this list - confirm whether that is intended.
mrmr = ['month',	'day',	'year',	'hour',	'minute',	'holiday',	'dayOfTheWeek',	'temperature',	'humidity',	
        'pressure',	'heavy intensity rain',	'light rain',	'broken clouds',	'scattered clouds',	'thunderstorm with rain',
        'few clouds',	'thunderstorm',	'shower rain',	'heavy intensity rain',	'mist',	'scattered clouds']
# loops=1: each parameter combination is scored on a single random split.
tuneRandomF(X,Y,mrmr,bootstrap1,max_depth1,max_features1,min_samples_leaf1,min_samples_split1,n_estimators1,1)
