# Stacking Algorithm

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.metrics import classification_report
from statistics import mean
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score, accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [2]:
def _read_trimmed(csv_path):
    """Read ``csv_path`` and return it without its last 17 rows and without 'Unnamed: 0'.

    NOTE(review): why the trailing 17 rows are discarded is not visible in this
    notebook — confirm against the data files.
    """
    frame = pd.read_csv(csv_path)
    # Drop the trailing rows by index label, then the leftover index column.
    frame = frame.drop(index=frame.tail(17).index)
    return frame.drop(columns='Unnamed: 0')


# The same cleaning is applied to each of the three CSV files.
hp_oHe = _read_trimmed('HP_OHE_3class.csv')
hp_ME = _read_trimmed("harryPotterClean.csv")
hp_OE = _read_trimmed("harryPotterCleanOE.csv")

In [3]:
def getXandY(df, target='HP_Forbidden_clean', n_tail=20):
    """Split a frame into features and target after discarding its last rows.

    The input frame is left untouched: earlier versions dropped the rows in
    place, so every additional call on the same frame silently removed
    another 20 rows.

    Args:
        df: pandas DataFrame that contains the ``target`` column.
        target: name of the label column (default 'HP_Forbidden_clean').
        n_tail: number of trailing rows to discard before splitting
            (default 20; 0 keeps every row).

    Returns:
        Tuple ``(x, y)``: ``x`` is the trimmed frame without ``target``,
        ``y`` is the trimmed ``target`` column.
    """
    # drop() without inplace returns a new frame, so the caller's data survives.
    trimmed = df.drop(df.tail(n_tail).index)
    x = trimmed.drop([target], axis=1)
    y = trimmed[target]
    return (x, y)

def trainTest(x, y, test_size=0.30, random_state=None):
    """Shuffle and split features/labels into train and test parts.

    Args:
        x: feature table.
        y: labels aligned with ``x``.
        test_size: fraction of the rows held out for testing (default 0.30).
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the split (and the scores derived from it) reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=random_state)
    return (X_train, X_test, y_train, y_test)

# Oversample the training split with the given resampler
def overSampling(X_train, y_train, y_test, method):
    """Return ``method.fit_resample(X_train, y_train)`` as an ``(X, y)`` tuple.

    ``y_test`` is accepted for call compatibility but is not used here.
    """
    resampled_X, resampled_y = method.fit_resample(X_train, y_train)
    return (resampled_X, resampled_y)

# Shared SMOTE oversampler with a fixed random_state (42); later code passes it to overSampling.
smote = SMOTE(random_state=42)


def testModel(df,var_order,n_vars,n_loops,method):
    highest = 0
    for j in tqdm(range(1,n_vars)):
        #split our dataframe into X and Y
        x,y=getXandY(df)
        #create the lists to store metrics
        acc = []
        rec = []
        preci = []
        f1 = []
        for i in range(n_loops):
            #split the dataFrame into test and train
            X_train, X_test, y_train, y_test = trainTest(x,y)
            #Oversample the train dataset with SMOTE
            X_train_os, y_train_os=overSampling(X_train, y_train, y_test, smote)
            #define the variables order 
            X_train_os_r = X_train_os[var_order]
            X_test_r = X_test[var_order]
            df1= X_train_os_r.iloc[:, 0:j] #use only part of the variables
            
            #create and train decision trees
            rnd_clf = RandomForestClassifier(n_jobs=-1)
            rnd_clf.fit(df1, y_train_os)
        
            y_pred=rnd_clf.predict(X_test_r.iloc[:, 0:j])
            ac=metrics.accuracy_score(y_test, y_pred)
            acc.append(ac)
            p=metrics.precision_score(y_test, y_pred,average='macro')
            preci.append(p)
            r=metrics.recall_score(y_test, y_pred,average='macro')
            rec.append(r)
            f=metrics.f1_score(y_test, y_pred, average='macro')
            f1.append(f)
        print(df1.columns)
        print("For {} features: \n Accuracy: {} \n Precision: {} \n Recall: {} \n F1 score: {}".format(
        j,mean(acc),mean(preci),mean(rec),mean(f1)))
        
        if mean(acc)>highest:
            highest = mean(acc)
            best = "best accuracy = {}, with {} features, with {}".format(mean(acc),j,method)
        print(best)
        #print(classification_report(y_test, y_pred))
    print(best)
        
def analizeDF(df,order,n_vars,n_loops,methods=None):
    """Run ``testModel`` once for every variable ordering in ``order``.

    Args:
        df: frame forwarded to ``testModel``.
        order: sequence of variable orderings (each one a list of columns).
        n_vars: forwarded to ``testModel``.
        n_loops: forwarded to ``testModel``.
        methods: labels for the orderings, aligned with ``order``. When
            omitted, the module-level ``method`` list is used, as before.
            NOTE(review): that global is not defined in the visible part of
            this notebook — prefer passing ``methods`` explicitly.

    Returns:
        None. Progress and results are printed.

    Raises:
        ValueError: if there are fewer labels than orderings.
    """
    if methods is None:
        # Backward compatible fallback to the global list of method labels.
        methods = method
    # Fail before any expensive model run instead of part-way through.
    if len(methods) < len(order):
        raise ValueError("got {} method labels for {} variable orderings".format(
            len(methods), len(order)))
    for label, var_order in zip(methods, order):
        print('------------------------- Analyzing method {} -------------------------'.format(label))
        print('The variable order is: \n {}'.format(var_order))
        testModel(df,var_order,n_vars,n_loops,label)
        print('\n \n')

In [12]:
# Base learners of the stack, each built with its default hyperparameters.
estimators = [
    (label, factory())
    for label, factory in (
        ('rf', RandomForestClassifier),
        ('dt', DecisionTreeClassifier),
        ('gbc', GradientBoostingClassifier),
        ('abc', AdaBoostClassifier),
        ('svm', SVC),
    )
]

Test 1 Final Estimator with Logistic Regression

In [7]:
# Build features/labels from the one-hot encoded data. A copy is passed so
# that re-running this cell always starts from the same hp_oHe, regardless of
# how getXandY treats its argument.
x,y=getXandY(hp_oHe.copy())
# split the DataFrame into train and test
X_train, X_test, y_train, y_test = trainTest(x,y)
# oversample the training split only with SMOTE
X_train_os, y_train_os=overSampling(X_train, y_train, y_test, smote)

In [14]:
# Stack the base learners under a logistic-regression final estimator.
# (The keyword previously referenced the misspelled, undefined name
# `esiitimators`, which raises NameError.)
stk_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression())

In [15]:
# Fit the stack on the SMOTE-oversampled training split.
stk_clf.fit(X_train_os, y_train_os)

StackingClassifier(estimators=[('rf', RandomForestClassifier()),
                               ('dt', DecisionTreeClassifier()),
                               ('gbc', GradientBoostingClassifier()),
                               ('abc', AdaBoostClassifier()), ('svm', SVC())],
                   final_estimator=LogisticRegression())

In [16]:
# Predict on the held-out test split (which was not oversampled) and report accuracy.
y_pred = stk_clf.predict(X_test)
ac = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac)

0.87481434330575


Test 2 Final Estimator with SVC

In [4]:
def testStaking(final):
    stk_clf = StackingClassifier(estimators=estimators, final_estimator=final)
    stk_clf.fit(X_train_os, y_train_os)
    y_pred=stk_clf.predict(X_test)
    ac=metrics.accuracy_score(y_test, y_pred)
    print(ac)

In [19]:
# Stack with a default SVC as final estimator; testStaking prints the test accuracy.
testStaking(SVC())

0.8754508805431784


Test 3 Final Estimator with random forest

In [20]:
# Stack with a default random forest as final estimator; testStaking prints the test accuracy.
testStaking(RandomForestClassifier())

0.8735412688308932


Test 4 Final Estimator with decision trees

In [22]:
# Stack with a default decision tree as final estimator; testStaking prints the test accuracy.
testStaking(DecisionTreeClassifier())

0.8357733927434755


Test 5 Final Estimator with AdaBoost

In [23]:
# Stack with a default AdaBoost classifier as final estimator; testStaking prints the test accuracy.
testStaking(AdaBoostClassifier())

0.8697220454063229


Test 6 Final Estimator with GBC

In [24]:
# Stack with a default gradient-boosting classifier as final estimator; testStaking prints the test accuracy.
testStaking(GradientBoostingClassifier())

0.87481434330575


Tuning hyperparameters of each estimator

In [5]:
# Base learners with hand-picked hyperparameters. Rebinding `estimators`
# changes what testStaking uses from here on, because it reads the name at
# call time.
estimators = [
    (
        'rf',
        RandomForestClassifier(
            n_estimators=200,
            max_depth=50,
            min_samples_split=2,
            min_samples_leaf=1,
        ),
    ),
    (
        'dt',
        DecisionTreeClassifier(
            criterion='entropy',
            splitter='random',
            max_depth=500,
            min_samples_split=2,
        ),
    ),
    (
        'gbc',
        GradientBoostingClassifier(
            n_estimators=500,
            learning_rate=1,
            max_depth=None,
        ),
    ),
    (
        'abc',
        # the first positional argument is the estimator being boosted
        AdaBoostClassifier(
            RandomForestClassifier(),
            n_estimators=100,
            learning_rate=0.9,
        ),
    ),
    (
        'svm',
        SVC(
            kernel="poly",
            degree=7,
            coef0=1,
            C=1000,
        ),
    ),
]

In [None]:
# Final estimator uses the same polynomial-kernel SVC settings as the 'svm'
# base learner in the tuned estimator list. This cell has no recorded output.
testStaking(SVC(kernel="poly", degree=7, coef0=1, C=1000))