# How to treat imbalance datasets and do also Cross Validation

In [68]:
import pandas as pd
import numpy as np
import matplotlib as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from tqdm import tqdm
from statistics import mean
import warnings
import math
warnings.filterwarnings('ignore')

In [2]:
hp = pd.read_csv('HP_OHE_3class.csv')
hp = hp.drop('Unnamed: 0',axis=1)

In [3]:
hp.head()

Unnamed: 0,month,day,year,hour,minute,holiday,dayOfTheWeek,Pandemic,temperature,humidity,...,thunderstorm with rain,few clouds,thunderstorm,shower rain,very heavy rain,fog,haze,thunderstorm with light rain,light intensity drizzle,HP_Forbidden_clean
0,0.636364,0.966667,0.0,0.608696,0.4,0.0,0.0,0.0,0.721791,0.740741,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,0.636364,0.966667,0.0,0.608696,0.6,0.0,0.0,0.0,0.721791,0.740741,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,0.636364,0.966667,0.0,0.608696,0.8,0.0,0.0,0.0,0.71194,0.679012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.636364,0.966667,0.0,0.608696,1.0,0.0,0.0,0.0,0.723881,0.679012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,0.636364,0.966667,0.0,0.652174,0.0,0.0,0.0,0.0,0.724179,0.679012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
def getXandY(df):
    df.drop(df.tail(20).index,inplace=True) 
    x = df.drop(['HP_Forbidden_clean'],axis=1)
    y = df.HP_Forbidden_clean
    return(x,y)

def trainTest(x,y):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, shuffle=True)
    return(X_train, X_test, y_train, y_test)

smote = SMOTE(random_state=42)
X,Y=getXandY(hp)
X_train, X_test, y_train, y_test = trainTest(X,Y)
X_train_Smote, y_train_Smote= smote.fit_resample(X_train, y_train)

mrmrO = ['month', 'day', 'year', 'hour', 'minute', 'holiday', 'dayOfTheWeek', 'temperature', 'humidity', 
         'pressure', 'heavy intensity rain', 'light rain', 'broken clouds', 'scattered clouds', 
         'thunderstorm with rain', 'few clouds', 'thunderstorm', 'shower rain', 'heavy intensity rain', 
         'mist', 'scattered clouds']
X_train_Smot_r = X_train_Smote[mrmrO]
X_test_r = X_test[mrmrO]

print(sorted(Counter(y_train_Smote).items()))

[(1.0, 5612), (2.0, 5612), (3.0, 5612)]


In [5]:
from sklearn.model_selection import cross_val_score
logisticRegr = LogisticRegression(max_iter=20000)
scores = cross_val_score(logisticRegr, X_train_Smot_r, y_train_Smote, cv=10)
mean(scores)
#logisticRegr.fit(X_train_Smot_r, y_train_Smote)
#y_pred=logisticRegr.predict(X_test_r)
#print(classification_report(y_test, y_pred))

0.5367685870864577

#### Create the function to oversample only the train part of the fold

In [66]:
from sklearn.metrics import recall_score, accuracy_score
smoter = SMOTE(random_state=42)
scores = []

def manualKFolds(X_mrmr, y, k):
    a= len(X) #lenght of my complete dataframe
    n= math.floor(a/k)
    k=k+1 #sum one because the for starts in 1 and not in zero

    for i in range(1,k):
        if i == 1:
            xtrain_fold = X.iloc[n:-1]
            ytrain_fold = y.iloc[n:-1]
            xtest_fold = X.iloc[:n]
            ytest_fold = y.iloc[:n]
        elif i == k:
            xtrain_fold = X.iloc[:(i-1)*n]
            ytrain_fold = y.iloc[:(i-1)*n]
            xtest_fold = X.iloc[(i-1)*n:-1] 
            ytest_fold = y.iloc[(i-1)*n:-1] 
        else:
            xtrain1_fold = X.iloc[:(i-1)*n,:]
            xtrain2_fold = X.iloc[i*n:-1,:]
            xtrain_fold = pd.concat([xtrain1_fold,xtrain2_fold],axis=0)

            ytrain1_fold = y.iloc[:(i-1)*n]
            ytrain2_fold = y.iloc[i*n:-1]
            ytrain_fold = pd.concat([ytrain1_fold,ytrain2_fold],axis=0)

            xtest_fold = X.iloc[(i-1)*n:i*n]
            ytest_fold = y.iloc[(i-1)*n:i*n]
   
        try:
            xtrain_fold = xtrain_fold.drop('index',axis=1)
            ytrain_fold = ytrain_fold.drop('index',axis=1)
            xtest_fold = xtest_fold.drop('index',axis=1)
            ytest_fold = ytest_fold.drop('index',axis=1)
        except:
            print("")

        # Upsample only the data in the training section
        xtrain_fold_upsample, ytrain_fold_upsample = smoter.fit_resample(xtrain_fold,ytrain_fold)
        # Fit the model on the upsampled training data
        model_obj = logisticRegr.fit(xtrain_fold_upsample, ytrain_fold_upsample)
        # Score the model on the (non-upsampled) validation data
        score = accuracy_score(ytest_fold, model_obj.predict(xtest_fold))
        print(score)
        if i>1:
            scores.append(score)

    print('Mean accuracy of the model: {}'.format(mean(scores)))

In [67]:
X_mrmr =X[mrmrO]

manualKFolds(X_mrmr, Y, k=10)


0.3754764930114358

0.4104193138500635

0.51143583227446

0.4771283354510801

0.613722998729352

0.5292249047013977

0.542566709021601

0.4358322744599746

0.48602287166454894

0.5546378653113088
Mean accuracy of the model: 0.5067767894959763
