In [1]:
!pip install xgboost
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [1]:
def scale_data(data, columns, scaler):
    """Rescale the given columns of ``data`` in place, one column at a time.

    Args:
        data: DataFrame whose columns are overwritten with their scaled values.
        columns: iterable of column labels to rescale.
        scaler: object exposing ``fit_transform``; it is re-fitted on every
            column, so after the call it only holds the fit of the last one.

    Returns:
        The same ``data`` object that was passed in (mutated).
    """
    for name in columns:
        # The scaler is handed a single-feature matrix of shape (n_rows, 1).
        column_matrix = data[name].values.reshape(-1, 1)
        data[name] = scaler.fit_transform(column_matrix)
    return data
  
def one_hot_obj_feature(df, features):
    """Return a copy of ``df`` with the ``features`` columns one-hot encoded.

    The encoding is delegated to ``pd.get_dummies`` with ``sparse=True``, so
    the generated indicator columns are sparse-backed; ``df`` is not modified.
    """
    return pd.get_dummies(df, columns=features, sparse=True)

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import preprocessing
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [18]:
# Load the raw data, draw a 1% training sample and a disjoint test sample
# (2% of the remaining rows), then scale every column except the last one.
all_df = pd.read_csv("MLFinalProjectDataset/train_data.csv")
train_df = all_df.sample(frac=0.01, random_state=42).copy()
test_df = all_df.drop(train_df.index).sample(frac=0.02, random_state=42).copy()

# Not used by the cells shown here; kept so any cell that references it still runs.
min_max_scaler = preprocessing.MinMaxScaler()

# The last column is excluded from scaling (assumed to be the `clicked` target
# that the later cells read -- TODO confirm column order in the CSV).
feature_columns = list(all_df.columns)[:-1]

# Fit the scaler on the training sample only and reuse it for the test sample,
# so both splits share one scale. Fitting a second scaler on the test split
# would map the same raw value to different scaled values in train and test.
scaler = preprocessing.MaxAbsScaler()
train_df[feature_columns] = scaler.fit_transform(train_df[feature_columns])
test_df[feature_columns] = scaler.transform(test_df[feature_columns])

# Later cells refer to the scaled training frame as `df`.
df = train_df

In [4]:
class balance:
    """Bundle a feature set, its labels and a resampler exposing ``fit_resample``."""

    def __init__(self, x, y, model):
        # Stored as-is; nothing is resampled until fit() is called.
        self.x, self.y, self.model = x, y, model

    def fit(self):
        """Run the resampler on the stored data and return whatever it returns."""
        resampled = self.model.fit_resample(self.x, self.y)
        return resampled

class Classifier:
    """Thin wrapper that pairs an estimator with its training data.

    x: training features, handed unchanged to ``model.fit``
    y: training labels, handed unchanged to ``model.fit``
    model: estimator exposing ``fit(x, y)`` and ``predict(t)``
    """

    def __init__(self, x, y, model):
        self.model = model
        self.train_sparse_matrix = x
        self.train_tags = y
        # Flipped to True by fit(); predict() uses it to train lazily.
        self.is_learned = False

    def fit(self):
        """Train the wrapped model on the stored data and mark it as learned."""
        self.model.fit(self.train_sparse_matrix, self.train_tags)
        self.is_learned = True

    def predict(self, t):
        """Return the model's predictions for ``t``, fitting first if needed."""
        if self.is_learned:
            return self.model.predict(t)
        self.fit()
        return self.model.predict(t)
  

In [5]:
from sklearn.model_selection import StratifiedKFold


def four_fold_CV(X, Y, fold_num=4):
    """Split ``X``/``Y`` into ``fold_num`` stratified train/validation folds.

    ``X`` and ``Y`` are indexed with the integer index arrays produced by
    ``StratifiedKFold.split``, so both must support that kind of indexing.
    No shuffle argument is passed to the splitter.

    Returns:
        (train_sets, validation_sets): two lists of length ``fold_num`` whose
        i-th entries are dicts ``{'X': ..., 'Y': ...}`` for fold i.
    """
    train_sets = [None] * fold_num
    validation_sets = [None] * fold_num

    splitter = StratifiedKFold(n_splits=fold_num)
    for fold, (train_idx, valid_idx) in enumerate(splitter.split(X, Y)):
        train_sets[fold] = {'X': X[train_idx], 'Y': Y[train_idx]}
        validation_sets[fold] = {'X': X[valid_idx], 'Y': Y[valid_idx]}

    return train_sets, validation_sets

In [6]:
# Build class-balanced versions of the train and test frames using all features.
# Each result list is flat: [x_rus, y_rus, x_ros, y_ros, x_smote, y_smote, x_nm, y_nm],
# i.e. the arrays for method[k] live at indices 2*k (features) and 2*k + 1 (labels).
method = ["rus", "ros", "smote", "nm"]

balanced_all_fetuers = []
test_balanced_all_fetuers = []
# NOTE(review): the test frame is resampled as well, so scores computed on
# test_balanced_all_fetuers would not reflect the original class ratio -- confirm intent.
for frame, collected in ((df, balanced_all_fetuers), (test_df, test_balanced_all_fetuers)):
    df_x = frame.drop("clicked", axis=1)
    df_y = frame["clicked"]

    # Fresh sampler instances per frame, listed in the same order as `method`.
    samplers = [
        RandomUnderSampler(random_state=42, replacement=True),
        RandomOverSampler(random_state=42),
        SMOTE(random_state=42),  # seeded like the other random samplers so reruns reproduce
        NearMiss(),
    ]
    for sampler in samplers:
        x_res, y_res = balance(df_x, df_y, sampler).fit()
        # Downstream cross-validation indexes with integer arrays, so store NumPy arrays.
        collected.extend([x_res.to_numpy(), y_res.to_numpy()])

In [8]:
# Build class-balanced versions of the train and test frames restricted to the
# hand-picked feature subset below.
# Each result list is flat: [x_rus, y_rus, x_ros, y_ros, x_smote, y_smote, x_nm, y_nm],
# i.e. the arrays for method[k] live at indices 2*k (features) and 2*k + 1 (labels).
features_mask = ['hourOfDay', 'creativeId', 'publisher', 'widgetId', 'device']
method = ["rus", "ros", "smote", "nm"]

balanced = []
test_balanced = []
# NOTE(review): the test frame is resampled as well, so scores computed on
# test_balanced would not reflect the original class ratio -- confirm intent.
for frame, collected in ((df, balanced), (test_df, test_balanced)):
    df_x = frame[features_mask]
    df_y = frame["clicked"]

    # Fresh sampler instances per frame, listed in the same order as `method`.
    samplers = [
        RandomUnderSampler(random_state=42, replacement=True),
        RandomOverSampler(random_state=42),
        SMOTE(random_state=42),  # seeded like the other random samplers so reruns reproduce
        NearMiss(),
    ]
    for sampler in samplers:
        x_res, y_res = balance(df_x, df_y, sampler).fit()
        # Downstream cross-validation indexes with integer arrays, so store NumPy arrays.
        collected.extend([x_res.to_numpy(), y_res.to_numpy()])

In [None]:
# Sweep the SVM penalty C for every balancing method (all features) and score
# each (method, C) pair by the mean F1 over stratified cross-validation folds.
CC = [0.01, 0.1, 1, 10]
folds_num = 3

# accs[m][c] is the mean F1 for method[m] trained with CC[c].
accs = [[0 for i in range(len(CC))] for j in range(4)]
for j in range(0, 8, 2):
    # balanced_all_fetuers is flat: features at index j, labels at index j + 1.
    train_sets, validation_sets = four_fold_CV(balanced_all_fetuers[j], balanced_all_fetuers[j+1], folds_num)
    for (cntC, C) in enumerate(CC):
        avg_acc = 0
        for i in range(folds_num):
            try:
                t_X = train_sets[i]['X']
                t_Y = train_sets[i]['Y']
                v_X = validation_sets[i]['X']
                v_Y = validation_sets[i]['Y']
                clf = Classifier(t_X, t_Y, svm.SVC(C=C))
                clf.fit()
                avg_acc += f1_score(v_Y, clf.predict(v_X))
            except Exception as e:
                # Best effort: a failed fold is reported and contributes 0 to the mean.
                print('fold', i, 'failed for method', method[j//2], 'and c =', C, ':', e)

        accs[j//2][cntC] = avg_acc / folds_num
        # The fold count is taken from folds_num so the message cannot drift from the code.
        print('-> mean f1-score {}-fold cross validation for svm with method'.format(folds_num), method[j//2],
              'and c =', C, 'is:', accs[j//2][cntC])


print(accs)
print("--------------")
print("-> Result of {}-fold cross validation for svm :".format(folds_num))
# Pick the best cell of the grid; max() keeps the first one on ties (row-major order).
candidates = [(accs[m][c], m, c) for m in range(len(accs)) for c in range(len(CC))]
best_score, best_method_idx, best_c_idx = max(candidates, key=lambda item: item[0])
best_C = CC[best_c_idx]
# The reported number is an F1 score, not an accuracy.
print("    Best parameter C is:", best_C, "\n",
      "    with method:", method[best_method_idx], "best f1-score =", best_score)
print()

In [15]:
# Sweep the SVM penalty C for every balancing method on the selected-feature
# sets and score each (method, C) pair by the mean F1 over stratified folds.
CC = [0.01, 0.1, 1, 10]
folds_num = 3

# accs[m][c] is the mean F1 for method[m] trained with CC[c].
accs = [[0 for i in range(len(CC))] for j in range(4)]
for j in range(0, 8, 2):
    # balanced is flat: features at index j, labels at index j + 1.
    train_sets, validation_sets = four_fold_CV(balanced[j], balanced[j+1], folds_num)
    for (cntC, C) in enumerate(CC):
        avg_acc = 0
        for i in range(folds_num):
            try:
                t_X = train_sets[i]['X']
                t_Y = train_sets[i]['Y']
                v_X = validation_sets[i]['X']
                v_Y = validation_sets[i]['Y']
                clf = Classifier(t_X, t_Y, svm.SVC(C=C))
                clf.fit()
                avg_acc += f1_score(v_Y, clf.predict(v_X))
            except Exception as e:
                # Best effort: a failed fold is reported and contributes 0 to the mean.
                print('fold', i, 'failed for method', method[j//2], 'and c =', C, ':', e)

        accs[j//2][cntC] = avg_acc / folds_num
        # The fold count is taken from folds_num so the message cannot drift from the code.
        print('-> mean f1-score {}-fold cross validation for svm with method'.format(folds_num), method[j//2],
              'and c =', C, 'is:', accs[j//2][cntC])


print(accs)
print("--------------")
print("-> Result of {}-fold cross validation for svm with our selected features:".format(folds_num))
# Pick the best cell of the grid; max() keeps the first one on ties (row-major order).
candidates = [(accs[m][c], m, c) for m in range(len(accs)) for c in range(len(CC))]
best_score, best_method_idx, best_c_idx = max(candidates, key=lambda item: item[0])
best_C = CC[best_c_idx]
# The reported number is an F1 score, not an accuracy.
print("    Best parameter C is:", best_C, "\n",
      "    with method:", method[best_method_idx], "best f1-score =", best_score)
print()

[[0.5712708542821483, 0.5852227813039433, 0.5686469184117325, 0.564212325112828], [0.5922745341876631, 0.5680190312690726, 0.5657780601572795, 0.5673122885266936], [0.5783078723312106, 0.5635324095032397, 0.562799734228895, 0.5663391214259098], [0.587207788998157, 0.5721716413198905, 0.5958625613146281, 0.6339787684992702]]
--------------
-> Result of 4-fold cross validation for svm :
    Best parameter C is: 10 
     with method: nm best accuracy = 0.6339787684992702



[[0.5967004636350872, 0.5815487862528559, 0.576551879415649, 0.5832034167891991], [0.5867981358260647, 0.5886376634133571, 0.6215402672305637, 0.6188350982884393], [0.5840365963807864, 0.5776309943579285, 0.6121896182154699, 0.6137464488311024], [0.5810374724881552, 0.5895009359360274, 0.5971509121775429, 0.6156927447631586]]
--------------
-> Result of 4-fold cross validation for svm :
    Best parameter C is: 1 
     with method: ros best accuracy = 0.6215402672305637
     
 [[0.5768446195566149, 0.5892935456419133, 0.5853994730043791, 0.582733900598742], [0.5863684344982306, 0.595372703828982, 0.620554149760623, 0.619595100260067], [0.5852592874068079, 0.5942046945291757, 0.6222750371705171, 0.6259665976908928], [0.555210945176802, 0.5799017394395695, 0.627717621110346, 0.6626145346281342]]
--------------
-> Result of 4-fold cross validation for svm :
    Best parameter C is: 10 
     with method: nm best accuracy = 0.6626145346281342

In [20]:
# Logistic-regression sweep over C on the all-features balanced sets,
# scored by mean F1 across stratified folds.
CC = [0.01, 0.1, 1, 10]
folds_num = 4

# Score table: rows follow `method`, columns follow `CC`.
accs = [[0] * len(CC) for _ in range(4)]

for method_idx in range(4):
    # The balanced list is flat: method k owns slot 2k (features) and 2k + 1 (labels).
    train_sets, validation_sets = four_fold_CV(
        balanced_all_fetuers[2 * method_idx],
        balanced_all_fetuers[2 * method_idx + 1],
        folds_num,
    )

    for c_idx, C in enumerate(CC):
        f1_total = 0
        for fold in range(folds_num):
            try:
                train_fold = train_sets[fold]
                valid_fold = validation_sets[fold]
                clf = Classifier(train_fold['X'], train_fold['Y'],
                                 LogisticRegression(random_state=0, C=C, max_iter=10000))
                clf.fit()
                f1_total += f1_score(valid_fold['Y'], clf.predict(valid_fold['X']))
            except Exception as e:
                # A failing fold is only printed; it adds nothing to the running total.
                print(e)

        mean_f1 = f1_total / folds_num
        accs[method_idx][c_idx] = mean_f1
        print('-> mean f1-score 4-fold cross validation for LogisticRegression with method', method[method_idx],
              'and c =', C, 'is:', mean_f1)


print(accs)
print("--------------")
print("-> Result of 4-fold cross validation for logistic regression:")

# Linear scan for the first strictly-best cell; indices stay at -1 if no score exceeds 0.
best_score, best_row, best_col = 0, -1, -1
for row_idx, row in enumerate(accs):
    for col_idx, score in enumerate(row):
        if score > best_score:
            best_score, best_row, best_col = score, row_idx, col_idx

best_C = CC[best_col]
print("    Best parameter C is:", best_C, "\n",
      "    with method:", method[best_row], "best f1-score =", best_score)
print()

-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 0.01 is: 0.5497463459353171
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 0.1 is: 0.5605014170843892
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 1 is: 0.5630897608005667
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 10 is: 0.5618140097893303
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 0.01 is: 0.5754330373811306
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 0.1 is: 0.5796980084906229
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 1 is: 0.5798789476426768
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 10 is: 0.58002461750957
-> mean f1-score 4-fold cross validation for LogisticRegression with method smote and 

In [21]:
# Logistic-regression sweep over C on the selected-feature balanced sets,
# scored by mean F1 across stratified folds.
CC = [0.01, 0.1, 1, 10]
folds_num = 4

# One row of scores per balancing method, one column per value in CC.
accs = [[0] * len(CC) for _ in range(4)]

for method_idx in range(4):
    feature_slot = 2 * method_idx  # labels sit right after the features in `balanced`
    train_sets, validation_sets = four_fold_CV(balanced[feature_slot], balanced[feature_slot + 1], folds_num)

    for c_idx, C in enumerate(CC):
        f1_total = 0
        for fold in range(folds_num):
            try:
                train_fold, valid_fold = train_sets[fold], validation_sets[fold]
                clf = Classifier(train_fold['X'], train_fold['Y'],
                                 LogisticRegression(C=C, max_iter=10000))
                clf.fit()
                predictions = clf.predict(valid_fold['X'])
                f1_total += f1_score(valid_fold['Y'], predictions)
            except Exception as e:
                # A failing fold is only printed; it adds nothing to the running total.
                print(e)

        accs[method_idx][c_idx] = f1_total / folds_num
        print('-> mean f1-score 4-fold cross validation for LogisticRegression with method', method[method_idx],
              'and c =', C, 'is:', accs[method_idx][c_idx])


print(accs)
print("--------------")
print("-> Result of 4-fold cross validation for logistic regression with our selected features:")

# First strictly-best cell wins; indices stay at -1 if no score exceeds 0.
best_score, best_row, best_col = 0, -1, -1
for row_idx in range(len(accs)):
    for col_idx in range(len(accs[row_idx])):
        candidate = accs[row_idx][col_idx]
        if candidate > best_score:
            best_score, best_row, best_col = candidate, row_idx, col_idx

best_C = CC[best_col]
print("    Best parameter C is:", best_C, "\n",
      "    with method:", method[best_row], "best f1-score =", best_score)
print()

-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 0.01 is: 0.5763873011570309
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 0.1 is: 0.5987956875546934
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 1 is: 0.6025844375210536
-> mean f1-score 4-fold cross validation for LogisticRegression with method rus and c = 10 is: 0.6025844375210536
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 0.01 is: 0.6062022430476869
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 0.1 is: 0.6092748532013329
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 1 is: 0.6092748532013329
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros and c = 10 is: 0.6092748532013329
-> mean f1-score 4-fold cross validation for LogisticRegression with method smote an

[[0.5765120673537338, 0.5823695913009046, 0.5825923605025684, 0.582777659349109], [0.5846216679022144, 0.5864178555777048, 0.5864581939477248, 0.5864913060821646], [0.5834911815785344, 0.5842760118063621, 0.5845090366316807, 0.5844973220431374], [0.5581654481477375, 0.5561945259391313, 0.5504467794440864, 0.549506404407838]]
--------------
-> Result of 4-fold cross validation for logistic regression with our selected features:
    Best parameter C is: 10 
     with method: ros best accuracy = 0.5864913060821646
[[0.5734392089207982, 0.576941444283601, 0.577563817166687, 0.5773869989578434], [0.5770660722173564, 0.5760060571866957, 0.5773382764772736, 0.5768539684486106], [0.5756844437605892, 0.5751845859666964, 0.5751091243830301, 0.5755657699681545], [0.5977481743060398, 0.5965780219319656, 0.5963515289073948, 0.5954563077004094]]
--------------
-> Result of 4-fold cross validation for logistic regression :
    Best parameter C is: 0.01 
     with method: nm best accuracy = 0.5977481743060398

In [9]:
# Random-forest baseline on the all-features balanced sets: one mean F1
# (over stratified folds) per balancing method.
folds_num = 4
accs = [0] * 4

for method_idx in range(4):
    # Flat layout: features at 2k, labels at 2k + 1 for method k.
    train_sets, validation_sets = four_fold_CV(
        balanced_all_fetuers[2 * method_idx],
        balanced_all_fetuers[2 * method_idx + 1],
        folds_num,
    )

    f1_total = 0
    for fold in range(folds_num):
        try:
            train_fold = train_sets[fold]
            valid_fold = validation_sets[fold]
            clf = Classifier(train_fold['X'], train_fold['Y'], RandomForestClassifier(random_state=0))
            clf.fit()
            f1_total += f1_score(valid_fold['Y'], clf.predict(valid_fold['X']))
        except Exception as e:
            # A failing fold is only printed; it adds nothing to the running total.
            print(e)

    accs[method_idx] = f1_total / folds_num
    print('-> mean f1-score 4-fold cross validation for RandomForestClassifier with method', method[method_idx],
          'is:', accs[method_idx])


print(accs)
print("--------------")
print("-> Result of 4-fold cross validation for random forest :")
# Index of the best-scoring balancing method.
c_ind = np.argmax(accs)
print("with method:", method[c_ind], "best f1-score =", max(accs))
print()

-> mean f1-score 4-fold cross validation for RandomForestClassifier with method rus is: 0.5976990678126525
-> mean f1-score 4-fold cross validation for RandomForestClassifier with method ros is: 0.9291436604199588
-> mean f1-score 4-fold cross validation for RandomForestClassifier with method smote is: 0.7758391878418495
-> mean f1-score 4-fold cross validation for RandomForestClassifier with method nm is: 0.6305172853000833
[0.5976990678126525, 0.9291436604199588, 0.7758391878418495, 0.6305172853000833]
--------------
-> Result of 4-fold cross validation for random forest :
with method: ros best f1-score = 0.9291436604199588



In [11]:
# Random-forest baseline on the selected-feature balanced sets: one mean F1
# (over stratified folds) per balancing method.
accs = [0 for j in range(4)]
folds_num = 4
for j in range(0, 8, 2):
    # balanced is flat: features at index j, labels at index j + 1.
    train_sets, validation_sets = four_fold_CV(balanced[j], balanced[j+1], folds_num)
    avg_acc = 0
    for i in range(folds_num):
        try:
            t_X = train_sets[i]['X']
            t_Y = train_sets[i]['Y']
            v_X = validation_sets[i]['X']
            v_Y = validation_sets[i]['Y']
            clf = Classifier(t_X, t_Y, RandomForestClassifier(random_state=0))
            clf.fit()
            avg_acc += f1_score(v_Y, clf.predict(v_X))
        except Exception as e:
            # Best effort: a failed fold is reported and contributes 0 to the mean.
            print('fold', i, 'failed for method', method[j//2], ':', e)

    accs[j//2] = avg_acc / folds_num
    # The model trained here is a random forest, so the message names it as such.
    print('-> mean f1-score {}-fold cross validation for RandomForestClassifier with method'.format(folds_num),
          method[j//2], 'is:', accs[j//2])


print(accs)
print("--------------")
print("-> Result of {}-fold cross validation for random forest with selected features:".format(folds_num))
# Index of the best-scoring balancing method.
c_ind = np.argmax(accs)
# The reported number is an F1 score, not an accuracy.
print("with method:", method[c_ind], "best f1-score =", max(accs))
print()

-> mean f1-score 4-fold cross validation for LogisticRegression with method rus is: 0.58057935575151
-> mean f1-score 4-fold cross validation for LogisticRegression with method ros is: 0.8545836828489337
-> mean f1-score 4-fold cross validation for LogisticRegression with method smote is: 0.7613718857268211
-> mean f1-score 4-fold cross validation for LogisticRegression with method nm is: 0.5791174608431623
[0.58057935575151, 0.8545836828489337, 0.7613718857268211, 0.5791174608431623]
--------------
-> Result of 4-fold cross validation for random forest :
with method: ros best accuracy = 0.8545836828489337



[0.5982587020441167, 0.7205548903150465, 0.7321080204319267, 0.7119640286509841]
--------------
-> Result of 4-fold cross validation for random forest with our selected features:
with method: smote best accuracy = 0.7321080204319267

[0.58910022555684, 0.8691357481897046, 0.7831332783301392, 0.6622645900004881]
--------------
-> Result of 4-fold cross validation for random forest :
with method: ros best accuracy = 0.8691357481897046

In [13]:
# XGBoost baseline on the all-features balanced sets: one mean F1
# (over stratified folds) per balancing method.
accs = [0 for j in range(4)]
folds_num = 4
for j in range(0, 8, 2):
    # balanced_all_fetuers is flat: features at index j, labels at index j + 1.
    train_sets, validation_sets = four_fold_CV(balanced_all_fetuers[j], balanced_all_fetuers[j+1], folds_num)
    avg_acc = 0
    for i in range(folds_num):
        try:
            t_X = train_sets[i]['X']
            t_Y = train_sets[i]['Y']
            v_X = validation_sets[i]['X']
            v_Y = validation_sets[i]['Y']
            clf = Classifier(t_X, t_Y, XGBClassifier())
            clf.fit()
            avg_acc += f1_score(v_Y, clf.predict(v_X))
        except Exception as e:
            # Best effort: a failed fold is reported and contributes 0 to the mean.
            print('fold', i, 'failed for method', method[j//2], ':', e)

    accs[j//2] = avg_acc / folds_num
    print('-> mean f1-score {}-fold cross validation for XGBClassifier with method'.format(folds_num),
          method[j//2], 'is:', accs[j//2])


print(accs)
print("--------------")
print("-> Result of {}-fold cross validation for XGBClassifier:".format(folds_num))
# Index of the best-scoring balancing method.
c_ind = np.argmax(accs)
# The reported number is an F1 score, not an accuracy.
print("with method:", method[c_ind], "best f1-score =", max(accs))
print()



-> mean f1-score 4-fold cross validation for XGBClassifier with method rus is: 0.586954141985848




-> mean f1-score 4-fold cross validation for XGBClassifier with method ros is: 0.7293677023165749




-> mean f1-score 4-fold cross validation for XGBClassifier with method smote is: 0.7100597789238049




-> mean f1-score 4-fold cross validation for XGBClassifier with method nm is: 0.6581494567112133
[0.586954141985848, 0.7293677023165749, 0.7100597789238049, 0.6581494567112133]
--------------
-> Result of 4-fold cross validation for XGBClassifier:
with method: ros best accuracy = 0.7293677023165749



In [20]:
# XGBoost baseline on the selected-feature balanced sets: one mean F1
# (over stratified folds) per balancing method.
accs = [0 for j in range(4)]
folds_num = 4
for j in range(0, 8, 2):
    # balanced is flat: features at index j, labels at index j + 1.
    train_sets, validation_sets = four_fold_CV(balanced[j], balanced[j+1], folds_num)
    avg_acc = 0
    for i in range(folds_num):
        try:
            t_X = train_sets[i]['X']
            t_Y = train_sets[i]['Y']
            v_X = validation_sets[i]['X']
            v_Y = validation_sets[i]['Y']
            clf = Classifier(t_X, t_Y, XGBClassifier())
            clf.fit()
            avg_acc += f1_score(v_Y, clf.predict(v_X))
        except Exception as e:
            # Best effort: a failed fold is reported and contributes 0 to the mean.
            print('fold', i, 'failed for method', method[j//2], ':', e)

    accs[j//2] = avg_acc / folds_num
    print('-> mean f1-score {}-fold cross validation for XGBClassifier with method'.format(folds_num),
          method[j//2], 'is:', accs[j//2])


print(accs)
print("--------------")
print("-> Result of {}-fold cross validation for XGBClassifier with selected features:".format(folds_num))
# Index of the best-scoring balancing method.
c_ind = np.argmax(accs)
# The reported number is an F1 score, not an accuracy.
print("with method:", method[c_ind], "best f1-score =", max(accs))
print()



-> mean f1-score 4-fold cross validation for XGBClassifier with method rus is: 0.5800582126988227




-> mean f1-score 4-fold cross validation for XGBClassifier with method ros is: 0.6884348746023645




-> mean f1-score 4-fold cross validation for XGBClassifier with method smote is: 0.6868460739006393




-> mean f1-score 4-fold cross validation for XGBClassifier with method nm is: 0.6399611244747678
[0.5800582126988227, 0.6884348746023645, 0.6868460739006393, 0.6399611244747678]
--------------
-> Result of 4-fold cross validation for XGBClassifier with selected features:
with method: ros best accuracy = 0.6884348746023645



[0.613850239551532, 0.6784836802586837, 0.7255296747658944, 0.7187145833327385]
--------------
-> Result of 4-fold cross validation for XGBClassifier with our selected features :
with method: smote best accuracy = 0.7255296747658944

[0.586101996041527, 0.7164887569980167, 0.7778297949736547, 0.6843311608674746]
--------------
-> Result of 4-fold cross validation for XGBClassifier :
with method: smote best accuracy = 0.7778297949736547


We use four ways to handle imbalanced data:
1. Random undersampling can be defined as removing some observations of the majority class. This is done until the majority and minority classes are balanced out.

Undersampling can be a good choice when you have a ton of data -think millions of rows. But a drawback to undersampling is that we are removing information that may be valuable.
2. Random oversampling can be defined as adding copies of observations from the minority class. This is done until the majority and minority classes are balanced out.

Oversampling can be a good choice when you do not have a ton of data to work with. But a drawback to oversampling is that duplicating minority observations can cause overfitting and poor generalization to unseen data.
3.SMOTE (Synthetic Minority Oversampling Technique) works by randomly picking a point from the minority class and computing the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors.
SMOTE algorithm works in 4 simple steps:

1. Choose a minority-class sample as the input vector.
2. Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function).
3. Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor.
4. Repeat the steps until the data is balanced.
4. NearMiss is an under-sampling technique. Instead of resampling the minority class, it uses a distance measure to choose which majority-class samples to keep, shrinking the majority class until it is the same size as the minority class.