In [74]:
import pandas as pd 
import numpy as np 
import matplotlib as plt
%matplotlib inline

In [123]:

df = pd.read_csv('../data/raw/train.csv')
df.drop(['id', 'CustomerId', 'Surname'], axis=1, inplace=True)
string_col = df.select_dtypes(include="object").columns
df[string_col]=df[string_col].astype("string")
df[string_col].head()
for col in string_col:
    print(f"The distribution of categorical values in the {col} is : ")
    print(df[col].value_counts())


#working on tree based model
from sklearn.preprocessing import LabelEncoder
df_tree = df.apply(LabelEncoder().fit_transform)

## Creaeting one hot encoded features for working with non tree based algorithms 
df_nontree=pd.get_dummies(df,columns=string_col,drop_first=False)

target = 'Exited'
y = df_nontree[target]
df_nontree = df_nontree.drop([target], axis=1)
df_nontree=pd.concat([df_nontree,y],axis=1)

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
acc_log=[]
feature_col_nontree=df_nontree.columns.to_list()
feature_col_nontree.remove(target)

The distribution of categorical values in the Geography is : 
Geography
France     94215
Spain      36213
Germany    34606
Name: count, dtype: Int64
The distribution of categorical values in the Gender is : 
Gender
Male      93150
Female    71884
Name: count, dtype: Int64


In [105]:
kf=model_selection.StratifiedKFold(n_splits=5)
for fold , (trn_,val_) in enumerate(kf.split(X=df_nontree,y=y)):
    
    X_train=df_nontree.loc[trn_,feature_col_nontree]
    y_train=df_nontree.loc[trn_,target]
    
    X_valid=df_nontree.loc[val_,feature_col_nontree]
    y_valid=df_nontree.loc[val_,target]
    
    #print(pd.DataFrame(X_valid).head())
    ro_scaler=MinMaxScaler()
    X_train=ro_scaler.fit_transform(X_train)
    X_valid=ro_scaler.transform(X_valid)
    
    
    clf=LogisticRegression()
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_valid)
    print(f"The fold is : {fold} : ")
    print(classification_report(y_valid,y_pred))
    acc=roc_auc_score(y_valid,y_pred)
    acc_log.append(acc)
    print(f"The accuracy for Fold {fold+1} : {acc}")
    pass

The fold is : 0 : 
              precision    recall  f1-score   support

           0       0.85      0.96      0.90     26023
           1       0.71      0.39      0.50      6984

    accuracy                           0.84     33007
   macro avg       0.78      0.67      0.70     33007
weighted avg       0.82      0.84      0.82     33007

The accuracy for Fold 1 : 0.6721461930165837
The fold is : 1 : 
              precision    recall  f1-score   support

           0       0.85      0.96      0.90     26023
           1       0.70      0.38      0.49      6984

    accuracy                           0.83     33007
   macro avg       0.78      0.67      0.70     33007
weighted avg       0.82      0.83      0.81     33007

The accuracy for Fold 2 : 0.667190541836746
The fold is : 2 : 
              precision    recall  f1-score   support

           0       0.85      0.96      0.90     26023
           1       0.71      0.39      0.51      6984

    accuracy                        

In [106]:
from sklearn.neighbors import KNeighborsClassifier
acc_KNN=[]
kf=model_selection.StratifiedKFold(n_splits=5)
for fold , (trn_,val_) in enumerate(kf.split(X=df_nontree,y=y)):
    
    X_train=df_nontree.loc[trn_,feature_col_nontree]
    y_train=df_nontree.loc[trn_,target]
    
    X_valid=df_nontree.loc[val_,feature_col_nontree]
    y_valid=df_nontree.loc[val_,target]
    
    ro_scaler=MinMaxScaler()
    X_train=ro_scaler.fit_transform(X_train)
    X_valid=ro_scaler.transform(X_valid)
    
    clf=KNeighborsClassifier(n_neighbors=32)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_valid)
    print(f"The fold is : {fold} : ")
    print(classification_report(y_valid,y_pred))
    acc=roc_auc_score(y_valid,y_pred)
    acc_KNN.append(acc)
    print(f"The accuracy for {fold+1} : {acc}")

The fold is : 0 : 
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     26023
           1       0.75      0.46      0.57      6984

    accuracy                           0.85     33007
   macro avg       0.81      0.71      0.74     33007
weighted avg       0.84      0.85      0.84     33007

The accuracy for 1 : 0.7073961254602557
The fold is : 1 : 
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     26023
           1       0.74      0.44      0.56      6984

    accuracy                           0.85     33007
   macro avg       0.81      0.70      0.73     33007
weighted avg       0.84      0.85      0.84     33007

The accuracy for 2 : 0.7019935449867922
The fold is : 2 : 
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     26023
           1       0.74      0.45      0.56      6984

    accuracy                           0.85  

In [124]:
feature_col_tree=df_tree.columns.to_list()
feature_col_tree.remove(target)

In [108]:

from sklearn.tree import DecisionTreeClassifier
acc_Dtree=[]
kf=model_selection.StratifiedKFold(n_splits=5)
for fold , (trn_,val_) in enumerate(kf.split(X=df_tree,y=y)):
    
    X_train=df_tree.loc[trn_,feature_col_tree]
    y_train=df_tree.loc[trn_,target]
    
    X_valid=df_tree.loc[val_,feature_col_tree]
    y_valid=df_tree.loc[val_,target]
    
    clf=DecisionTreeClassifier(criterion="entropy")
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_valid)
    print(f"The fold is : {fold} : ")
    print(classification_report(y_valid,y_pred))
    acc=roc_auc_score(y_valid,y_pred)
    acc_Dtree.append(acc)
    print(f"The accuracy for {fold+1} : {acc}")


The fold is : 0 : 
              precision    recall  f1-score   support

           0       0.88      0.87      0.87     26023
           1       0.53      0.54      0.54      6984

    accuracy                           0.80     33007
   macro avg       0.70      0.71      0.71     33007
weighted avg       0.80      0.80      0.80     33007

The accuracy for 1 : 0.7066472202601285
The fold is : 1 : 
              precision    recall  f1-score   support

           0       0.88      0.87      0.87     26023
           1       0.53      0.54      0.53      6984

    accuracy                           0.80     33007
   macro avg       0.70      0.71      0.70     33007
weighted avg       0.80      0.80      0.80     33007

The accuracy for 2 : 0.7057178585610164
The fold is : 2 : 
              precision    recall  f1-score   support

           0       0.87      0.87      0.87     26023
           1       0.53      0.53      0.53      6984

    accuracy                           0.80  

In [135]:
from xgboost import XGBClassifier
acc_XGB=[]
kf=model_selection.StratifiedKFold(n_splits=5)
for fold , (trn_,val_) in enumerate(kf.split(X=df_tree,y=y)):
    
    X_train=df_tree.loc[trn_,feature_col_tree]
    y_train=df_tree.loc[trn_,target]
    
    X_valid=df_tree.loc[val_,feature_col_tree]
    y_valid=df_tree.loc[val_,target]
    
    best_param = {'gamma': 3.4449286575210953, 'max_depth': 8}
    clf=XGBClassifier(**best_param)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_valid)
    print(f"The fold is : {fold} : ")
    print(classification_report(y_valid,y_pred))
    acc=roc_auc_score(y_valid,y_pred)
    acc_XGB.append(acc)
    print(f"The accuracy for {fold+1} : {acc}")

The fold is : 0 : 
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26023
           1       0.75      0.55      0.64      6984

    accuracy                           0.87     33007
   macro avg       0.82      0.75      0.78     33007
weighted avg       0.86      0.87      0.86     33007

The accuracy for 1 : 0.7507639647920936
The fold is : 1 : 
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26023
           1       0.74      0.54      0.63      6984

    accuracy                           0.86     33007
   macro avg       0.81      0.75      0.77     33007
weighted avg       0.86      0.86      0.86     33007

The accuracy for 2 : 0.7465502997634614
The fold is : 2 : 
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26023
           1       0.75      0.56      0.64      6984

    accuracy                           0.87  

In [132]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'n_estimators': 180,
           }
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma']
                 )
    
    evaluation = [( X_train, y_train), ( X_valid, y_valid)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [133]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 20,
                        trials = trials)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]





SCORE:                                                
0.8652669211658486                                    
  5%|▌         | 1/20 [00:00<00:13,  1.45trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8649033509058959                                                               
 10%|█         | 2/20 [00:01<00:13,  1.29trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8642671029509786                                                               
 15%|█▌        | 3/20 [00:02<00:12,  1.32trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8631460946494577                                                               
 20%|██        | 4/20 [00:03<00:12,  1.32trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8619341937829486                                                               
 25%|██▌       | 5/20 [00:03<00:10,  1.37trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8627219293461795                                                               
 30%|███       | 6/20 [00:04<00:09,  1.48trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8563291522753439                                                               
 35%|███▌      | 7/20 [00:05<00:09,  1.33trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8591771193116403                                                               
 40%|████      | 8/20 [00:05<00:09,  1.30trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8647215657759195                                                               
 45%|████▌     | 9/20 [00:06<00:09,  1.19trial/s, best loss: -0.8652669211658486]





SCORE:                                                                           
0.8646003756892686                                                               
 50%|█████     | 10/20 [00:08<00:10,  1.00s/trial, best loss: -0.8652669211658486]





SCORE:                                                                            
0.861328243349694                                                                 
 55%|█████▌    | 11/20 [00:09<00:08,  1.04trial/s, best loss: -0.8652669211658486]





SCORE:                                                                            
0.864115615342665                                                                 
 60%|██████    | 12/20 [00:10<00:08,  1.02s/trial, best loss: -0.8652669211658486]





SCORE:                                                                            
0.8646912682542568                                                                
 65%|██████▌   | 13/20 [00:11<00:07,  1.04s/trial, best loss: -0.8652669211658486]





SCORE:                                                                            
0.8642368054293159                                                                
 70%|███████   | 14/20 [00:12<00:06,  1.01s/trial, best loss: -0.8652669211658486]





SCORE:                                                                            
0.8650851360358722                                                                
 75%|███████▌  | 15/20 [00:13<00:05,  1.12s/trial, best loss: -0.8652669211658486]





SCORE:                                                                            
0.8655395988608132                                                                
 80%|████████  | 16/20 [00:14<00:04,  1.07s/trial, best loss: -0.8655395988608132]





SCORE:                                                                            
0.860994970611404                                                                 
 85%|████████▌ | 17/20 [00:15<00:03,  1.03s/trial, best loss: -0.8655395988608132]





SCORE:                                                                            
0.8613888383930195                                                                
 90%|█████████ | 18/20 [00:16<00:01,  1.01trial/s, best loss: -0.8655395988608132]





SCORE:                                                                            
0.8650548385142095                                                                
 95%|█████████▌| 19/20 [00:17<00:01,  1.10s/trial, best loss: -0.8655395988608132]





SCORE:                                                                            
0.864782160819245                                                                 
100%|██████████| 20/20 [00:19<00:00,  1.04trial/s, best loss: -0.8655395988608132]


In [134]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'gamma': 3.4449286575210953, 'max_depth': 8.0}
