## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') 
plt.rcParams['figure.figsize']=[15,8]
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier,StackingClassifier,VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,roc_curve, precision_score, recall_score, f1_score, cohen_kappa_score

In [4]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [2]:
#from lightgbm import LGBMClassifier

#### Import the CSV Data as Pandas DataFrame

In [26]:
df = pd.read_excel('CHURNDATA (1).xlsx')

#### Show Top 5 Records

In [27]:
df.head()

Unnamed: 0,CIF,CUS_DOB,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,CUS_Customer_Since,YEARS_WITH_US,# total debit transactions for S1,# total debit transactions for S2,...,total credit amount for S2,total credit amount for S3,total debit amount,total debit transactions,total credit amount,total credit transactions,total transactions,CUS_Target,TAR_Desc,Status
0,XXXXXX,Feb 13 1970 12:00AM,49,7116.64,MALE,MARRIED,1994-06-30,25,277,265,...,1764079.61,2378592.62,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,ACTIVE
1,XXXXXX,Sep 20 1973 12:00AM,46,1500000.0,FEMALE,SINGLE,2005-05-19,14,37,15,...,19500.0,57500.0,139363.22,97,87000.0,10,107,2223,LOW,ACTIVE
2,XXXXXX,Jul 18 1966 12:00AM,53,5000000.0,FEMALE,SINGLE,2005-05-20,14,31,14,...,240311.84,70946.86,37375.46,53,1020883.7,27,80,2222,MIDLE,ACTIVE
3,XXXXXX,Jan 9 2004 12:00AM,15,500.0,FEMALE,SINGLE,2005-05-20,14,0,1,...,28089.99,18753.92,4000.0,1,76946.96,12,13,2223,LOW,ACTIVE
4,XXXXXX,Mar 7 1971 12:00AM,48,9000000.0,FEMALE,SINGLE,2014-06-30,5,15,33,...,176000.0,70500.0,338470.29,60,442100.0,21,81,2223,LOW,ACTIVE


In [28]:
df.columns=['CIF','CUS_DOB','AGE','CUS_Month_Income','CUS_Gender','CUS_Marital_Status','CUS_Customer_Since','YEARS_WITH_US','debit_trans_S1','debit_trans_S2','debit_trans_S3','debit_amount_S1','debit_amount_S2','debit_amount_S3','credit_trans_S1','credit_trans_S2','credit_trans_S3','credit_amount_S1','credit_amount_S2','credit_amount_S3','total_debit_amount','total_debit_transactions','total_credit_amount','total_credit_transactions','total_transactions','CUS_Target','TAR_Desc','Status']

In [29]:
df.drop(['CIF','CUS_DOB','CUS_Customer_Since'],axis=1,inplace=True)

In [30]:
df.dropna(inplace=True)

In [31]:
df.drop(['debit_trans_S1','debit_trans_S2','debit_trans_S3','debit_amount_S1','debit_amount_S2','debit_amount_S3','credit_trans_S1','credit_trans_S2','credit_trans_S3','credit_amount_S1','credit_amount_S2','credit_amount_S3'],axis=1,inplace=True)

In [32]:
df['CUS_Target']=df['CUS_Target'].astype(object)

In [33]:
df['total_amount']=df['total_credit_amount']-df['total_debit_amount']

In [34]:
df.head()

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,total_debit_amount,total_debit_transactions,total_credit_amount,total_credit_transactions,total_transactions,CUS_Target,TAR_Desc,Status,total_amount
0,49,7116.64,MALE,MARRIED,25,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,ACTIVE,901341.7
1,46,1500000.0,FEMALE,SINGLE,14,139363.22,97,87000.0,10,107,2223,LOW,ACTIVE,-52363.22
2,53,5000000.0,FEMALE,SINGLE,14,37375.46,53,1020883.7,27,80,2222,MIDLE,ACTIVE,983508.24
3,15,500.0,FEMALE,SINGLE,14,4000.0,1,76946.96,12,13,2223,LOW,ACTIVE,72946.96
4,48,9000000.0,FEMALE,SINGLE,5,338470.29,60,442100.0,21,81,2223,LOW,ACTIVE,103629.71


#### Preparing X and Y variables

In [37]:
df['Status']=df['Status'].map({'ACTIVE':0,'CHURN':1})

In [38]:
df.head()

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,total_debit_amount,total_debit_transactions,total_credit_amount,total_credit_transactions,total_transactions,CUS_Target,TAR_Desc,Status,total_amount
0,49,7116.64,MALE,MARRIED,25,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,0,901341.7
1,46,1500000.0,FEMALE,SINGLE,14,139363.22,97,87000.0,10,107,2223,LOW,0,-52363.22
2,53,5000000.0,FEMALE,SINGLE,14,37375.46,53,1020883.7,27,80,2222,MIDLE,0,983508.24
3,15,500.0,FEMALE,SINGLE,14,4000.0,1,76946.96,12,13,2223,LOW,0,72946.96
4,48,9000000.0,FEMALE,SINGLE,5,338470.29,60,442100.0,21,81,2223,LOW,0,103629.71


In [39]:
X = df.drop(columns=['Status'],axis=1)

In [40]:
X.head()

Unnamed: 0,AGE,CUS_Month_Income,CUS_Gender,CUS_Marital_Status,YEARS_WITH_US,total_debit_amount,total_debit_transactions,total_credit_amount,total_credit_transactions,total_transactions,CUS_Target,TAR_Desc,total_amount
0,49,7116.64,MALE,MARRIED,25,4758311.63,887,5659653.33,65,952,2231,EXECUTIVE,901341.7
1,46,1500000.0,FEMALE,SINGLE,14,139363.22,97,87000.0,10,107,2223,LOW,-52363.22
2,53,5000000.0,FEMALE,SINGLE,14,37375.46,53,1020883.7,27,80,2222,MIDLE,983508.24
3,15,500.0,FEMALE,SINGLE,14,4000.0,1,76946.96,12,13,2223,LOW,72946.96
4,48,9000000.0,FEMALE,SINGLE,5,338470.29,60,442100.0,21,81,2223,LOW,103629.71


In [14]:
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [41]:
y = df['Status']

In [42]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1244    1
1245    0
1246    0
1247    0
1248    0
Name: Status, Length: 1238, dtype: int64

In [43]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [44]:
X = preprocessor.fit_transform(X)

In [46]:
X.shape

(1238, 33)

In [47]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((990, 33), (248, 33))

#### Create an Evaluate Function to give all metrics after model Training

In [48]:
def model_train_and_score(estimator, X_train, y_train, X_test, y_test):
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    y_pred_train = estimator.predict(X_train)
    
    print("**************************************************")
    print("________"+estimator.__class__.__name__+"__________")
    print()
    print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("Precision: {}".format(precision_score(y_test, y_pred, average = 'weighted')))
    print("Recall: {}".format(recall_score(y_test, y_pred, average = 'weighted')))
    print("F1 Score: {}".format(f1_score(y_test, y_pred, average = 'weighted')))
    print()
    print('Classification Report on train data \n',classification_report(y_train,y_pred_train))
    print()
    print('Classification Report on test data \n',classification_report(y_test,y_pred))
    print()
    print("**************************************************") 

In [49]:
model_train_and_score(RandomForestClassifier(), X_train, y_train, X_test, y_test)

**************************************************
________RandomForestClassifier__________

Accuracy: 0.8548387096774194
Precision: 0.8465392186419337
Recall: 0.8548387096774194
F1 Score: 0.8499857448030511

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       208
           1       0.56      0.47      0.51        40

    accuracy                           0.85       248
   macro avg       0.73      0.70      0.71       248
weighted avg       0.85      0.85      0.85       248


**************************************************


In [51]:
model_train_and_score(RandomForestClassifier(n_jobs=-1, random_state = 53), X_train, y_train, X_test, y_test)

**************************************************
________RandomForestClassifier__________

Accuracy: 0.8629032258064516
Precision: 0.8508434448061557
Recall: 0.8629032258064516
F1 Score: 0.8547197161463405

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       208
           1       0.60      0.45      0.51        40

    accuracy                           0.86       248
   macro avg       0.75      0.70      0.72       248
weighted avg       0.85      0.86      0.85       248


**************************************************


In [52]:
model_train_and_score(LogisticRegression(), X_train, y_train, X_test, y_test)

**************************************************
________LogisticRegression__________

Accuracy: 0.8467741935483871
Precision: 0.8189184977403772
Recall: 0.8467741935483871
F1 Score: 0.8215622634992711

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.85      0.97      0.91       807
           1       0.65      0.21      0.32       183

    accuracy                           0.83       990
   macro avg       0.75      0.59      0.61       990
weighted avg       0.81      0.83      0.80       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       208
           1       0.56      0.25      0.34        40

    accuracy                           0.85       248
   macro avg       0.71      0.61      0.63       248
weighted avg       0.82      0.85      0.82       248


**************************************************


In [53]:
model_train_and_score(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

**************************************************
________DecisionTreeClassifier__________

Accuracy: 0.7540322580645161
Precision: 0.793038160339203
Recall: 0.7540322580645161
F1 Score: 0.770346218055948

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.88      0.81      0.85       208
           1       0.32      0.45      0.37        40

    accuracy                           0.75       248
   macro avg       0.60      0.63      0.61       248
weighted avg       0.79      0.75      0.77       248


**************************************************


In [54]:
model_train_and_score(AdaBoostClassifier(), X_train, y_train, X_test, y_test)

**************************************************
________AdaBoostClassifier__________

Accuracy: 0.8508064516129032
Precision: 0.8523244278726096
Recall: 0.8508064516129032
F1 Score: 0.8515476481793363

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.91      0.95      0.93       807
           1       0.70      0.57      0.63       183

    accuracy                           0.88       990
   macro avg       0.80      0.76      0.78       990
weighted avg       0.87      0.88      0.87       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.91      0.91      0.91       208
           1       0.54      0.55      0.54        40

    accuracy                           0.85       248
   macro avg       0.72      0.73      0.73       248
weighted avg       0.85      0.85      0.85       248


**************************************************


In [55]:
model_train_and_score(GradientBoostingClassifier(), X_train, y_train, X_test, y_test)

**************************************************
________GradientBoostingClassifier__________

Accuracy: 0.8467741935483871
Precision: 0.8378761815247655
Recall: 0.8467741935483871
F1 Score: 0.8416516195143317

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       807
           1       0.94      0.76      0.84       183

    accuracy                           0.95       990
   macro avg       0.94      0.87      0.90       990
weighted avg       0.95      0.95      0.94       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.90      0.92      0.91       208
           1       0.53      0.45      0.49        40

    accuracy                           0.85       248
   macro avg       0.71      0.69      0.70       248
weighted avg       0.84      0.85      0.84       248


**************************************************


In [56]:
scorecard=pd.DataFrame(columns=['Model','Precision','Recall','F1_score','Train_Accuracy','Test_Accuracy','Kappa_score'])
def model_accuracy(model_name,classifier,X_train,y_train,X_test,y_test):
    
    # traing data 
    model=classifier.fit(X_train,y_train)
    y_train_pred=model.predict(X_train)
    #testing data
    y_test_pred=model.predict(X_test)
    
    Train_accuracy=model.score(X_train,y_train)
    Test_accuracy=model.score(X_test,y_test)
    
    global scorecard
    scorecard=scorecard.append({'Model':model_name
                               ,'Precision':precision_score(y_test,y_test_pred, average = 'weighted')
                               ,'Recall':recall_score(y_test,y_test_pred, average = 'weighted')
                               ,'F1_score':f1_score(y_test,y_test_pred, average = 'weighted')
                               ,'Train_Accuracy':Train_accuracy
                               ,'Test_Accuracy':Test_accuracy
                               ,'Kappa_score':cohen_kappa_score(y_test,y_test_pred)},ignore_index=True
                              )

In [57]:
model_accuracy('Decision Tree', DecisionTreeClassifier(), X_train, y_train, X_test, y_test)

In [58]:
model_accuracy('Logistic Regression', LogisticRegression(), X_train, y_train, X_test, y_test)

In [59]:
model_accuracy('Random Forest', RandomForestClassifier(), X_train, y_train, X_test, y_test)

In [66]:
from sklearn.ensemble import ExtraTreesClassifier
model_accuracy('Extra Trees Classifier', ExtraTreesClassifier(), X_train, y_train, X_test, y_test)

In [61]:
model_accuracy('Bagging Classifier', BaggingClassifier(), X_train, y_train, X_test, y_test)

In [62]:
model_accuracy('AdaBoost Classifier', AdaBoostClassifier(), X_train, y_train, X_test, y_test)

In [63]:
model_accuracy('GradientBoosting Classifier', GradientBoostingClassifier(), X_train, y_train, X_test, y_test)

In [64]:
model_accuracy('XGB Classifier', XGBClassifier(), X_train, y_train, X_test, y_test)

In [65]:
model_accuracy('LGBM Classifier', LGBMClassifier(), X_train, y_train, X_test, y_test)

In [68]:
scorecard

Unnamed: 0,Model,Precision,Recall,F1_score,Train_Accuracy,Test_Accuracy,Kappa_score
0,Decision Tree,0.800259,0.762097,0.777876,1.0,0.762097,0.249487
1,Logistic Regression,0.818918,0.846774,0.821562,0.833333,0.846774,0.271941
2,Random Forest,0.835032,0.842742,0.838434,1.0,0.842742,0.387848
3,Bagging Classifier,0.804132,0.826613,0.812419,0.986869,0.826613,0.262313
4,AdaBoost Classifier,0.852324,0.850806,0.851548,0.875758,0.850806,0.454069
5,GradientBoosting Classifier,0.837876,0.846774,0.841652,0.946465,0.846774,0.397134
6,XGB Classifier,0.849192,0.854839,0.85171,1.0,0.854839,0.440882
7,LGBM Classifier,0.823882,0.830645,0.826995,1.0,0.830645,0.347695
8,Extra Trees Classifier,0.838264,0.850806,0.84293,1.0,0.850806,0.393443


In [69]:
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest',RandomForestClassifier()))
models.append(('Extra Trees Classifier',ExtraTreesClassifier()))
models.append(('Bagging Classifier',BaggingClassifier()))

In [70]:
results = []
names = []
for name, model in models:
    kfold = KFold(shuffle=True,n_splits=3,random_state=0)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_weighted')
    results.append(cv_results)
    names.append(name)
    print("%s: Bias error: %f Variance error: (%f)" % (name, 1-np.mean(cv_results),np.std(cv_results,ddof=1)))

Logistic Regression: Bias error: 0.214318 Variance error: (0.031371)
Decision Tree: Bias error: 0.202910 Variance error: (0.022725)
Random Forest: Bias error: 0.162977 Variance error: (0.014403)
Extra Trees Classifier: Bias error: 0.181826 Variance error: (0.006913)
Bagging Classifier: Bias error: 0.179922 Variance error: (0.016455)


In [71]:
models1 = []
models1.append(('AdaBoost Classifier',AdaBoostClassifier()))
models1.append(('GradientBoosting Classifier',GradientBoostingClassifier()))
models1.append(('XGB Classifier',XGBClassifier()))
models1.append(('LGBM Classifier',LGBMClassifier()))

In [72]:
results = []
names = []
for name, model in models1:
    kfold = KFold(shuffle=True,n_splits=3,random_state=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_weighted')
    results.append(cv_results)
    names.append(name)
    print("%s: Bias error: %f Variance error: (%f)" % (name, 1-np.mean(cv_results),np.std(cv_results,ddof=1)))

AdaBoost Classifier: Bias error: 0.182519 Variance error: (0.044658)
GradientBoosting Classifier: Bias error: 0.173912 Variance error: (0.031221)
XGB Classifier: Bias error: 0.181233 Variance error: (0.023450)
LGBM Classifier: Bias error: 0.173211 Variance error: (0.027403)


In [None]:
#Tuning

In [73]:
LR = LogisticRegression(random_state=53)

hyp_parameters = {'penalty':['l1','l2'],'max_iter':[30,50,100,120,150]}

kf = KFold(n_splits=5,shuffle=True,random_state=10)

grid_LR = GridSearchCV(estimator=LR,param_grid=hyp_parameters,cv=kf,scoring='f1_weighted')

grid_LR.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=10, shuffle=True),
             estimator=LogisticRegression(random_state=53),
             param_grid={'max_iter': [30, 50, 100, 120, 150],
                         'penalty': ['l1', 'l2']},
             scoring='f1_weighted')

In [76]:
print('Best Parameters for LogisticRegression \n',grid_LR.best_params_)

Best Parameters for LogisticRegression 
 {'max_iter': 50, 'penalty': 'l2'}


In [75]:
rf = RandomForestClassifier(random_state=53)
hyp_parameters = {'n_estimators':[10,20,30,50],'criterion':['gini','entropy'],'max_depth':[2,3,5,6],
                  'min_samples_split':[2,3,4,5],'min_samples_leaf':[1,3,5],'max_leaf_nodes':[2,3,4]}
Kf = KFold(n_splits=5,shuffle=True,random_state=10)
grid_rf = GridSearchCV(estimator=rf,param_grid=hyp_parameters,cv=Kf,scoring='f1_weighted')
grid_rf.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=10, shuffle=True),
             estimator=RandomForestClassifier(random_state=53),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 6], 'max_leaf_nodes': [2, 3, 4],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [10, 20, 30, 50]},
             scoring='f1_weighted')

In [77]:
print('Best Parameters for RandomForest \n',grid_rf.best_params_)

Best Parameters for RandomForest 
 {'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 50}


In [78]:
Dt = DecisionTreeClassifier(random_state=53)

hyp_parameters = {'criterion':['gini','entropy'],
                  'max_depth':[2,3,5,6],
                  'min_samples_split':[2,3,4,5],
                  'min_samples_leaf':[1,3,5],
                  'max_leaf_nodes':[2,3,4],
                  'max_features':['auto','sqrt','log2']}

Kf = KFold(n_splits=5,shuffle=True,random_state=10)
grid_Dt = GridSearchCV(estimator=Dt,param_grid=hyp_parameters,cv=Kf,scoring='f1_weighted')
grid_Dt.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=10, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=53),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 6],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [2, 3, 4],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [2, 3, 4, 5]},
             scoring='f1_weighted')

In [80]:
print('The best parameters from Decision Tree Classifier are:\n ', grid_Dt.best_params_)

The best parameters from Decision Tree Classifier are:
  {'criterion': 'gini', 'max_depth': 3, 'max_features': 'auto', 'max_leaf_nodes': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [81]:
Ada = AdaBoostClassifier(random_state=53)

hyp_parameters = {'base_estimator':[RandomForestClassifier(),DecisionTreeClassifier()],'n_estimators':[30,50,80,100],
                 'learning_rate':[0.5,1,1.5,2]}

kf = KFold(n_splits=5,shuffle=True,random_state=10)

grid_Ada = GridSearchCV(estimator=Ada,param_grid=hyp_parameters,cv=kf,scoring='f1_weighted')

grid_Ada.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=10, shuffle=True),
             estimator=AdaBoostClassifier(random_state=53),
             param_grid={'base_estimator': [RandomForestClassifier(),
                                            DecisionTreeClassifier()],
                         'learning_rate': [0.5, 1, 1.5, 2],
                         'n_estimators': [30, 50, 80, 100]},
             scoring='f1_weighted')

In [82]:
print('Best Parameters for AdaBoostClassifier \n',grid_Ada.best_params_)

Best Parameters for AdaBoostClassifier 
 {'base_estimator': RandomForestClassifier(), 'learning_rate': 0.5, 'n_estimators': 30}


In [83]:
bag = BaggingClassifier(n_jobs=-1, random_state = 53)

hyp_parameters = {'base_estimator':[RandomForestClassifier(), LogisticRegression()],
                  'n_estimators':[10, 20, 40],
                  'max_samples':[1,2,3],
                  'max_features':[1,2,3]}

Kf = KFold(n_splits=3, shuffle=True, random_state=10)
grid_bag = GridSearchCV(estimator=bag, param_grid=hyp_parameters, cv=Kf, scoring='f1_weighted')
grid_bag.fit(X_train,y_train)
print('The best parameters from KNeighbors Classifier are:\n ', grid_bag.best_params_)

The best parameters from KNeighbors Classifier are:
  {'base_estimator': RandomForestClassifier(), 'max_features': 1, 'max_samples': 1, 'n_estimators': 10}


In [84]:
hyper_params = [{'learning_rate': [0.4, 0.7, 1.0], 'max_depth': [5,6], 'gamma': [0, 2]}]
kf=KFold(n_splits=3,shuffle=True,random_state=0)
xgb = XGBClassifier(random_state = 10)
xgb_grid = GridSearchCV(estimator = xgb, param_grid = hyper_params, cv = kf, scoring='f1_weighted')
xgb_grid = xgb_grid.fit(X_train, y_train)

In [85]:
print('The best parameters from KNeighbors Classifier are:\n ', xgb_grid.best_params_)

The best parameters from KNeighbors Classifier are:
  {'gamma': 0, 'learning_rate': 0.4, 'max_depth': 5}


In [86]:
scorecard_tuned=pd.DataFrame(columns=['Model','Precision','Recall','F1_score','Train_Accuracy','Test_Accuracy','Kappa_score'])
def model_accuracy_score(model_name,classifier,X_train,y_train,X_test,y_test):
    
    # traing data 
    model=classifier.fit(X_train,y_train)
    y_train_pred=model.predict(X_train)
    #testing data
    y_test_pred=model.predict(X_test)
    
    Train_accuracy=model.score(X_train,y_train)
    Test_accuracy=model.score(X_test,y_test)
    
    global scorecard_tuned
    scorecard_tuned=scorecard_tuned.append({'Model':model_name
                               ,'Precision':precision_score(y_test,y_test_pred, average = 'weighted')
                               ,'Recall':recall_score(y_test,y_test_pred, average = 'weighted')
                               ,'F1_score':f1_score(y_test,y_test_pred, average = 'weighted')
                               ,'Train_Accuracy':Train_accuracy
                               ,'Test_Accuracy':Test_accuracy
                               ,'Kappa_score':cohen_kappa_score(y_test,y_test_pred)},ignore_index=True
                              )

In [None]:
#Tuned

In [87]:
tuned_LR = LogisticRegression(max_iter = 50, penalty = 'l2', random_state = 53)

In [88]:
model_train_and_score(tuned_LR, X_train, y_train, X_test, y_test)

**************************************************
________LogisticRegression__________

Accuracy: 0.8467741935483871
Precision: 0.8189184977403772
Recall: 0.8467741935483871
F1 Score: 0.8215622634992711

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.85      0.97      0.91       807
           1       0.65      0.21      0.32       183

    accuracy                           0.83       990
   macro avg       0.75      0.59      0.61       990
weighted avg       0.81      0.83      0.80       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       208
           1       0.56      0.25      0.34        40

    accuracy                           0.85       248
   macro avg       0.71      0.61      0.63       248
weighted avg       0.82      0.85      0.82       248


**************************************************


In [89]:
Tuned_DT = DecisionTreeClassifier(criterion='gini', 
                                 max_depth=3, 
                                 max_features='auto', 
                                 max_leaf_nodes=4, 
                                 min_samples_leaf=3, 
                                 min_samples_split=2, 
                                 random_state = 53)


In [90]:
model_train_and_score(Tuned_DT, X_train, y_train, X_test, y_test)

**************************************************
________DecisionTreeClassifier__________

Accuracy: 0.8387096774193549
Precision: 0.8063113604488079
Recall: 0.8387096774193549
F1 Score: 0.8121708036834433

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       807
           1       0.65      0.32      0.43       183

    accuracy                           0.84       990
   macro avg       0.76      0.64      0.67       990
weighted avg       0.82      0.84      0.82       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.87      0.96      0.91       208
           1       0.50      0.23      0.31        40

    accuracy                           0.84       248
   macro avg       0.68      0.59      0.61       248
weighted avg       0.81      0.84      0.81       248


**************************************************


In [91]:
Tuned_Random_forest = RandomForestClassifier(n_estimators = 10, 
    criterion = 'gini', 
    max_depth = 3, 
    min_samples_split = 2, 
    min_samples_leaf = 3,
    max_leaf_nodes = 4,                                        
    random_state = 53,  
    n_jobs = -1)


In [92]:
model_train_and_score(Tuned_Random_forest, X_train, y_train, X_test, y_test)

**************************************************
________RandomForestClassifier__________

Accuracy: 0.8467741935483871
Precision: 0.815518498268635
Recall: 0.8467741935483871
F1 Score: 0.8103525361589877

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.85      0.99      0.91       807
           1       0.79      0.24      0.37       183

    accuracy                           0.85       990
   macro avg       0.82      0.61      0.64       990
weighted avg       0.84      0.85      0.81       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.86      0.98      0.91       208
           1       0.58      0.17      0.27        40

    accuracy                           0.85       248
   macro avg       0.72      0.58      0.59       248
weighted avg       0.82      0.85      0.81       248


**************************************************


In [93]:
tuned_bagging = BaggingClassifier(base_estimator = RandomForestClassifier(),
                                  max_features=1, 
                                  max_samples=1, 
                                  n_estimators = 10,
                                 random_state = 53)


In [94]:
model_train_and_score(tuned_bagging, X_train, y_train, X_test, y_test)

**************************************************
________BaggingClassifier__________

Accuracy: 0.8387096774193549
Precision: 0.7034339229968783
Recall: 0.8387096774193549
F1 Score: 0.7651386530843237

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.82      1.00      0.90       807
           1       0.00      0.00      0.00       183

    accuracy                           0.82       990
   macro avg       0.41      0.50      0.45       990
weighted avg       0.66      0.82      0.73       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       208
           1       0.00      0.00      0.00        40

    accuracy                           0.84       248
   macro avg       0.42      0.50      0.46       248
weighted avg       0.70      0.84      0.77       248


**************************************************


In [95]:
Tuned_AdaBoost = AdaBoostClassifier(base_estimator = RandomForestClassifier(), 
                                    learning_rate = 0.5, 
                                    n_estimators = 30, 
                                   random_state = 53)

In [96]:
model_train_and_score(Tuned_AdaBoost, X_train, y_train, X_test, y_test)

**************************************************
________AdaBoostClassifier__________

Accuracy: 0.8548387096774194
Precision: 0.8440113500597372
Recall: 0.8548387096774194
F1 Score: 0.8481436396835058

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.90      0.93      0.92       208
           1       0.56      0.45      0.50        40

    accuracy                           0.85       248
   macro avg       0.73      0.69      0.71       248
weighted avg       0.84      0.85      0.85       248


**************************************************


In [97]:
tuned_xgb = XGBClassifier(learning_rate = 0, 
             gamma = 0.4, 
             max_depth = 5)

In [98]:
model_train_and_score(tuned_xgb, X_train, y_train, X_test, y_test)

**************************************************
________XGBClassifier__________

Accuracy: 0.8387096774193549
Precision: 0.7034339229968783
Recall: 0.8387096774193549
F1 Score: 0.7651386530843237

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.82      1.00      0.90       807
           1       0.00      0.00      0.00       183

    accuracy                           0.82       990
   macro avg       0.41      0.50      0.45       990
weighted avg       0.66      0.82      0.73       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       208
           1       0.00      0.00      0.00        40

    accuracy                           0.84       248
   macro avg       0.42      0.50      0.46       248
weighted avg       0.70      0.84      0.77       248


**************************************************


In [99]:
ada_tuned_LR = AdaBoostClassifier(base_estimator=tuned_LR, random_state = 53)

In [100]:
model_train_and_score(ada_tuned_LR, X_train, y_train, X_test, y_test)

**************************************************
________AdaBoostClassifier__________

Accuracy: 0.842741935483871
Precision: 0.8161509765196401
Recall: 0.842741935483871
F1 Score: 0.7815066349368799

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.82      0.99      0.90       807
           1       0.62      0.05      0.10       183

    accuracy                           0.82       990
   macro avg       0.72      0.52      0.50       990
weighted avg       0.79      0.82      0.75       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       208
           1       0.67      0.05      0.09        40

    accuracy                           0.84       248
   macro avg       0.76      0.52      0.50       248
weighted avg       0.82      0.84      0.78       248


**************************************************


In [101]:
ada_tuned_DT = AdaBoostClassifier(DecisionTreeClassifier(), random_state = 53)

In [102]:
model_train_and_score(ada_tuned_DT, X_train, y_train, X_test, y_test)

**************************************************
________AdaBoostClassifier__________

Accuracy: 0.7620967741935484
Precision: 0.7871327342137117
Recall: 0.7620967741935484
F1 Score: 0.773244756040455

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.88      0.83      0.85       208
           1       0.31      0.40      0.35        40

    accuracy                           0.76       248
   macro avg       0.60      0.62      0.60       248
weighted avg       0.79      0.76      0.77       248


**************************************************


In [103]:
bag_tuned_LR = BaggingClassifier(LogisticRegression(), random_state = 53)

In [104]:
model_train_and_score(bag_tuned_LR, X_train, y_train, X_test, y_test)

**************************************************
________BaggingClassifier__________

Accuracy: 0.8548387096774194
Precision: 0.8338177665896763
Recall: 0.8548387096774194
F1 Score: 0.836777166641891

Classification Report on train data 
               precision    recall  f1-score   support

           0       0.85      0.97      0.90       807
           1       0.60      0.22      0.33       183

    accuracy                           0.83       990
   macro avg       0.72      0.60      0.61       990
weighted avg       0.80      0.83      0.80       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       208
           1       0.59      0.33      0.42        40

    accuracy                           0.85       248
   macro avg       0.74      0.64      0.67       248
weighted avg       0.83      0.85      0.84       248


**************************************************


In [105]:
from sklearn.ensemble import VotingClassifier,StackingClassifier

estimator = [('RF',RandomForestClassifier()),
             ('Tuned AdaBoost Clssifier',Tuned_AdaBoost),('Xgb',XGBClassifier())]

In [107]:
vot = VotingClassifier(estimators=estimator,voting='soft')
model_train_and_score(vot,X_train, y_train, X_test, y_test)

**************************************************
________VotingClassifier__________

Accuracy: 0.842741935483871
Precision: 0.8350316955496419
Recall: 0.842741935483871
F1 Score: 0.8384338364876254

Classification Report on train data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       807
           1       1.00      1.00      1.00       183

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


Classification Report on test data 
               precision    recall  f1-score   support

           0       0.90      0.92      0.91       208
           1       0.51      0.45      0.48        40

    accuracy                           0.84       248
   macro avg       0.71      0.68      0.69       248
weighted avg       0.84      0.84      0.84       248


**************************************************


In [108]:
model_accuracy_score('Tuned Logistic Regression', tuned_LR, X_train, y_train, X_test, y_test)

In [109]:
model_accuracy_score('Tuned Decision Tree', Tuned_DT, X_train, y_train, X_test, y_test)

In [110]:
model_accuracy_score('Tuned Random Forest', Tuned_Random_forest, X_train, y_train, X_test, y_test)

In [111]:
model_accuracy_score('Tuned Bagging Clssifier', tuned_bagging, X_train, y_train, X_test, y_test)

In [112]:
model_accuracy_score('Tuned AdaBoost Clssifier', Tuned_AdaBoost, X_train, y_train, X_test, y_test)

In [113]:
model_accuracy_score('Tuned XGBClssifier', tuned_xgb, X_train, y_train, X_test, y_test)

In [114]:
model_accuracy_score('Boosted Logistic Regression', ada_tuned_LR, X_train, y_train, X_test, y_test)

In [115]:
model_accuracy_score('Boosted Decision Tree', ada_tuned_DT, X_train, y_train, X_test, y_test)

In [116]:
model_accuracy_score('Bagged Logistic Regression', bag_tuned_LR, X_train, y_train, X_test, y_test)

In [117]:
model_accuracy_score('Voting Classifier', vot, X_train, y_train, X_test, y_test)

In [118]:
scorecard_tuned

Unnamed: 0,Model,Precision,Recall,F1_score,Train_Accuracy,Test_Accuracy,Kappa_score
0,Tuned Logistic Regression,0.818918,0.846774,0.821562,0.833333,0.846774,0.271941
1,Tuned Decision Tree,0.806311,0.83871,0.812171,0.842424,0.83871,0.233622
2,Tuned Random Forest,0.815518,0.846774,0.810353,0.847475,0.846774,0.210456
3,Tuned Bagging Clssifier,0.703434,0.83871,0.765139,0.815152,0.83871,0.0
4,Tuned AdaBoost Clssifier,0.844011,0.854839,0.848144,1.0,0.854839,0.416318
5,Tuned XGBClssifier,0.703434,0.83871,0.765139,0.815152,0.83871,0.0
6,Boosted Logistic Regression,0.816151,0.842742,0.781507,0.819192,0.842742,0.072141
7,Boosted Decision Tree,0.787133,0.762097,0.773245,1.0,0.762097,0.208568
8,Bagged Logistic Regression,0.833818,0.854839,0.836777,0.829293,0.854839,0.344301
9,Voting Classifier,0.843578,0.850806,0.846719,1.0,0.850806,0.419241


In [119]:
models_tuned = []
models_tuned.append(('Tuned Logistic Regression', tuned_LR))
models_tuned.append(('Tuned Decision Tree', Tuned_DT))
models_tuned.append(('Tuned Random Forest',Tuned_Random_forest))
models_tuned.append(('Tuned Bagging Classifier',tuned_bagging))
models_tuned.append(('Tuned Adaboost Classifier',Tuned_AdaBoost))
models_tuned.append(('Tuned XGBClassifier',tuned_xgb))
models_tuned.append(('Boosted Logistic Regression',ada_tuned_LR))
models_tuned.append(('Boosted Decision Tree',ada_tuned_DT))
models_tuned.append(('Bagged Logistic Regression',bag_tuned_LR))
models_tuned.append(('Voting Classifier',vot))

In [120]:
results = []
names = []
for name, model in models_tuned:
    kfold = KFold(shuffle=True,n_splits=3,random_state=0)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1_weighted')
    results.append(cv_results)
    names.append(name)
    print("%s: Bias error: (%f) Variance error: (%f)" % (name, 1-np.mean(cv_results),np.std(cv_results,ddof=1)))

Tuned Logistic Regression: Bias error: (0.214318) Variance error: (0.031371)
Tuned Decision Tree: Bias error: (0.182417) Variance error: (0.010798)
Tuned Random Forest: Bias error: (0.232395) Variance error: (0.010798)
Tuned Bagging Classifier: Bias error: (0.267784) Variance error: (0.025611)
Tuned Adaboost Classifier: Bias error: (0.166585) Variance error: (0.007747)
Tuned XGBClassifier: Bias error: (0.267784) Variance error: (0.025611)
Boosted Logistic Regression: Bias error: (0.240033) Variance error: (0.033878)
Boosted Decision Tree: Bias error: (0.191142) Variance error: (0.033691)
Bagged Logistic Regression: Bias error: (0.221226) Variance error: (0.022644)
Voting Classifier: Bias error: (0.161670) Variance error: (0.010649)


In [123]:
from sklearn.model_selection import cross_val_score

import time

def model_evaluation(clf):
    
    clf = clf
    
    t_start = time.time()
    clf = clf.fit(X_train, y_train)
    t_end = time.time()
    
    c_start = time.time()     
    accuracy  = cross_val_score(clf, X_train, y_train, cv = 3, scoring = 'accuracy')
    f1_score = cross_val_score(clf, X_train, y_train, cv = 3, scoring = 'f1_macro')
    c_end = time.time()    
    
    acc_mean = np.round(accuracy.mean() * 100, 2)
    f1_mean = np.round(f1_score.mean() * 100, 2)
    
    t_time = np.round((t_end - t_start) / 60, 3)
    c_time = np.round((c_end - c_start) / 60, 3)
    
    print("The accuracy score of this classifier on our training set is", acc_mean,"% and f1 score is", f1_mean,"% taking", t_time,"minutes to train and", c_time,
          "minutes to evaluate cross validation and metric scores.")

In [124]:
model_evaluation(RandomForestClassifier(n_jobs=-1, random_state = 53))

The accuracy score of this classifier on our training set is 84.14 % and f1 score is 69.65 % taking 0.003 minutes to train and 0.025 minutes to evaluate cross validation and metric scores.


In [125]:
model_evaluation(DecisionTreeClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 80.1 % and f1 score is 67.37 % taking 0.0 minutes to train and 0.001 minutes to evaluate cross validation and metric scores.


In [126]:
model_evaluation(LogisticRegression(random_state = 53))

The accuracy score of this classifier on our training set is 81.52 % and f1 score is 59.3 % taking 0.001 minutes to train and 0.002 minutes to evaluate cross validation and metric scores.


In [127]:
model_evaluation(AdaBoostClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 83.13 % and f1 score is 69.6 % taking 0.001 minutes to train and 0.006 minutes to evaluate cross validation and metric scores.


In [128]:
model_evaluation(BaggingClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 84.44 % and f1 score is 70.59 % taking 0.001 minutes to train and 0.002 minutes to evaluate cross validation and metric scores.


In [129]:
model_evaluation(GradientBoostingClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 84.24 % and f1 score is 70.57 % taking 0.003 minutes to train and 0.012 minutes to evaluate cross validation and metric scores.


In [130]:
model_evaluation(XGBClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 84.34 % and f1 score is 70.77 % taking 0.002 minutes to train and 0.008 minutes to evaluate cross validation and metric scores.


In [131]:
model_evaluation(LGBMClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 83.23 % and f1 score is 69.22 % taking 0.004 minutes to train and 0.007 minutes to evaluate cross validation and metric scores.


In [132]:
model_evaluation(Tuned_AdaBoost)

The accuracy score of this classifier on our training set is 83.74 % and f1 score is 68.34 % taking 0.003 minutes to train and 0.012 minutes to evaluate cross validation and metric scores.


In [133]:
model_evaluation(tuned_xgb)

The accuracy score of this classifier on our training set is 81.52 % and f1 score is 44.91 % taking 0.004 minutes to train and 0.011 minutes to evaluate cross validation and metric scores.


In [134]:
model_evaluation(vot) 

The accuracy score of this classifier on our training set is 84.75 % and f1 score is 70.54 % taking 0.006 minutes to train and 0.031 minutes to evaluate cross validation and metric scores.


In [135]:
model_evaluation(ExtraTreesClassifier(random_state = 53))

The accuracy score of this classifier on our training set is 83.13 % and f1 score is 67.87 % taking 0.002 minutes to train and 0.008 minutes to evaluate cross validation and metric scores.
