In [1]:
import pickle   # importing pickle for saving and loading machine learning models
from sklearn.model_selection import train_test_split  # importing train_test_split for spliting the data into training and testing
from imblearn.over_sampling import SMOTE  # importing SMOTE for Balancing the Data
import warnings
warnings.filterwarnings('ignore')

In [2]:
from preprocessing import *

In [3]:
with open('Tree_CT.pkl','rb') as f:
    pre = pickle.load(f)

In [4]:
pre

In [5]:
with open('Processed_data.pkl','rb') as f:
    df = pickle.load(f)

In [6]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [7]:
X = df.drop("Attrition",axis=1)     # Extract the features (all columns except Attritions) from the dataset
y = df["Attrition"].map({"No":0,"Yes":1})  # Extract the target variable from the dataset with converting 0 and 1.

In [8]:
# Spliting the data into train and test
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=33)

In [9]:
# Transform the training data using the preprocessor object or PipeLine
processed_x_train = pre.fit_transform(x_train)

In [10]:
processed_x_train

array([[1., 0., 0., ..., 0., 3., 2.],
       [0., 0., 1., ..., 2., 2., 3.],
       [0., 0., 1., ..., 1., 3., 1.],
       ...,
       [0., 1., 0., ..., 0., 2., 3.],
       [0., 0., 1., ..., 1., 3., 3.],
       [0., 0., 1., ..., 1., 0., 2.]])

In [11]:
processed_x_train[0]

array([ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  1.        , -0.3098615 ,  1.3622128 ,  1.16898078,
       -1.33034804,  0.05469197,  2.11675505, -0.30137951, -0.32177888,
       -0.60819028, -0.68968387, -0.03669649, -0.5936842 ,  3.        ,
        1.        ,  4.        ,  2.        ,  4.        ,  3.        ,
        2.        ,  0.        ,  3.        ,  2.        ])

### Model Building

In [13]:
from sklearn.tree import DecisionTreeClassifier        #importing decision tree from sklearn.tree
dt=DecisionTreeClassifier()             #object creation for decision tree
dt.fit(processed_x_train,y_train)    # sample_weight=sample_weights    #train the model

In [14]:
#Prediction
processed_x_test = pre.transform(x_test)
y_pred=dt.predict(processed_x_test)

In [15]:
#Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.7690217391304348 
Precision: 0.3466666666666667 
Recall Score: 0.41935483870967744 
F1 Score 0.3795620437956204 
Confusion Matrix: 
 [[257  49]
 [ 36  26]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       306
           1       0.35      0.42      0.38        62

    accuracy                           0.77       368
   macro avg       0.61      0.63      0.62       368
weighted avg       0.79      0.77      0.78       368



In [16]:
#Prediction on Training data - Test of overfitting
y_pred1=dt.predict(processed_x_train)

#Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
acc = accuracy_score(y_train,y_pred1)
pr = precision_score(y_train,y_pred1)
re = recall_score(y_train,y_pred1)
f1 = f1_score(y_train,y_pred1)
cm = confusion_matrix(y_train,y_pred1)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_train,y_pred1))

Accuracy Score:  1.0 
Precision: 1.0 
Recall Score: 1.0 
F1 Score 1.0 
Confusion Matrix: 
 [[927   0]
 [  0 175]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       927
           1       1.00      1.00      1.00       175

    accuracy                           1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102



In [17]:
y_train.value_counts()

Attrition
0    927
1    175
Name: count, dtype: int64

In [18]:
## Data Balancing and Creating Model
from imblearn.over_sampling import SMOTE
x_smote,y_smote = SMOTE().fit_resample(processed_x_train,y_train)

dt_bal=DecisionTreeClassifier()            
dt_bal.fit(x_smote,y_smote) 

In [19]:
y_smote.value_counts()

Attrition
1    927
0    927
Name: count, dtype: int64

In [20]:
#Prediction
processed_x_test = pre.transform(x_test)
y_pred=dt_bal.predict(processed_x_test)

In [21]:
#Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.7554347826086957 
Precision: 0.3372093023255814 
Recall Score: 0.46774193548387094 
F1 Score 0.3918918918918919 
Confusion Matrix: 
 [[249  57]
 [ 33  29]]
              precision    recall  f1-score   support

           0       0.88      0.81      0.85       306
           1       0.34      0.47      0.39        62

    accuracy                           0.76       368
   macro avg       0.61      0.64      0.62       368
weighted avg       0.79      0.76      0.77       368



In [42]:
from sklearn.model_selection import GridSearchCV
#creating dictionary--> key value pair of hyperparameters having key as parameter and values as its values
params = {
    "criterion":("gini", "entropy"), #Split criterion
    "splitter":("best", "random"),  #searches the features for a split
    "max_depth":(list(range(1, 10))),  #depth of tree range from 1 to 19
    "min_samples_split":[2,3,4,5,6,7,8,9,10,12,15],    #the minimum number of samples required to split internal node
    "min_samples_leaf":list(range(1, 20)), #minimum number of samples required to be at a leaf node,we are passing list which is range from 1 to 19
}


tree_clf = DecisionTreeClassifier()                # object creation for decision tree with random state 3
tree_cv = GridSearchCV(tree_clf, params, scoring="f1", n_jobs=-1, verbose=2, cv=5)

In [125]:
tree_cv.fit(processed_x_train,y_train)    # training data on gridsearch cv
best_params = tree_cv.best_params_    # it will give you best parameters
print(f"Best paramters: {best_params})")   # printing  best parameters

Fitting 5 folds for each of 7524 candidates, totalling 37620 fits
Best paramters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 14, 'min_samples_split': 8, 'splitter': 'best'})


In [None]:
##Fitting 5 folds for each of 7524 candidates, totalling 37620 fits ---- takes huge time.

In [None]:
tree_cv.best_params_    # getting best parameters from cv

In [44]:
# passing best parameter to decision tree
dt_optimal=DecisionTreeClassifier(criterion='gini',max_depth=6,min_samples_leaf= 14,min_samples_split=8,splitter='best')
dt_optimal.fit(processed_x_train,y_train)

y_pred=dt_optimal.predict(processed_x_test)

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.8260869565217391 
Precision: 0.47058823529411764 
Recall Score: 0.25806451612903225 
F1 Score 0.3333333333333333 
Confusion Matrix: 
 [[288  18]
 [ 46  16]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90       306
           1       0.47      0.26      0.33        62

    accuracy                           0.83       368
   macro avg       0.67      0.60      0.62       368
weighted avg       0.80      0.83      0.80       368



## Random Forest Implementation

In [46]:
from sklearn.ensemble import RandomForestClassifier   # importing randomforest

rf_clf = RandomForestClassifier() # Assigning RandomForest CLassifier into variable
rf_clf.fit(processed_x_train,y_train)   # training the data
y_pred=rf_clf.predict(processed_x_test) 

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))


Accuracy Score:  0.8478260869565217 
Precision: 0.8 
Recall Score: 0.12903225806451613 
F1 Score 0.2222222222222222 
Confusion Matrix: 
 [[304   2]
 [ 54   8]]
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       306
           1       0.80      0.13      0.22        62

    accuracy                           0.85       368
   macro avg       0.82      0.56      0.57       368
weighted avg       0.84      0.85      0.80       368



In [48]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]      # List Comprehension-using for loop in list
max_features = ['auto', 'sqrt','log2']        # maximum number of features allowed to try in individual tree
max_depth = [int(x) for x in np.linspace(10, 110, num=5)]            # List Comprehension-using for loop in list
max_depth.append(None)
min_samples_split = [5, 10]          # minimum number of samples required to split an internal node
min_samples_leaf = [2, 4]              # minimum number of samples required to be at a leaf node.

#dictionary for hyperparameters
random_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf_clf1 = RandomForestClassifier(random_state=42)   # Loading the model

rf_cv = RandomizedSearchCV(estimator=rf_clf1, scoring='f1',param_distributions= random_grid, cv=3,
                               verbose=2, n_jobs=-1)

In [50]:
rf_cv.fit(processed_x_train, y_train)
rf_best_params = rf_cv.best_params_ 
print(f"Best paramters: {rf_best_params})") 

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best paramters: {'n_estimators': 1100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35})


In [52]:
# passing best parameter to randomforest
rf_clf2 = RandomForestClassifier(n_estimators=1100,min_samples_leaf=2,min_samples_split=5,max_features="sqrt",max_depth=35)
rf_clf2.fit(processed_x_train, y_train)  # train with tune parameters

In [54]:
y_pred=rf_clf2.predict(processed_x_test)

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))


Accuracy Score:  0.8478260869565217 
Precision: 0.8 
Recall Score: 0.12903225806451613 
F1 Score 0.2222222222222222 
Confusion Matrix: 
 [[304   2]
 [ 54   8]]
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       306
           1       0.80      0.13      0.22        62

    accuracy                           0.85       368
   macro avg       0.82      0.56      0.57       368
weighted avg       0.84      0.85      0.80       368



## Bagging

In [61]:
from sklearn.ensemble import BaggingClassifier#import bagging 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

KNN=KNeighborsClassifier()
LR=LogisticRegression()

model_bagg=BaggingClassifier(estimator=KNN,n_estimators=20) ## model objet creation
#base_estimator---> algorithm which you want to pass
#n_estimotors-----> number of base learners

model_bagg.fit(processed_x_train,y_train) ## fitting the model
y_pred=model_bagg.predict(processed_x_test) ## getting the prediction

In [63]:
acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.8369565217391305 
Precision: 0.5625 
Recall Score: 0.14516129032258066 
F1 Score 0.23076923076923078 
Confusion Matrix: 
 [[299   7]
 [ 53   9]]
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       306
           1       0.56      0.15      0.23        62

    accuracy                           0.84       368
   macro avg       0.71      0.56      0.57       368
weighted avg       0.80      0.84      0.79       368



In [65]:
model_bagg=BaggingClassifier(estimator=LR,n_estimators=20) ## model objet creation
#base_estimator---> algorithm which you want to pass
#n_estimotors-----> number of base learners
model_bagg.fit(processed_x_train,y_train) ## fitting the model
y_pred=model_bagg.predict(processed_x_test) ## getting the prediction

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.8668478260869565 
Precision: 0.6326530612244898 
Recall Score: 0.5 
F1 Score 0.5585585585585585 
Confusion Matrix: 
 [[288  18]
 [ 31  31]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       306
           1       0.63      0.50      0.56        62

    accuracy                           0.87       368
   macro avg       0.77      0.72      0.74       368
weighted avg       0.86      0.87      0.86       368



### Boosting and XG Boosting

In [67]:
from sklearn.ensemble import GradientBoostingClassifier  # Importing GradientBoostingClassifier
gbm=GradientBoostingClassifier() ## object creation
gbm.fit(processed_x_train,y_train) ## fitting the data

y_pred=gbm.predict(processed_x_test)

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))


Accuracy Score:  0.8722826086956522 
Precision: 0.7419354838709677 
Recall Score: 0.3709677419354839 
F1 Score 0.4946236559139785 
Confusion Matrix: 
 [[298   8]
 [ 39  23]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.93       306
           1       0.74      0.37      0.49        62

    accuracy                           0.87       368
   macro avg       0.81      0.67      0.71       368
weighted avg       0.86      0.87      0.85       368



In [69]:
## Hyperparameter tuning for GradientBoost
# Importing RandomizedSearchCV from sklearn
from sklearn.model_selection import RandomizedSearchCV

# Define Parameters grid for learning_rate, max_depth, n_estimators
param_grid = {
    'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.4, 0.5, 0.6, 0.7],
    'max_depth': [5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 65, 80, 100],
}
GB=GradientBoostingClassifier()  # Assigning GradientBoostingClassifier model into variables

rcv= RandomizedSearchCV(estimator=GB, scoring='f1',refit = True,param_distributions=param_grid, cv=5,
                               verbose=2, n_jobs=-1)
rcv.fit(processed_x_train,y_train)
rcv.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 65, 'max_depth': 5, 'learning_rate': 0.7}

In [71]:
GB2=GradientBoostingClassifier(n_estimators=65, max_depth=6, learning_rate=0.7)
GB2.fit(processed_x_train,y_train)

y_pred=gbm.predict(processed_x_test)

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))


Accuracy Score:  0.8722826086956522 
Precision: 0.7419354838709677 
Recall Score: 0.3709677419354839 
F1 Score 0.4946236559139785 
Confusion Matrix: 
 [[298   8]
 [ 39  23]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.93       306
           1       0.74      0.37      0.49        62

    accuracy                           0.87       368
   macro avg       0.81      0.67      0.71       368
weighted avg       0.86      0.87      0.85       368



## XGBoosting

In [73]:
#!pip install xgboost  #installing model XGBOOST
## model creation
from xgboost import XGBClassifier#importing the model library
xgb_r=XGBClassifier() ## object creation
xgb_r.fit(processed_x_train,y_train)# fitting the data
y_pred=xgb_r.predict(processed_x_test)#predicting the price

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))


Accuracy Score:  0.8586956521739131 
Precision: 0.6388888888888888 
Recall Score: 0.3709677419354839 
F1 Score 0.46938775510204084 
Confusion Matrix: 
 [[293  13]
 [ 39  23]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92       306
           1       0.64      0.37      0.47        62

    accuracy                           0.86       368
   macro avg       0.76      0.66      0.69       368
weighted avg       0.84      0.86      0.84       368



In [None]:
#Hyperparameter tuning
param_grid = {'gamma': [0,0.1,0.2,0.4],
              'learning_rate': [0.01, 0.03, 0.06, 0.1],
              'max_depth': [5,6,7,8,9],
              'n_estimators': [50,65,80],
              'reg_alpha': [0,0.1,0.2,0.4],
              'reg_lambda': [0,0.1,0.2]}

XGB=XGBClassifier(random_state=42,verbosity=0,silent=0)  # Assigning XGBClassifier model into variables
rcv= GridSearchCV(estimator=XGB, scoring='f1',refit=True,param_grid=param_grid,  cv=3,
                               verbose=1, n_jobs=-1)
rcv.fit(processed_x_train,y_train)
rcv.best_params_

Fitting 3 folds for each of 2880 candidates, totalling 8640 fits


In [72]:
XGB2=XGBClassifier(reg_lambda= 0.1, reg_alpha= 0.1, n_estimators=50, max_depth=8, learning_rate=0.1, gamma=0)
XGB2.fit(processed_x_train,y_train)
y_pred=XGB2.predict(processed_x_test)#testing

acc = accuracy_score(y_test,y_pred)
pr = precision_score(y_test,y_pred)
re = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print('Accuracy Score: ',acc,'\nPrecision:', pr,'\nRecall Score:', re,'\nF1 Score', f1, '\nConfusion Matrix: \n', cm)
print(classification_report(y_test,y_pred))

Accuracy Score:  0.8559782608695652 
Precision: 0.68 
Recall Score: 0.27419354838709675 
F1 Score 0.39080459770114945 
Confusion Matrix: 
 [[298   8]
 [ 45  17]]
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       306
           1       0.68      0.27      0.39        62

    accuracy                           0.86       368
   macro avg       0.77      0.62      0.65       368
weighted avg       0.84      0.86      0.83       368

