In [3]:
# Importing the important libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE  # imblearn library can be installed using pip install imblearn
from imblearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [5]:
# Importing dataset and examining it
dataset = pd.read_csv("/content/drive/MyDrive/Datasets/ChurnPrediction.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
dataset.head()
dataset.shape
dataset.info()
dataset.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   PastEmployee             1470 non-null   object
 2   BusinessTravel           1470 non-null   object
 3   Department               1470 non-null   object
 4   DistanceFromHome         1470 non-null   int64 
 5   Education                1470 non-null   int64 
 6   EducationField           1470 non-null   object
 7   EnvironmentSatisfaction  1470 non-null   int64 
 8   Gender                   1470 non-null   object
 9   JobInvolvement           1470 non-null   int64 
 10  JobLevel                 1470 non-null   int64 
 11  JobRole                  1470 non-null   object
 12  JobSatisfaction          1470 non-null   int64 
 13  MaritalStatus            1470 non-null   object
 14  MonthlyIncome            1470 non-null  

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,9.192517,2.912925,2.721769,2.729932,2.063946,2.728571,6502.931293,2.693197,15.209524,3.153741,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,8.106864,1.024165,1.093082,0.711561,1.10694,1.102846,4707.956783,2.498009,3.659938,0.360824,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1009.0,0.0,11.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,2.0,2.0,2.0,2.0,1.0,2.0,2911.0,1.0,12.0,3.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,7.0,3.0,3.0,3.0,2.0,3.0,4919.0,2.0,14.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,14.0,4.0,4.0,3.0,3.0,4.0,8379.0,4.0,18.0,3.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,29.0,5.0,4.0,4.0,5.0,4.0,19999.0,9.0,25.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [6]:
# Converting Categorical features into Numerical features
dataset['PastEmployee'] = dataset['PastEmployee'].map({'Yes':1, 'No':0})
dataset['OverTime'] = dataset['OverTime'].map({'Yes':1, 'No':0})
dataset['Gender'] = dataset['Gender'].map({'Female':1, 'Male':0})
dataset['BusinessTravel'] = dataset['BusinessTravel'].map({'Non-Travel':0, 'Travel_Rarely':1, 'Travel_Frequently':2})
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   PastEmployee             1470 non-null   int64 
 2   BusinessTravel           1470 non-null   int64 
 3   Department               1470 non-null   object
 4   DistanceFromHome         1470 non-null   int64 
 5   Education                1470 non-null   int64 
 6   EducationField           1470 non-null   object
 7   EnvironmentSatisfaction  1470 non-null   int64 
 8   Gender                   1470 non-null   int64 
 9   JobInvolvement           1470 non-null   int64 
 10  JobLevel                 1470 non-null   int64 
 11  JobRole                  1470 non-null   object
 12  JobSatisfaction          1470 non-null   int64 
 13  MaritalStatus            1470 non-null   object
 14  MonthlyIncome            1470 non-null  

In [7]:
# Converting the categorical features into dummy columns.
categorical_features = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
final_data = pd.get_dummies(dataset, columns = categorical_features)

In [8]:
# Dividing dataset into label and feature sets (Selecting the target feature)
X = final_data.drop('PastEmployee', axis = 1)
Y = final_data['PastEmployee'] # Labels
print(type(X))
print(type(Y))
print(X.shape)
print(Y.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(1470, 43)
(1470,)


In [9]:
# Normalizing numerical features so that each feature has mean 0 and variance 1
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

In [10]:
# #####################################################
# Implementing Gradient Boost without feature selection
# Tuning the GradientBoost parameter 'n_estimators', 'learning_rate', 'max_depth' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', GradientBoostingClassifier(random_state=1, max_features="sqrt"))
    ])
grid_param = {'classification__n_estimators': [5,10,15,20], 'classification__learning_rate': [0.1,0.15,0.5,0.8],'classification__max_depth': [1,3,8,16,32]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)


In [11]:
# ################################
# Fitting the model to our dataset
gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

featimp = pd.Series(gd_sr.best_estimator_.named_steps["classification"].feature_importances_, index=list(X)).sort_values(ascending=False) # Getting feature importances list for the best model
print(featimp)


{'classification__learning_rate': 0.1, 'classification__max_depth': 1, 'classification__n_estimators': 15}
0.6502659574468085
OverTime                             0.432943
StockOptionLevel                     0.197706
JobLevel                             0.089552
Age                                  0.055608
JobSatisfaction                      0.051538
JobInvolvement                       0.046359
BusinessTravel                       0.038431
MaritalStatus_Single                 0.037649
MaritalStatus_Divorced               0.027287
Education                            0.022927
Gender                               0.000000
JobRole_Manager                      0.000000
EducationField_Medical               0.000000
EducationField_Other                 0.000000
EducationField_Technical Degree      0.000000
JobRole_Healthcare Representative    0.000000
JobRole_Human Resources              0.000000
JobRole_Laboratory Technician        0.000000
JobRole_Manufacturing Director       0.000000


In [12]:
# Feature Selection
# Selecting features with higher sifnificance and redefining feature set
X_ = dataset[['OverTime', 'StockOptionLevel', 'JobLevel', 'Age','JobSatisfaction','JobInvolvement']]

feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

In [13]:
# ###########################
# Implementing Gradient Boost
# Tuning the GradientBoost parameter 'n_estimators', 'learning_rate', 'max_depth' and implementing cross-validation using Grid Search
model = Pipeline([
        ('balancing', SMOTE(random_state = 101)),
        ('classification', GradientBoostingClassifier(random_state=1, max_features="sqrt"))
    ])
grid_param = {'classification__n_estimators': [10,15,20], 'classification__learning_rate': [0.1,0.15,0.5,0.8],'classification__max_depth': [1,3,8,16,32]}

gd_sr = GridSearchCV(estimator=model, param_grid=grid_param, scoring='recall', cv=5)

In [14]:
# Fitting the new model and calculating the final recall score
gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

{'classification__learning_rate': 0.8, 'classification__max_depth': 1, 'classification__n_estimators': 20}
0.6886524822695035


In [None]:
# Recall                    0.65   | Precison                   0.379
# Learning_rate             0.1    | Learning_rate              0.1
# max_depth                 1      | max_depth                  1
# n_estimators              15     | n_estimators               15

# Recall (Feat Selection)   0.68   | Precison (Feat Selection)  0.38
# Learning_rate             0.8    | Learning_rate              0.8
# max_depth                 1      | max_depth                  1
# n_estimators              20     | n_estimators               20