In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

#model
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier # pip install xgboost

from imblearn.over_sampling import SMOTE # !pip install imblearn

#Using zscore method to remove outliers
from scipy.stats import zscore

In [16]:
# Read the raw table from the working directory and show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.shape

(1470, 35)

In [17]:
# Column names, non-null counts and dtypes: shows which columns are text (object) and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [18]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [22]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

## Exploratory Data Analysis

Difference between monthly rate and monthly income: https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset/discussion/139552
- Short answer: the monthly rate is an internal calculation; the monthly income is what the employee actually receives.

# Feature Engineering

<mark> Dropping constant and identifier columns. </mark>

In [14]:
# StandardHours, EmployeeCount and Over18 are presumably constant across employees
# (confirm with df.nunique()), and EmployeeNumber is a row identifier, so none of
# them carries signal for predicting attrition.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for already-dropped columns.
df = df.drop(
    columns=["StandardHours", "EmployeeNumber", "EmployeeCount", "Over18"],
    errors="ignore",
)

<mark> Encoding the categorical type data </mark>


In [15]:
# Create a label encoder object.
# The same instance is re-fitted for every column it encodes in the cells below,
# so after those cells run it only remembers the classes of the last column fitted.
le = LabelEncoder()

In [None]:
# Replace the text target with integer class codes. LabelEncoder numbers classes in
# sorted order, so "No" -> 0 and "Yes" -> 1 (assuming those are the only two values).
df["Attrition"] = le.fit(df["Attrition"]).transform(df["Attrition"])

In [None]:
# Integer-encode text columns that have at most two distinct values; text columns
# with more levels are left untouched here.
# NOTE(review): the [1:] slice skips only the first column, not the target; the target
# is already numeric at this point, so it is ignored by the dtype check anyway.
le_count = 0
for col in df.columns[1:]:
    is_text = df[col].dtype == 'object'
    if is_text and len(df[col].unique()) <= 2:
        df[col] = le.fit(df[col]).transform(df[col])
        le_count += 1
print('{} columns were label encoded.'.format(le_count))

In [None]:
# One-hot encode every remaining categorical column; drop_first removes one level
# per column so the resulting dummies are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Shape after encoding: the column count changes because get_dummies expands categorical columns.
df.shape

<mark>Handling imbalanced data</mark>

In [None]:
# Seeded so the synthetic minority samples, and every metric computed downstream,
# are reproducible across runs; 7 matches the random_state used for the
# train/test split and the tuned pipelines in this notebook.
over = SMOTE(random_state=7)

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='Attrition', errors='ignore')

In [None]:
# Target vector: the encoded attrition flag.
y = df.loc[:, "Attrition"]

In [None]:
# Oversample the minority class with SMOTE and replace X and y with the resampled data.
# NOTE(review): this runs before the train/test split further down, so synthetic rows
# interpolated from rows that later land in the test set can leak into training, and
# the test set itself contains synthetic rows. Consider splitting first and
# resampling only the training split.
X,y=over.fit_resample(X,y)

In [None]:
# Class balance after resampling, shown as shares (pie) and as counts (bars).
# Explicit axes are used instead of the pyplot state machine, and the Series is passed
# to countplot as x=...: seaborn 0.12+ treats the first positional argument as `data`,
# so the original positional call no longer plots one bar per class.
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 5))
y.value_counts().plot.pie(autopct='%1.1f%%', ax=pie_ax)
sns.countplot(x=y, ax=bar_ax)
y.value_counts()

<mark>Scaling the data</mark>

In [None]:
# Min-max scaling: every feature is mapped linearly onto the [0, 1] interval.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Rescale every feature, then rebuild a DataFrame so the column names are kept
# (fit_transform returns a plain array).
# NOTE(review): the scaler is fitted on all rows before the train/test split further
# down, so held-out rows influence the min/max applied to the training data.
# Consider fitting on the training split only and calling transform on the test split.
xd=scaler.fit_transform(X)
X=pd.DataFrame(xd,columns=X.columns)

# Modelling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

In [None]:
# Candidate algorithms to compare, as (display name, unfitted estimator) pairs.
# Every estimator with a stochastic component is seeded with the same random_state
# used for the train/test split and the tuned pipelines, so the comparison table is
# reproducible from run to run.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier(random_state=7)),
    # probability=True is required for predict_proba (used for the ROC AUC score)
    ('SVM', svm.SVC(probability=True, random_state=7)),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=7)),
    ('Gaussian NB', GaussianNB()),
    ('Adaboost', AdaBoostClassifier(random_state=7)),
    ("Gradientboost", GradientBoostingClassifier(random_state=7)),
    ("BaggingClassifier", BaggingClassifier(random_state=7)),
    ("ExtremeGradientBoost", XGBClassifier(random_state=7)),
]

In [None]:
# Per-model score collections, kept alongside the summary table.
train_acc_results = []
test_acc_results = []
names = []

# One unshuffled 10-fold splitter shared by every model, so all of them are
# cross-validated on exactly the same folds.
kfold = KFold(n_splits=10)

# Summary table columns. 'Train Accuracy Mean' holds the mean cross-validated
# accuracy measured on the training split.
col = ['Algorithm','Train Accuracy Mean', 'Test Accuracy', "Test ROC_Score"]

# Rows are collected in a list and the DataFrame is built once after the loop;
# this avoids growing the frame row by row and gives the score columns numeric dtypes.
rows = []
for name, model in models:
    # cross-validated accuracy on the training split (cross_val_score fits clones of model)
    cv_acc_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    train_acc_results.append(cv_acc_results)

    # hold-out accuracy after fitting on the whole training split
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, pred)
    test_acc_results.append(test_accuracy)

    # ROC AUC from the predicted probability of the positive class (column 1)
    roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    names.append(name)
    rows.append([name, round(cv_acc_results.mean()*100, 2), round(test_accuracy*100, 2), roc])

df_results = pd.DataFrame(rows, columns=col)

In [None]:
# Rank the candidates by hold-out ROC AUC, best first.
df_results.sort_values(by='Test ROC_Score', ascending=False)

> Random Forest, Gradient Boosting, SVM and Extreme Gradient Boosting (XGBoost) perform well, so let's tune the hyperparameters of these models to see which one performs best.

## Performance

In [None]:
#To evaluate performances of all the models
def performance(p,ytest,m,xtest):
    """Print hold-out metrics for a fitted classifier.

    Parameters
    ----------
    p : array-like
        Class labels predicted for ``xtest``.
    ytest : array-like
        True class labels for ``xtest``.
    m : fitted estimator
        Must expose ``predict_proba``; column 1 is used as the positive-class score.
    xtest : array-like
        Feature rows that ``p`` and ``ytest`` refer to.

    Returns
    -------
    None
        Everything is printed: accuracy, ROC AUC, confusion matrix and
        classification report.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is symmetric,
    # but passing them swapped transposes the confusion matrix and exchanges precision
    # with recall in the report, so the true labels always go first here.
    print('Accuracy',np.round(accuracy_score(ytest,p),4))
    print('AUC_ROC Score',np.round(roc_auc_score(ytest,m.predict_proba(xtest)[:,1]),4))
    print('Confusion Matrix')
    print(confusion_matrix(ytest,p))
    print('Classification Report:')
    print(classification_report(ytest,p))

## Random Forest

In [None]:
# Random-forest search space: 5 * 2 * 11 * 3 = 330 parameter combinations.
params = dict(
    n_estimators=list(range(100, 501, 100)),
    criterion=['gini', 'entropy'],
    max_depth=[None, *range(1, 11)],
    max_features=["sqrt", "log2", None],
)

In [None]:
# Exhaustive 5-fold search over the random-forest grid defined in params.
g = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params, cv=5)

In [None]:
# Run the search on the training split (one fit per parameter combination and fold).
g.fit(X=X_train, y=y_train)

In [None]:
# Report the winning configuration and its mean cross-validated score.
print(
    f'Best Params: {g.best_params_}',
    f'Best Estimator: {g.best_estimator_}',
    f'Best Score: {g.best_score_}',
    sep='\n',
)

In [None]:
# Evaluate the configuration the grid search actually selected, rather than
# re-typing hyperparameters by hand (which goes stale whenever the search is re-run).
# With GridSearchCV's default refit=True, best_estimator_ is already refitted on the
# whole training split, so no extra fit is needed.
m = g.best_estimator_
prediction = m.predict(X_test)

In [None]:
# Hold-out metrics for the random forest.
performance(p=prediction, ytest=y_test, m=m, xtest=X_test)

## Gradient Boosting

In [None]:
# Gradient-boosting search space: 5 * 3 * 2 * 10 = 300 parameter combinations.
params = dict(
    n_estimators=list(range(100, 501, 100)),
    learning_rate=[0.001, 0.01, 0.10],
    subsample=[0.5, 1],
    max_depth=list(range(1, 11)),
)

In [None]:
# Exhaustive 5-fold search over the gradient-boosting grid defined in params.
g = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=params, cv=5)

In [None]:
# Run the search on the training split. The split variables are named X_train and
# y_train; the previous xtrain/ytrain names were never defined and raised NameError.
g.fit(X_train, y_train)

In [None]:
# Report the winning gradient-boosting configuration and its mean cross-validated score.
print('\n'.join([
    f'Best Params: {g.best_params_}',
    f'Best Estimator: {g.best_estimator_}',
    f'Best Score: {g.best_score_}',
]))

## Modelling

https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

https://towardsdatascience.com/will-your-employee-leave-a-machine-learning-model-8484c2a6663e

In [None]:
# One single-step pipeline per ensemble method, keyed by a short alias.
# make_pipeline names the step after the lower-cased class name, which is the prefix
# the hyperparameter grids use (e.g. 'randomforestclassifier__...').
pipelines = {
    alias: make_pipeline(estimator_cls(random_state=7))
    for alias, estimator_cls in (
        ('rf', RandomForestClassifier),
        ('gb', GradientBoostingClassifier),
        ('ab', AdaBoostClassifier),
        ('bc', BaggingClassifier),
    )
}

In [None]:
# Random-forest grid for the pipeline search.
# 'auto' is intentionally absent from max_features: for classifiers it was only an
# alias of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, and on
# current versions it makes every fit with that candidate fail.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [100, 200],
    'randomforestclassifier__max_features' : ['sqrt', 'log2'],
    'randomforestclassifier__min_samples_leaf' : [1, 3, 5, 10]
    }

In [None]:
# Gradient-boosting grid for the pipeline search (2 * 3 * 3 * 3 = 54 combinations).
gb_hyperparameters = dict(
    gradientboostingclassifier__n_estimators=[100, 200],
    gradientboostingclassifier__learning_rate=[0.001, 0.01, 0.1],
    gradientboostingclassifier__max_depth=[3, 7, 9],
    gradientboostingclassifier__subsample=[0.5, 0.7, 1.0],
)

In [None]:
# AdaBoost grid for the pipeline search (2 * 4 = 8 combinations).
ab_hyperparameters = dict(
    adaboostclassifier__n_estimators=[100, 200],
    adaboostclassifier__learning_rate=[0.05, 0.1, 0.2, 1],
)

In [None]:
# Bagging grid for the pipeline search: only the ensemble size is tuned.
bc_hyperparameters = dict(baggingclassifier__n_estimators=[100, 200])

In [None]:
# Grid lookup keyed by the same aliases as the pipelines dict.
hyperparameters = dict(
    rf=rf_hyperparameters,
    gb=gb_hyperparameters,
    ab=ab_hyperparameters,
    bc=bc_hyperparameters,
)

In [None]:
# Tune each pipeline with a 10-fold, accuracy-scored grid search on the training
# split and keep the fitted search object under the pipeline's alias.
fitted_models = {}
for name in pipelines:
    search = GridSearchCV(
        estimator=pipelines[name],
        param_grid=hyperparameters[name],
        scoring="accuracy",
        cv=10,
        n_jobs=-1,  # use every available core
    )
    search.fit(X_train, y_train)

    # winning pipeline, its parameters and its mean cross-validated accuracy
    print(f'{name}: {search.best_estimator_}')
    print(f'{name}: {search.best_params_}')
    print(f'{name}: {search.best_score_}')
    print("\n")

    fitted_models[name] = search

In [None]:
# Hold-out accuracy, precision and recall for every tuned model.
for name, model in fitted_models.items():
    print('Results for:', name)

    # obtain predictions (the fitted search object predicts with its best estimator)
    pred = model.predict(X_test)

    # accuracy score
    print('Accuracy:', accuracy_score(y_test, pred))

    # Confusion matrix for this model: rows are true classes, columns are predicted
    # classes, so cm[1][1] = TP, cm[0][1] = FP and cm[1][0] = FN. It was previously
    # never computed, which made the lines below fail with NameError.
    cm = confusion_matrix(y_test, pred)

    # precision = TP / (FP + TP)
    precision = cm[1][1]/(cm[0][1]+cm[1][1])
    print('Precision:', precision)

    # recall = TP / (FN + TP)
    recall = cm[1][1]/(cm[1][0]+cm[1][1])
    print('Recall:', recall)
    print("\n")