# Predicting the Risk of Employee's Attrition

### 1. Import Libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

#model
from sklearn import svm
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier # pip install xgboost

from imblearn.over_sampling import SMOTE # !pip install imblearn

#Using zscore method to remove outliers
from scipy.stats import zscore

### 2. Import Dataset

In [5]:
#https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset

df = pd.read_csv("dataset.csv") 
df.shape #1470 rows and 35 columns

(1470, 35)

### 3. Exploratory Data Analysis

###### check null

In [6]:
df.isnull().sum() #no null value

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

###### check duplicate

In [7]:
df.duplicated().sum() #no duplicated records

0

# <mark> Please ignore the codes below first </mark>

### Feature Engineering

###### Dropping columns with constant value and identifier columns

In [11]:
df.drop(columns=["StandardHours", "EmployeeNumber", "EmployeeCount","Over18"], inplace=True) 
#drop because values are the same for all employees, hence, redundanct for accruracy

In [12]:
df.shape #35 - 4 = 31 remaining cols 

(1470, 31)

###### Encoding the categorical type data 


In [13]:
le = LabelEncoder()

In [14]:
# Label Encoding will be used for columns with 2 or less unique values
le_count = 0
for col in df.columns[1:]:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            le_count += 1
print('{} columns were label encoded.'.format(le_count))

3 columns were label encoded.


In [15]:
# convert rest of categorical variable into dummy
df = pd.get_dummies(df, drop_first=True)

In [16]:
# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 5))
HR_col = list(df.columns)
HR_col.remove('Attrition')
for col in HR_col:
    df[col] = df[col].astype(float)
    df[[col]] = scaler.fit_transform(df[[col]])
df['Attrition'] = pd.to_numeric(df['Attrition'], downcast='float')
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single
0,2.738095,1.0,3.579098,0.0,1.25,1.666667,0.0,4.571429,3.333333,1.25,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
1,3.690476,0.0,0.6335,1.25,0.0,3.333333,5.0,2.214286,1.666667,1.25,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0
2,2.261905,1.0,4.549034,0.178571,1.25,5.0,5.0,4.428571,1.666667,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,1.785714,0.0,4.617037,0.357143,3.75,5.0,0.0,1.857143,3.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0
4,1.071429,0.0,1.750179,0.178571,0.0,0.0,5.0,0.714286,3.333333,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   Attrition                          1470 non-null   float32
 2   DailyRate                          1470 non-null   float64
 3   DistanceFromHome                   1470 non-null   float64
 4   Education                          1470 non-null   float64
 5   EnvironmentSatisfaction            1470 non-null   float64
 6   Gender                             1470 non-null   float64
 7   HourlyRate                         1470 non-null   float64
 8   JobInvolvement                     1470 non-null   float64
 9   JobLevel                           1470 non-null   float64
 10  JobSatisfaction                    1470 non-null   float64
 11  MonthlyIncome                      1470 non-null   float

### Generate training set and test set with SMOTE

In [18]:
# Separate input features and target
y = df["Attrition"]
X = df.loc[:, df.columns != 'Attrition']

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)


sm = SMOTE(random_state = 0)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [19]:
y_train.value_counts()

0.0    913
1.0    913
Name: Attrition, dtype: int64

### Handling imbalance data with GAN

In [37]:
# setting up testing and training sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=123)

In [38]:
import torch.nn as nn
import torch

process = X_train1.copy()
process['target'] = y_train1.copy()

In [39]:
process

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,target
643,2.857143,4.162491,0.357143,2.50,3.333333,0.0,4.642857,5.000000,1.25,5.000000,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
1214,3.095238,2.931281,0.178571,2.50,3.333333,0.0,4.714286,5.000000,2.50,5.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
1294,2.738095,1.234789,0.714286,2.50,1.666667,5.0,3.928571,5.000000,1.25,1.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
735,3.571429,0.626342,0.892857,2.50,0.000000,5.0,4.785714,1.666667,1.25,3.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
186,2.619048,3.174660,0.535714,0.00,5.000000,0.0,1.142857,3.333333,5.00,3.333333,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,1.190476,2.734431,0.714286,2.50,5.000000,5.0,3.857143,3.333333,1.25,0.000000,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0
1122,1.309524,4.538296,0.357143,0.00,1.666667,5.0,4.071429,3.333333,0.00,0.000000,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
1346,3.214286,1.624911,4.285714,1.25,1.666667,0.0,4.500000,1.666667,1.25,5.000000,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
1406,4.285714,0.196850,1.607143,2.50,3.333333,0.0,3.357143,3.333333,1.25,0.000000,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [40]:
process.target.value_counts()

0.0    913
1.0    189
Name: target, dtype: int64

In [41]:
X_for_generate = process.query("target == 1").iloc[:,:-1].values
X_non_default = process.query('target == 0').iloc[:,:-1].values
X_for_generate = torch.tensor(X_for_generate).type(torch.FloatTensor)

n_generate = X_non_default.shape[0] - X_for_generate.shape[0]

In [46]:
n_generate

724

In [45]:
BATCH_SIZE = 50
LR_G = 0.0001           
LR_D = 0.0001           
N_IDEAS = 20            # Initial idea for G generator (random inspiration)


G = nn.Sequential(                      # generator
    nn.Linear(N_IDEAS, 128),            
    nn.ReLU(),
    nn.Linear(128, 44),     
)


D = nn.Sequential(                      # discriminator
    nn.Linear(44, 128),     
    nn.ReLU(),
    nn.Linear(128, 1),
    nn.Sigmoid(),                       # 0-1
)

# optimizator
opt_D = torch.optim.Adam(D.parameters(),lr=LR_D)
opt_G = torch.optim.Adam(G.parameters(),lr=LR_G)


In [47]:
# GAN
for step in range(3000):
    # Randomly select BATCH real samples with label 1
    chosen_data = np.random.choice((X_for_generate.shape[0]),size=(BATCH_SIZE),replace=False)
    artist_paintings = X_for_generate[chosen_data,:]
    # Generate fake samples using generators
    G_ideas = torch.randn(BATCH_SIZE, N_IDEAS, requires_grad=True)           
    G_paintings = G(G_ideas)                  
    
    prob_artist1 = D(G_paintings)               
    # loss
    G_loss = torch.mean(torch.log(1. - prob_artist1))
    opt_G.zero_grad()
    G_loss.backward()
    opt_G.step()
    
    prob_artist0 = D(artist_paintings)
    prob_artist1 = D(G_paintings.detach())
    # loss
    D_loss = - torch.mean(torch.log(prob_artist0) + torch.log(1. - prob_artist1))
    opt_D.zero_grad()
    D_loss.backward(retain_graph=True)
    opt_D.step()

In [49]:
#GAN fake data
fake_data = G(torch.randn(n_generate,N_IDEAS)).detach().numpy()

X_default = pd.DataFrame(np.concatenate([X_for_generate,fake_data]))
X_default['target'] = 1

X_non_default = pd.DataFrame(X_non_default)
X_non_default['target'] = 0
train_data_gan = pd.concat([X_default,X_non_default])

X_gan = train_data_gan.iloc[:,:-1]
y_gan = train_data_gan.iloc[:,-1]

print(X_gan.shape,y_gan.shape)

(1826, 44) (1826,)


In [51]:
X_gan

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,1.428571,0.830351,4.464286,3.75,3.333333,0.0,1.571429,1.666667,1.25,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
1,1.904762,2.462420,4.107143,3.75,0.000000,0.0,0.714286,1.666667,1.25,1.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
2,1.309524,2.841804,3.035714,0.00,3.333333,5.0,4.000000,1.666667,0.00,5.000000,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0
3,0.119048,0.719399,0.178571,2.50,1.666667,5.0,1.214286,1.666667,0.00,5.000000,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,1.785714,2.544739,2.321429,2.50,3.333333,5.0,2.000000,3.333333,0.00,5.000000,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,1.190476,2.734431,0.714286,2.50,5.000000,5.0,3.857143,3.333333,1.25,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
909,1.309524,4.538296,0.357143,0.00,1.666667,5.0,4.071429,3.333333,0.00,0.000000,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
910,3.214286,1.624911,4.285714,1.25,1.666667,0.0,4.500000,1.666667,1.25,5.000000,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0
911,4.285714,0.196850,1.607143,2.50,3.333333,0.0,3.357143,3.333333,1.25,0.000000,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0


In [1]:
Y_gan.value_counts()

NameError: name 'Y_gan' is not defined

### Modelling

In [None]:
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=7)

In [31]:
# selection of algorithms to consider and set performance measure
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('SVM', svm.SVC(probability=True)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('Decision Tree Classifier',DecisionTreeClassifier()))
models.append(('Gaussian NB', GaussianNB()))
models.append(('Adaboost', AdaBoostClassifier()))
models.append(("Gradientboost", GradientBoostingClassifier()))
models.append(("BaggingClassifier", BaggingClassifier()))
models.append(("ExtremeGradientBoost", XGBClassifier()))
models.append(("ExtraTreesClassifier", ExtraTreesClassifier()))

In [32]:
train_acc_results = []
test_acc_results = []
#roc_auc_score = []
names = []

# set table to table to populate with performance results
col = ['Algorithm','Train Accuracy Mean', 'Test Accuracy', "Test ROC Score"]
df_results = pd.DataFrame(columns=col)
i = 0
# evaluate each model using cross-validation
for name, model in models:
    kfold = KFold(n_splits=10)  # 10-fold cross-validation

    # cv accuracy scoring
    cv_acc_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    train_acc_results.append(cv_acc_results)
    
    #test accuracy scoring
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, pred)
    test_acc_results.append(test_accuracy)
    
    #roc auc score
    roc = roc_auc_score(y_test,model.predict_proba(X_test)[:,1])
    #roc_auc_score.append(roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))
    
    names.append(name)
    df_results.loc[i] = [name, round(cv_acc_results.mean()*100, 2), round(test_accuracy*100,2), roc]
    
    i += 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [33]:
df_results.sort_values(by=['Test Accuracy'], ascending=False)

Unnamed: 0,Algorithm,Train Accuracy Mean,Test Accuracy,Test ROC Score
7,Gradientboost,91.31,90.76,0.812174
10,ExtraTreesClassifier,96.45,89.67,0.816569
1,Random Forest,93.99,88.32,0.80599
9,ExtremeGradientBoost,92.73,88.04,0.826758
8,BaggingClassifier,90.98,87.5,0.776563
6,Adaboost,89.23,85.33,0.822721
2,SVM,90.59,81.79,0.77819
4,Decision Tree Classifier,85.56,80.43,0.675
0,Logistic Regression,76.07,74.73,0.829492
3,KNN,79.94,60.6,0.645898


In [52]:
train_acc_results = []
test_acc_results = []
#roc_auc_score = []
names = []

# set table to table to populate with performance results
col = ['Algorithm','Train Accuracy Mean', 'Test Accuracy', "Test ROC Score"]
df_results = pd.DataFrame(columns=col)
i = 0
# evaluate each model using cross-validation
for name, model in models:
    kfold = KFold(n_splits=10)  # 10-fold cross-validation

    # cv accuracy scoring
    cv_acc_results = cross_val_score(model, X_gan, y_gan, cv=kfold, scoring='accuracy')
    train_acc_results.append(cv_acc_results)
    
    #test accuracy scoring
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, pred)
    test_acc_results.append(test_accuracy)
    
    #roc auc score
    roc = roc_auc_score(y_test,model.predict_proba(X_test)[:,1])
    #roc_auc_score.append(roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))
    
    names.append(name)
    df_results.loc[i] = [name, round(cv_acc_results.mean()*100, 2), round(test_accuracy*100,2), roc]
    
    i += 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [53]:
df_results.sort_values(by=['Test Accuracy'], ascending=False)

Unnamed: 0,Algorithm,Train Accuracy Mean,Test Accuracy,Test ROC Score
7,Gradientboost,87.42,90.76,0.811654
1,Random Forest,88.79,89.67,0.811849
10,ExtraTreesClassifier,89.12,89.4,0.817578
9,ExtremeGradientBoost,87.2,88.04,0.826758
8,BaggingClassifier,87.64,87.23,0.791829
6,Adaboost,86.49,85.33,0.822721
2,SVM,88.41,81.79,0.77819
4,Decision Tree Classifier,81.16,78.8,0.701042
0,Logistic Regression,87.2,74.73,0.829492
3,KNN,87.96,60.6,0.645898


## Performance

In [None]:
#To evakuate performances of all the models
def performance(p,ytest,m,xtest):
    print('Accuracy',np.round(accuracy_score(p,ytest),4))
    print('AUC_ROC Score',np.round(roc_auc_score(ytest,m.predict_proba(xtest)[:,1]),4))
    print('Confusion Matrix')
    print(confusion_matrix(p,ytest))
    print('Classification Report:')
    print(classification_report(p,ytest))

## Random Forest

In [None]:
params={'n_estimators':[100, 200, 300, 400, 500],
            'criterion':['gini','entropy'],
            'max_depth':[None,1,2,3,4,5,6,7,8,9,10],
           'max_features':["sqrt", "log2", None]}

In [None]:
g=GridSearchCV(RandomForestClassifier(),params,cv=5)

In [None]:
g.fit(X_train, y_train)

In [None]:
print(f'Best Params: {g.best_params_}')
print(f'Best Estimator: {g.best_estimator_}')
print(f'Best Score: {g.best_score_}')

In [None]:
m=RandomForestClassifier(max_features='log2', n_estimators=500)
m.fit(X_train,y_train)
prediction=m.predict(X_test)

In [None]:
performance(prediction,y_test,m,X_test)

## Gradient Boosting

In [None]:
params={'n_estimators':[100,200,300,400,500],
      'learning_rate':[0.001,0.01,0.10,],
      'subsample':[0.5,1],
      'max_depth':[1,2,3,4,5,6,7,8,9,10]}

In [None]:
g=GridSearchCV(GradientBoostingClassifier(),params,cv=5)

In [None]:
g.fit(xtrain,ytrain)

In [None]:
print(f'Best Params: {g.best_params_}')
print(f'Best Estimator: {g.best_estimator_}')
print(f'Best Score: {g.best_score_}')

In [None]:
fitted_models = {}
for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, 
                         hyperparameters[name], 
                         cv=10, 
                         n_jobs=-1, scoring="accuracy")
    
    model.fit(X_train, y_train)
    
    print(f'{name}: {model.best_estimator_}')
    print(f'{name}: {model.best_params_}')
    print(f'{name}: {model.best_score_}')
    print("\n")
    
    
    fitted_models[name] = model

In [None]:
for name, model in fitted_models.items():
    print('Results for:', name)
    
    # obtain predictions
    pred = fitted_models[name].predict(X_test)

    # accuracy score
    print('Accuracy:', accuracy_score(y_test, pred))
    
    # precision
    precision = cm[1][1]/(cm[0][1]+cm[1][1])
    print('Precision:', precision)
    
    # recall
    recall = cm[1][1]/(cm[1][0]+cm[1][1])
    print('Recall:', recall)
    print("\n")