# __Employee Performance Analysis__
## __INX Future Inc.__
### __Predict Model__

In [1]:
# general purpose libraries
import os
from dotenv import load_dotenv

# data loading and wrangling libraries for EDA
import pandas as pd
import numpy as np

# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# data balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# data splitting
from sklearn.model_selection import train_test_split

# ml models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Cross validation
from sklearn.model_selection import KFold, cross_val_score

# ML model evaluation
from sklearn.metrics import accuracy_score, classification_report

# warnings
import warnings
warnings.filterwarnings('ignore')

# load the dot env file that contains the path to data file for data privacy
dotenv_path = os.getcwd()+'\\local.env'
load_dotenv(dotenv_path=dotenv_path)

True

## Import Dataset

In [2]:
main_df = pd.read_excel(os.getenv('data'))
main_df.head(3)

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4


## Preprocess the data for training and prediction

In [3]:
# lets prepare our dataset

ml_df = main_df.drop('EmpNumber', axis=1)
ml_df.head(2)

cat_df = ml_df.select_dtypes(['object','bool'])
num_df = ml_df.select_dtypes(['int64', 'float64'])

# OneHotEncoding of Categorical features
cat_df = pd.get_dummies(cat_df)

# concatonate the categorical and numberical datasets to make one prepared dataset for RandomForest model
ml_df = pd.concat([cat_df, num_df], axis=1)

# Splitting the dataset into X=features and y=target 
X = ml_df.iloc[:, :-1]
y = ml_df.PerformanceRating

# now lets perform the train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.75, random_state=19)

## Balancing the dataset

In [4]:
oversampling = SMOTE()
undersampling = RandomUnderSampler()

steps = (('o', oversampling), ('u', undersampling))
pipeline = Pipeline(steps=steps)

X_bal, y_bal = pipeline.fit_resample(X_train, y_train)

## Training and prediction using GradientBoostClassifier

In [28]:
# Define model parameter dict
gb_params ={
    'n_estimators': 450,
    'max_features': 0.4,
    'learning_rate' : 0.01,
    'max_depth': 12,
    'min_samples_leaf': 2,
    'max_features' : 'auto',
    'random_state' : 0
}

model = GradientBoostingClassifier(**gb_params)

# Train the model
model.fit(X_bal, y_bal)

y_predict_test = model.predict(X_test)

print("Test Accuracy",accuracy_score(y_test, y_predict_test))
print(classification_report(y_test, y_predict_test))

Test Accuracy 0.9033333333333333
              precision    recall  f1-score   support

           2       0.85      0.83      0.84       149
           3       0.94      0.94      0.94       656
           4       0.74      0.79      0.77        95

    accuracy                           0.90       900
   macro avg       0.84      0.85      0.85       900
weighted avg       0.90      0.90      0.90       900



In [29]:
gbprediction = pd.DataFrame({'predicted_values':y_predict_test, 'actual_values':y_test})
gbprediction

Unnamed: 0,predicted_values,actual_values
164,2,2
1137,2,2
598,3,3
992,4,4
324,3,3
...,...,...
607,2,2
689,3,3
925,3,3
242,4,3


## Training with XGBoost Classifier

In [30]:
params = {
    'subsample': 1, 
    'n_estimators': 900, 
    'min_samples_leaf': 2, 
    'max_features': 'auto', 
    'max_depth': 11, 
    'learning_rate': 0.1, 
    'criterion': 'friedman_mse',
    'objective' : 'multi:softprob',
    'n_jobs': -1,
    'eval_metric': 'merror',
    'use_label_encoder': True
}

model = XGBClassifier(**params)

# Train the model
model.fit(X_bal, y_bal)

y_predict_test = model.predict(X_test)

print("Test Accuracy",accuracy_score(y_test, y_predict_test))
print(classification_report(y_test, y_predict_test))

Parameters: { "criterion", "max_features", "min_samples_leaf" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Test Accuracy 0.9133333333333333
              precision    recall  f1-score   support

           2       0.84      0.82      0.83       149
           3       0.94      0.95      0.94       656
           4       0.84      0.80      0.82        95

    accuracy                           0.91       900
   macro avg       0.87      0.86      0.87       900
weighted avg       0.91      0.91      0.91       900



In [31]:
xgbprediction = pd.DataFrame({'predicted_values':y_predict_test, 'actual_values':y_test})
xgbprediction

Unnamed: 0,predicted_values,actual_values
164,2,2
1137,2,2
598,3,3
992,3,4
324,3,3
...,...,...
607,2,2
689,3,3
925,3,3
242,3,3
