In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
import precision_recall_cutoff # Calling .py function
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

# Handle to S3 and to the bucket that holds the csv
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Key of the csv object inside the bucket
file_key = "turnover.csv"

# Fetch the object and pull the 'Body' entry out of the response
file_content_stream = bucket.Object(file_key).get().get('Body')

# Parse the csv into a DataFrame and preview the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Changing sales (the department column) to dummy variables. The text column
## is dropped first because one of the departments is itself called 'sales',
## so the dummy column re-uses that name.
## dtype = int keeps the indicators as 0/1 on every pandas version (newer
## releases return booleans by default).
## NOTE: run this cell once per freshly loaded `turnover`; running it again
## would re-encode the 'sales' dummy and append a second set of salary dummies.
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'], dtype = int)], axis = 1)

## Changing salary to dummy variables (high / low / medium). The text column
## itself is kept in the frame at this point.
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'], dtype = int)], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## Binary interaction features; per the original note the thresholds come
## from a decision tree. Each flag is 1 when all three conditions hold, else 0.
satisfaction = turnover['satisfaction_level']
projects = turnover['number_project']

# interaction 1: satisfaction <= 0.465, projects <= 2.5, last evaluation <= 0.575
rule_1 = satisfaction.le(0.465) & projects.le(2.5) & turnover['last_evaluation'].le(0.575)
turnover['interaction_1'] = np.where(rule_1, 1, 0)

# interaction 2: satisfaction within [0.115, 0.465] and projects >= 2.5
rule_2 = satisfaction.le(0.465) & projects.ge(2.5) & satisfaction.ge(0.115)
turnover['interaction_2'] = np.where(rule_2, 1, 0)

# interaction 3: satisfaction > 0.465, tenure <= 4.5, monthly hours <= 290.5
rule_3 = (
    satisfaction.gt(0.465)
    & turnover['time_spend_company'].le(4.5)
    & turnover['average_montly_hours'].le(290.5)
)
turnover['interaction_3'] = np.where(rule_3, 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [4]:
# Defining input and target variables. 'left' is the target; the text column
# 'salary' is dropped from the inputs.
X = turnover.drop(columns = ['left', 'salary'])
Y = turnover['left']

# Splitting the data: 80% train / 20% test, stratified on the target.
# The fixed random_state makes the split (and everything fitted on it)
# repeatable from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

# Min-max scaling the inputs.
# The scaler learns each column's min/max from the training rows only and the
# SAME scaler is then applied to the test rows. Refitting on the test set would
# scale the two sets differently and leak test information into preprocessing.
# Test values can therefore fall slightly outside [0, 1]; that is expected.
# The original row index is kept so the frames stay aligned with Y_train / Y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

In [5]:
# Recursive feature elimination with 3-fold cross-validation, using a
# logistic regression as the estimator; one feature is removed per step and
# at least two features are always kept
Logistic_auto_selection = RFECV(
    estimator = LogisticRegression(),
    step = 1,
    min_features_to_select = 2,
    cv = 3,
)
Logistic_auto_selection.fit(X_train, Y_train)

# Extracting the features that got selected
print(X_train.columns[Logistic_auto_selection.support_])

Index(['last_evaluation', 'number_project', 'average_montly_hours',
       'time_spend_company', 'Work_accident', 'promotion_last_5years', 'IT',
       'RandD', 'product_mng', 'high', 'low', 'interaction_1', 'interaction_2',
       'interaction_3'],
      dtype='object')


In [6]:
## Defining the input variables: the columns kept by the logistic-regression
## RFECV. They are read from the selector's support_ mask instead of being
## copied by hand from printed output, so train and test always use the same
## columns and the list cannot drift from the selection.
logit_features = X_train.columns[Logistic_auto_selection.support_]
X_train_1 = X_train[logit_features]
X_test_1 = X_test[logit_features]

# Building the logistic regression model
logit_md = LogisticRegression().fit(X_train_1, Y_train)

# Predicting on test: probability of the positive class (left = 1)
logit_preds = logit_md.predict_proba(X_test_1)[:, 1]

# Predicting labels from the probabilities with the project helper.
# NOTE(review): the helper is given the test labels; if it tunes the cutoff on
# them, the report below is optimistic -- confirm in precision_recall_cutoff.py
logit_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, logit_preds)

# Computing the classification report
print(classification_report(Y_test, logit_labels))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2286
           1       0.95      0.90      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.94      0.95      3000
weighted avg       0.97      0.97      0.97      3000



In [7]:
# Running RFE (3-fold cross-validated) with a shallow random forest as the
# estimator; one feature is removed per step and at least two are always kept.
# The forest is seeded so the selected feature set is the same on every run.
auto_feature_selection = RFECV(
    estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42),
    step = 1,
    min_features_to_select = 2,
    cv = 3,
).fit(X_train, Y_train)

# Extracting the features that got selected
print(X_train.columns[auto_feature_selection.support_])


Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'interaction_1', 'interaction_2', 'interaction_3'],
      dtype='object')


In [9]:
## Defining the input variables: the columns kept by the random-forest RFECV.
## They are read from the selector's support_ mask instead of being copied by
## hand from printed output, so train and test always use the same columns and
## the list cannot drift from the selection.
rf_features = X_train.columns[auto_feature_selection.support_]
X_train_2 = X_train[rf_features]
X_test_2 = X_test[rf_features]

# Random Forest model on the RFECV-selected features (seeded for repeatable results)
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train_2, Y_train)

# Predicting on test: probability of the positive class (left = 1)
RF_preds = RF.predict_proba(X_test_2)[:, 1]

# Predicting labels from the probabilities with the project helper.
# NOTE(review): the helper is given the test labels; if it tunes the cutoff on
# them, the report below is optimistic -- confirm in precision_recall_cutoff.py
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_preds)

# Computing the classification report
print(classification_report(Y_test, RF_labels))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2286
           1       0.96      0.92      0.94       714

    accuracy                           0.97      3000
   macro avg       0.97      0.95      0.96      3000
weighted avg       0.97      0.97      0.97      3000



In [None]:
# Based on my results, I would choose the Random Forest model to predict employee turnover:
# both models had very similar results, but the Random Forest needed fewer variables to achieve them.