### Reading the csv data file and creating a data-frame called turnover

In [5]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
import precision_recall_cutoff # Calling .py function

# Where the dataset lives in s3
bucket_name = 'gabriel-predictive-analytics'
file_key = "turnover.csv"

# Resolving the bucket and the object handle for the file
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

# Fetching the object and taking the 'Body' entry of the response
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file into the turnover data-frame and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


### Using the get_dummies function from the pandas library to create dummy variables from the multi-category variables (sales and salary)

In [6]:
## Changing sales to dummy variables (the original sales column is replaced by its dummies)
# NOTE: one of the generated dummy columns is itself named 'sales', so re-running this
# cell on its own would encode that 0/1 dummy instead of the original text column.
sales_dummies = pd.get_dummies(turnover['sales'])
turnover = pd.concat([turnover.drop(columns = ['sales']), sales_dummies], axis = 1)

## Changing salary to dummy variables (the original salary column is kept next to its dummies)
salary_dummies = pd.get_dummies(turnover['salary'])
turnover = pd.concat([turnover, salary_dummies], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [7]:
## Creating interactions/features from the decision tree
# Each interaction is a 0/1 flag that is 1 only when all three of its conditions hold.

# interaction 1
satisfaction_at_most_0465 = turnover['satisfaction_level'] <= 0.465
projects_at_most_2 = turnover['number_project'] <= 2.5
evaluation_at_most_0575 = turnover['last_evaluation'] <= 0.575
turnover['interaction_1'] = np.where(
    satisfaction_at_most_0465 & projects_at_most_2 & evaluation_at_most_0575, 1, 0)

# interaction 2
projects_at_least_3 = turnover['number_project'] >= 2.5
satisfaction_at_least_0115 = turnover['satisfaction_level'] >= 0.115
turnover['interaction_2'] = np.where(
    satisfaction_at_most_0465 & projects_at_least_3 & satisfaction_at_least_0115, 1, 0)

# interaction 3
satisfaction_above_0465 = turnover['satisfaction_level'] > 0.465
tenure_at_most_4 = turnover['time_spend_company'] <= 4.5
hours_at_most_290 = turnover['average_montly_hours'] <= 290.5
turnover['interaction_3'] = np.where(
    satisfaction_above_0465 & tenure_at_most_4 & hours_at_most_290, 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [8]:
# Defining the input variables: every column except the target (left) and the
# text salary column, whose information is already carried by its dummies
X = turnover.drop(columns = ['left', 'salary'])

# Defining the target variable
Y = turnover['left']

# Splitting the data (80% train / 20% test, stratified on the target).
# random_state is fixed so that a fresh Restart & Run All reproduces the same split,
# and therefore the same feature ranking and metrics in the cells that follow.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [10]:
## Defining the list to store results (one array of feature importances per run)
results = list()

for i in range(0,10):

    # Splitting the training data again; seeding with the iteration number gives a
    # different 80/20 split on every pass while keeping each pass repeatable
    X_training, X_testing, Y_training, Y_testing = train_test_split(X_train, Y_train, test_size = 0.2, stratify = Y_train, random_state = i)

    # Building Random Forest model (seeded as well, so the importances are reproducible)
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i).fit(X_training, Y_training)

    # Extracting feature importances
    results.append(RF.feature_importances_)

# Changing to a dataframe: one row per run, one column per feature the models were fitted on
results = pd.DataFrame(results, columns = X_train.columns)
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.192501,0.036546,0.127847,0.063569,0.082006,0.00937,0.000788,3.7e-05,0.000286,4.1e-05,0.000147,0.000375,8e-06,0.000116,8.8e-05,3.8e-05,9.6e-05,0.004775,0.005312,0.000968,0.198291,0.040313,0.236484
1,0.184913,0.043852,0.107929,0.065512,0.082596,0.011291,0.001075,2.9e-05,0.000252,4.2e-05,6.4e-05,0.000532,2e-05,4.4e-05,4.8e-05,0.000156,5.5e-05,0.006239,0.003896,0.000577,0.188828,0.039121,0.262929
2,0.17426,0.046249,0.103371,0.069599,0.091871,0.010589,0.000859,5.7e-05,0.000241,3.2e-05,0.000119,0.000545,1.3e-05,3.8e-05,5.9e-05,2.9e-05,9.1e-05,0.00687,0.00621,0.001572,0.197187,0.049201,0.240936
3,0.165645,0.039228,0.102424,0.065916,0.083,0.010616,0.000662,4.2e-05,0.000255,3.5e-05,0.000167,0.000327,7e-06,3.5e-05,5.1e-05,4e-05,3.9e-05,0.005113,0.00748,0.00125,0.198986,0.0511,0.267582
4,0.189009,0.044569,0.116502,0.062404,0.078105,0.009296,0.000911,7.3e-05,0.0002,1.3e-05,0.000144,0.000356,2.1e-05,4.7e-05,9e-05,3.4e-05,6.9e-05,0.004627,0.003677,0.000773,0.196049,0.040391,0.252639
5,0.179397,0.043516,0.116308,0.067521,0.078444,0.008566,0.000697,4.7e-05,0.000225,2.4e-05,0.00024,0.000499,7e-06,0.000127,5.3e-05,4.7e-05,7.6e-05,0.005806,0.006789,0.001434,0.193374,0.044472,0.25233
6,0.194688,0.039282,0.122036,0.068014,0.078881,0.007706,0.000962,3.4e-05,0.000362,2.3e-05,0.00017,0.000346,2e-05,6.7e-05,2.8e-05,7.1e-05,8.2e-05,0.006008,0.005562,0.000737,0.183496,0.036405,0.25502
7,0.183711,0.033607,0.106715,0.070022,0.083731,0.012396,0.00123,3.7e-05,0.000469,4.8e-05,0.000122,0.000308,1e-05,3.7e-05,8.1e-05,3.4e-05,5.1e-05,0.003395,0.006714,0.001155,0.191837,0.045021,0.259269
8,0.200778,0.038906,0.115835,0.059088,0.078324,0.009428,0.001259,2.3e-05,0.000214,0.000131,8.1e-05,0.000547,3e-06,2.8e-05,8.9e-05,3.7e-05,0.000113,0.005188,0.009223,0.00143,0.183826,0.046188,0.249261
9,0.191175,0.041698,0.114264,0.067117,0.089392,0.008923,0.000996,6.1e-05,0.000216,2.6e-05,5.9e-05,0.000271,1.1e-05,7.4e-05,0.000128,5.3e-05,5e-05,0.004383,0.00791,0.000933,0.199108,0.037741,0.235409


### Feature importance

In [11]:
# Averaging each feature's importance over all runs (column-wise mean)
average_importance = results.apply(np.mean, axis = 0)

# Putting the averages in a two-column table, most important feature first
results = pd.DataFrame({'Feature': average_importance.index, 'Importance': average_importance.values})
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.251186
20,interaction_1,0.193098
0,satisfaction_level,0.185608
2,number_project,0.113323
4,time_spend_company,0.082635
3,average_montly_hours,0.065876
21,interaction_2,0.042995
1,last_evaluation,0.040745
5,Work_accident,0.009818
18,low,0.006277


### Top 5 variables

In [13]:
# Keeping only the 5 features with the highest average importance.
# The list is defined once so train and test are guaranteed to use the same columns.
top_5_features = ['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']
X_train_1 = X_train[top_5_features]
X_test_1 = X_test[top_5_features]

# Building Random Forest Model (random_state fixed so the reported metrics are reproducible)
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train_1, Y_train)

# Predicting on test (second column of predict_proba = probability of left == 1)
rf_pred = rf_md.predict_proba(X_test_1)[:,1]

# Predicting the labels
# NOTE(review): the helper receives Y_test, so presumably the cutoff is chosen using the
# test labels, which would make the report below optimistic -- confirm against the helper.
rf_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, rf_pred)

# Printing classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.92      0.91      0.91       714

    accuracy                           0.96      3000
   macro avg       0.95      0.94      0.94      3000
weighted avg       0.96      0.96      0.96      3000



### Top 6 variables

In [14]:
# Keeping only the 6 features with the highest average importance.
# The list is defined once so train and test are guaranteed to use the same columns.
top_6_features = ['interaction_3', 'interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']
X_train_2 = X_train[top_6_features]
X_test_2 = X_test[top_6_features]

# Building Random Forest Model (random_state fixed so the reported metrics are reproducible)
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(X_train_2, Y_train)

# Predicting on test (second column of predict_proba = probability of left == 1)
rf_pred = rf_md.predict_proba(X_test_2)[:,1]

# Predicting the labels
# NOTE(review): the helper receives Y_test, so presumably the cutoff is chosen using the
# test labels, which would make the report below optimistic -- confirm against the helper.
rf_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, rf_pred)

# Printing classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2286
           1       0.95      0.89      0.92       714

    accuracy                           0.96      3000
   macro avg       0.96      0.94      0.95      3000
weighted avg       0.96      0.96      0.96      3000



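# Based on my results, I would use the second model (with the top 6 features) to predict employee turnover: both models reach 0.96 accuracy, but the second one has higher precision (0.95 vs. 0.92) and f1-score (0.92 vs. 0.91) for the employees who left, at the cost of slightly lower recall (0.89 vs. 0.91).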