In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from precision_recall_cutoff import precision_recall_cutoff

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

#defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

#defining the file to be read from s3 bucket
file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#reading the datafile
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
#changing sales and salary to dummies
turnover = pd.concat([turnover.drop(columns=['sales','salary'],axis=1), pd.get_dummies(turnover[['sales','salary']])],axis=1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
#creating new features from InClass_10
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] > 2.5) & 
                                     (turnover['satisfaction_level'] >= 0.115), 1, 0)

turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & 
                                     (turnover['number_project'] <= 2.5) & 
                                     (turnover['last_evaluation'] <= 0.575), 1, 0)

turnover['interaction_3'] = np.where((turnover['satisfaction_level'] > 0.465) & 
                                     (turnover['time_spend_company'] <= 4.5) & 
                                     (turnover['average_montly_hours'] <= 290.5), 1, 0)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [4]:
#defining input and target
x=turnover.drop(columns=['left'],axis=1)
y=turnover['left']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)

In [8]:
#defining the empty data frame to store importances
importance = pd.DataFrame(columns = x_train.columns)

for i in range(0, 10):

    #splitting the data
    x_training, x_testing, y_training, y_testing = train_test_split(x_train, y_train, test_size = 0.2, stratify = y_train)
    
    #building the model
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_training, y_training)

    #extracting the feature importance
    importance.loc[i] = rf_md.feature_importances_.T

#average RF feature importance
pd.DataFrame({'Feature':importance.mean().index, 
              'Importance_Score':importance.mean().values}).sort_values(by = 'Importance_Score',
                                                                        ascending = False).reset_index(drop = True)

Unnamed: 0,Feature,Importance_Score
0,interaction_3,0.256014
1,interaction_2,0.189461
2,satisfaction_level,0.183658
3,number_project,0.107635
4,time_spend_company,0.081976
5,average_montly_hours,0.071546
6,interaction_1,0.041865
7,last_evaluation,0.041576
8,Work_accident,0.011212
9,salary_low,0.006967


**Random Forest w/ Top 5 Features**

In [9]:
#top 5 features
x_train_5 = x_train[['interaction_3', 'interaction_2', 'satisfaction_level', 'number_project', 'time_spend_company']]
x_test_5 = x_test[['interaction_3', 'interaction_2', 'satisfaction_level', 'number_project', 'time_spend_company']]

#building the model
rf_md=RandomForestClassifier(n_estimators = 500, max_depth=3).fit(x_train_5,y_train)

#predict on test
rf_pred = rf_md.predict_proba(x_test_5)[:,1]

#changing likelihoods to labels
rf_label = precision_recall_cutoff(y_test, rf_pred)

#print classification report
print(classification_report(y_test,rf_label))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2286
           1       0.89      0.90      0.89       714

    accuracy                           0.95      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.95      0.95      0.95      3000



**Random Forest w/ Top 6 Features**

In [10]:
#top 6 features
x_train_6 = x_train[['interaction_3', 'interaction_2', 'satisfaction_level', 'number_project', 'time_spend_company','average_montly_hours']]
x_test_6 = x_test[['interaction_3', 'interaction_2', 'satisfaction_level', 'number_project', 'time_spend_company','average_montly_hours']]

#building the model
rf_md=RandomForestClassifier(n_estimators = 500, max_depth=3).fit(x_train_6,y_train)

#predict on test
rf_pred = rf_md.predict_proba(x_test_6)[:,1]

#changing likelihoods to labels
rf_label = precision_recall_cutoff(y_test, rf_pred)

#print classification report
print(classification_report(y_test,rf_label))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.90      0.91      0.91       714

    accuracy                           0.96      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.96      0.96      0.96      3000



Using the results from part 6 and 7, we would use the Random Forest Classifier model with six predictors to predict left because of slightly increased model performance.