In [6]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from precision_recall_cutoff import precision_recall_cutoff

from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

#where the data lives: the bucket name and the key of the csv inside it
bucket_name = 'craig-shaffer-data-445-bucket'
file_key = 'turnover.csv'

#opening the bucket through the boto3 resource interface
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

#requesting the object and taking the 'Body' entry of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#loading the csv into a dataframe and previewing the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
#one-hot encoding sales and salary: get_dummies removes the two source
#columns and appends their indicator columns after the remaining ones
turnover = pd.get_dummies(turnover, columns=['sales','salary'])
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [8]:
#NOTE(review): both transforms below are fitted on the full dataframe, i.e.
#before the train/test split done later in the notebook, so test rows
#influence the fitted min/max and the box-cox lambda -- confirm this is intended

#rescaling the project count and monthly hours to the 0-1 range
scaler = MinMaxScaler()
minmax_columns = ['number_project','average_montly_hours']
turnover[minmax_columns] = scaler.fit_transform(turnover[minmax_columns])

#boxcox transformation of the time spent at the company; only element 0 of
#the returned result (the transformed values) is written back
transformed_time_spend = boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transformed_time_spend[0]

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [9]:
#engineering interactions as element-wise products of existing columns
#interaction_1: satisfaction level x time spent at the company
turnover['interaction_1'] = turnover['satisfaction_level'].mul(turnover['time_spend_company'])
#interaction_2: last evaluation x promotion flag
turnover['interaction_2'] = turnover['last_evaluation'].mul(turnover['promotion_last_5years'])
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.305767,0.0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.878494,0.0
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0.103552,0.0
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.743278,0.0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0.297721,0.0


In [14]:
#defining input and target variables: every column except 'left' is an input
x = turnover.drop(columns='left')
y = turnover['left']

#splitting the data: 20% held out for testing, stratified on the target so
#both parts keep the same share of each class; random_state pins the shuffle
#so re-running the notebook puts the same rows in train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, stratify=y, random_state=42)

**Random Forest 1**

In [15]:
#building the model: 500 trees of depth 3; random_state fixes the forest's
#internal randomness so the fit is repeatable across runs
rf_md = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=42).fit(x_train, y_train)

#predict on test: keeping column 1 of predict_proba, the likelihood of class 1
rf_pred = rf_md.predict_proba(x_test)[:, 1]

#changing likelihoods to labels
#NOTE(review): the cutoff helper is given y_test, so the cutoff is presumably
#chosen on the same rows that are scored below -- confirm, since that would
#make the reported metrics optimistic
rf_label = precision_recall_cutoff(y_test, rf_pred)

#print classification report
print(classification_report(y_test, rf_label))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93      2286
           1       0.74      0.85      0.79       714

    accuracy                           0.89      3000
   macro avg       0.84      0.88      0.86      3000
weighted avg       0.90      0.89      0.89      3000



**Random Forest 2**

In [16]:
#removing interactions from both the train and the test inputs
interaction_columns = ['interaction_1', 'interaction_2']
x_train_new = x_train.drop(columns=interaction_columns)
x_test_new = x_test.drop(columns=interaction_columns)

#building the model: 500 trees of depth 3; random_state fixes the forest's
#internal randomness so the fit is repeatable across runs
rf_md = RandomForestClassifier(n_estimators=500, max_depth=3, random_state=42).fit(x_train_new, y_train)

#predict on test: keeping column 1 of predict_proba, the likelihood of class 1
rf_pred = rf_md.predict_proba(x_test_new)[:, 1]

#changing likelihoods to labels
#NOTE(review): the cutoff helper is given y_test, so the cutoff is presumably
#chosen on the same rows that are scored below -- confirm, since that would
#make the reported metrics optimistic
rf_label = precision_recall_cutoff(y_test, rf_pred)

#print classification report
print(classification_report(y_test, rf_label))

              precision    recall  f1-score   support

           0       0.96      0.89      0.93      2286
           1       0.72      0.89      0.80       714

    accuracy                           0.89      3000
   macro avg       0.84      0.89      0.86      3000
weighted avg       0.90      0.89      0.89      3000



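Based on these results, I would use the second model: dropping the two interaction features does not cost any performance. Both models reach 0.89 accuracy, and for the employees who left (class 1) the second model scores an F1 of 0.80 versus 0.79 for the first, trading slightly lower precision (0.72 vs. 0.74) for higher recall (0.89 vs. 0.85). Since it is simpler and performs about the same, the second model is the better choice.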