In [21]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import recall_score, classification_report

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'turnover.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [22]:
## changing 'sales' and 'salary' to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)

In [23]:
## defining the scaler
scaler = MinMaxScaler()

## changing scale to 0-1 (averag_monthly_hours, number_projects)
turnover[['number_project', 'average_montly_hours']] = scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

## using box-cox transformation
transformed_time_spent = boxcox(turnover['time_spend_company'])

turnover['time_spend_company'] = transformed_time_spent[0]

In [24]:
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [25]:
## Engineering interactions
turnover['interaction_1'] = turnover['satisfaction_level'] * turnover['time_spend_company']
turnover['interaction_2'] = turnover['last_evaluation'] * turnover['promotion_last_5years']
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.305767,0.0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0.878494,0.0
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0.103552,0.0
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.743278,0.0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.297721,0.0


In [26]:
## defining input and target
x = turnover.drop(columns = 'left', axis = 1)
y = turnover['left']

## splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [27]:
## building random forest models
rf_md1 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train, y_train)

## predicting on testing
rf_pred1 = rf_md1.predict_proba(x_test)[:, 1]

## changing likelihoods to labels
rf_label1 = precision_recall_cutoff(y_test, rf_pred1)

## classification report
print('RF MODEL 1:', classification_report(y_test, rf_label1))

RF MODEL 1:               precision    recall  f1-score   support

           0       0.91      0.99      0.95      2286
           1       0.95      0.69      0.80       714

    accuracy                           0.92      3000
   macro avg       0.93      0.84      0.87      3000
weighted avg       0.92      0.92      0.91      3000



In [29]:
## building models without interactions

## remoiving interactions
x_train_new = x_train.drop(columns = ['interaction_1', 'interaction_2'], axis = 1)
x_test_new = x_test.drop(columns = ['interaction_1', 'interaction_2'], axis = 1)

## building random forest models
rf_md2 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train_new, y_train)

## predicting on testing
rf_pred2 = rf_md2.predict_proba(x_test_new)[:, 1]

## changing likelihoods to labels
rf_label2 = precision_recall_cutoff(y_test, rf_pred2)

## classification report
print('RF MODEL 2:', classification_report(y_test, rf_label2))

RF MODEL 2:               precision    recall  f1-score   support

           0       0.97      0.87      0.92      2286
           1       0.69      0.92      0.79       714

    accuracy                           0.88      3000
   macro avg       0.83      0.89      0.85      3000
weighted avg       0.90      0.88      0.89      3000



Because this is an inbalanced dataset, we want to model that has good recall and precision and we also want a simpler model. With this in mind paired with the above results, I would chose model 2 without the interactions terms because it is a simplier model with higher performance.