In [34]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import recall_score, classification_report

## S3 handle and the bucket that stores the course data
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## key (file name) of the csv inside the bucket
file_key = 'turnover.csv'

## fetch the object and take the body of the response, which pandas reads as a stream
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object['Body']

## parse the csv into a data frame and preview the first rows
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [35]:
## class counts of the target column 'left'
turnover.loc[:, 'left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [36]:
## one-hot encoding 'sales' and 'salary'; the remaining columns keep their
## position and the new indicator columns are appended after them
turnover = pd.get_dummies(turnover, columns = ['sales', 'salary'])

In [37]:
## previewing the first five rows after the dummy encoding
turnover.head(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [38]:
## columns to be rescaled to the 0-1 range
minmax_columns = ['number_project', 'average_montly_hours']

## fitting the min-max scaler and overwriting the two columns with the scaled values
scaler = MinMaxScaler()
turnover[minmax_columns] = scaler.fit_transform(turnover[minmax_columns])

## box-cox transformation of 'time_spend_company'; the first element of the
## result holds the transformed values, which replace the original column
## (note: re-running this cell transforms the already-transformed column again)
transformed_time_spent = boxcox(turnover['time_spend_company'])
turnover['time_spend_company'] = transformed_time_spent[0]

In [39]:
## defining the input and target variables
## 'Unnamed: 0' is the row index that was read in from the csv, not an
## attribute of the employee; keeping it as a feature would let the models
## learn from the order of the rows, so it is removed together with the target
## (errors = 'ignore' keeps the cell working if the index column is not present)
X = turnover.drop(columns = ['left', 'Unnamed: 0'], errors = 'ignore')
Y = turnover['left']

## splitting the data (80% training, 20% testing), stratified on the target;
## random_state is fixed so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

## Building Models

In [41]:
## seed shared by both models so the fitted models and reports are reproducible
random_seed = 42

## likelihoods below this cutoff are labelled 0, everything else is labelled 1
## NOTE(review): the same cutoff is applied to both models although their
## likelihood scales may differ -- consider tuning it separately per model
label_cutoff = 0.22

## models
## the base tree is passed positionally because its keyword was renamed from
## 'base_estimator' to 'estimator' in scikit-learn 1.2; the positional form
## is accepted by both the older and the newer releases
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = random_seed).fit(X_train, Y_train)
ada_md = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01, random_state = random_seed).fit(X_train, Y_train)

## predicting on the testing data (second column of predict_proba)
rf_probs = rf_md.predict_proba(X_test)[:, 1]
ada_probs = ada_md.predict_proba(X_test)[:, 1]

## changing likelihoods to labels
rf_preds = np.where(rf_probs < label_cutoff, 0, 1)
ada_preds = np.where(ada_probs < label_cutoff, 0, 1)

## creating classification report
print('RANDOM FOREST')
print(classification_report(Y_test, rf_preds))

print('ADABOOST')
print(classification_report(Y_test, ada_preds))

RANDOM FOREST
              precision    recall  f1-score   support

           0       0.98      0.84      0.90      2286
           1       0.64      0.95      0.77       714

    accuracy                           0.86      3000
   macro avg       0.81      0.89      0.83      3000
weighted avg       0.90      0.86      0.87      3000

ADABOOST
              precision    recall  f1-score   support

           0       1.00      0.32      0.48      2286
           1       0.31      1.00      0.48       714

    accuracy                           0.48      3000
   macro avg       0.66      0.66      0.48      3000
weighted avg       0.84      0.48      0.48      3000



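Based on these results, I would choose the Random Forest model to predict `left` because it has the highest overall performance: it reaches 0.86 accuracy and an F1-score of 0.77 for class 1, compared with 0.48 accuracy and an F1-score of 0.48 for AdaBoost, whose perfect recall for class 1 comes with a precision of only 0.31.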