In [5]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV

from sklearn.metrics import recall_score, classification_report

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'turnover.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
## changing 'sales' and 'salary' to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [7]:
## defining the scaler
scaler = MinMaxScaler()

## changing scale to 0-1 (averag_monthly_hours, number_projects)
turnover[['number_project', 'average_montly_hours']] = scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

## using box-cox transformation
transformed_time_spent = boxcox(turnover['time_spend_company'])

turnover['time_spend_company'] = transformed_time_spent[0]

In [8]:
## Engineering interactions
turnover['interaction_1'] = turnover['satisfaction_level'] * turnover['time_spend_company']
turnover['interaction_2'] = turnover['last_evaluation'] * turnover['promotion_last_5years']
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.38,0.53,0.0,0.285047,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.305767,0.0
1,0.8,0.86,0.6,0.775701,1.098118,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0.878494,0.0
2,0.11,0.88,1.0,0.82243,0.941381,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0.103552,0.0
3,0.72,0.87,0.6,0.593458,1.03233,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.743278,0.0
4,0.37,0.52,0.0,0.294393,0.804651,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.297721,0.0


In [9]:
## defining input and target
x = turnover.drop(columns = 'left', axis = 1)
y = turnover['left']

## splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

### RFE w Random Forest
***

In [10]:
## building model
rf_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), 
             n_features_to_select = 5).fit(x_train, y_train)

## extracting features
print(x_train.columns[rf_rfe.support_])

Index(['satisfaction_level', 'number_project', 'average_montly_hours',
       'time_spend_company', 'interaction_1'],
      dtype='object')


In [12]:
## defining inputs and target
x_train_1 = x_train[['satisfaction_level', 'number_project', 'average_montly_hours',
       'time_spend_company', 'interaction_1']]

x_test_1 = x_test[['satisfaction_level', 'number_project', 'average_montly_hours',
       'time_spend_company', 'interaction_1']]

## building random forest model with the above important features
rf_md_1 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train_1, y_train)

## predicting on the testing
rf_preds_1 = rf_md_1.predict_proba(x_test_1)[:, 1]

## changing likelihoods to labes
rf_labels_1 = precision_recall_cutoff(y_test, rf_preds_1)

## printing classification report
print(classification_report(y_test, rf_labels_1))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.91      0.89      0.90       714

    accuracy                           0.95      3000
   macro avg       0.94      0.93      0.94      3000
weighted avg       0.95      0.95      0.95      3000



### RFECV w Random Forest
***

In [13]:
auto_feature_selection = RFECV(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3), 
                               step = 1, min_features_to_select = 5, cv = 3).fit(x_train, y_train)

## extracting feature names
print(x_train.columns[auto_feature_selection.support_])

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium', 'interaction_1', 'interaction_2'],
      dtype='object')


In [16]:
## defining input and target variables for new model
x_train_2 = x_train[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium', 'interaction_1', 'interaction_2']]

x_test_2 = x_test[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium', 'interaction_1', 'interaction_2']]

## building random forest model with the above important features
rf_md_2 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train_2, y_train)

## predicting on the testing
rf_preds_2 = rf_md_2.predict_proba(x_test_2)[:, 1]

## changing likelihoods to labes
rf_labels_2 = precision_recall_cutoff(y_test, rf_preds_2)

## printing classification report
print(classification_report(y_test, rf_labels_2))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92      2286
           1       0.71      0.86      0.78       714

    accuracy                           0.88      3000
   macro avg       0.83      0.87      0.85      3000
weighted avg       0.89      0.88      0.89      3000



In [17]:
## Based on these results, model 1 is better at predicting employee turnover.