In [1]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
import precision_recall_cutoff # Calling .py function

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "turnover.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## Frequency table of the variable left
round(turnover['left'].value_counts() / turnover.shape[0], 2)

0    0.76
1    0.24
Name: left, dtype: float64

In [3]:
turnover['sales'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: sales, dtype: int64

In [4]:
## Changing sales to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales']), pd.get_dummies(turnover['sales'])], axis = 1)

## Changing sales to dummy variables
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,...,0,0,0,0,1,0,0,0,1,0


In [5]:
# Defining sacaler
scaler = MinMaxScaler()

# Changing number_project and average_monthly_company to 0-1 scale
turnover[['number_project_0_1', 'average_monthly_company_0_1']] = scaler.fit_transform(turnover[['number_project', 'average_montly_hours']])

# BoxCox transformation
transformed_time_spend, best_lambda = boxcox(turnover['time_spend_company'])
turnover['time_spend_company_z'] = transformed_time_spend

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,...,product_mng,sales,support,technical,high,low,medium,number_project_0_1,average_monthly_company_0_1,time_spend_company_z
0,0.38,0.53,2,157,3,0,1,0,low,0,...,0,1,0,0,0,1,0,0.0,0.285047,0.804651
1,0.8,0.86,5,262,6,0,1,0,medium,0,...,0,1,0,0,0,0,1,0.6,0.775701,1.098118
2,0.11,0.88,7,272,4,0,1,0,medium,0,...,0,1,0,0,0,0,1,1.0,0.82243,0.941381
3,0.72,0.87,5,223,5,0,1,0,low,0,...,0,1,0,0,0,1,0,0.6,0.593458,1.03233
4,0.37,0.52,2,159,3,0,1,0,low,0,...,0,1,0,0,0,1,0,0.0,0.294393,0.804651


In [6]:
turnover.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'salary', 'IT', 'RandD', 'accounting', 'hr',
       'management', 'marketing', 'product_mng', 'sales', 'support',
       'technical', 'high', 'low', 'medium', 'number_project_0_1',
       'average_monthly_company_0_1', 'time_spend_company_z'],
      dtype='object')

In [7]:
X = turnover.drop(columns = ['number_project', 'average_montly_hours', 'time_spend_company', 'salary'])
Y = turnover['left']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [8]:
# Building Random Forest Model
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

# Predicting on test
rf_pred = rf_md.predict_proba(X_test)[:,1]

# Predicting the labels
rf_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, rf_pred)

# Printing classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2286
           1       1.00      1.00      1.00       714

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [10]:
# Building Random Forest Model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

# Predicting on test
ada_pred = ada_md.predict_proba(X_test)[:,1]

# Predicting the labels
ada_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, ada_pred)

# Printing classification report
print(classification_report(Y_test, ada_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2286
           1       1.00      1.00      1.00       714

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



### Using the results from part 6 and 7, I would use either one of the models to predict left because they got the same results