In [19]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import recall_score, classification_report

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'turnover.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [20]:
## Engineering Features from the decision tree model
turnover['interaction_1'] = np.where(((turnover['satisfaction_level'] >= 0.115) &
                                      (turnover['satisfaction_level'] <= 0.465) &
                                      (turnover['number_project'] > 2.5)), 1, 0)
                              
turnover['interaction_2'] = np.where(((turnover['satisfaction_level'] <= 0.465) &
                                      (turnover['satisfaction_level'] <= 2.5) &
                                      (turnover['number_project'] <= 0.575)), 1, 0)

turnover['interaction_3'] = np.where(((turnover['satisfaction_level'] > 0.465) &
                                      (turnover['time_spend_company'] <= 0.45) &
                                      (turnover['average_montly_hours'] <= 290.5)), 1, 0)

In [21]:
## defining input and target
x = turnover[['satisfaction_level', 'number_project', 'time_spend_company', 'interaction_1','interaction_3']]
y = turnover['left']

## splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

### GridsearchCV with AdaBoost
***

In [24]:
## defining hyperparameters to consider tuning
ada_param_grid = {'n_estimators': [100, 300],
                 'base_estimator__min_samples_split': [10, 15],
                 'base_estimator__min_samples_leaf': [5, 7],
                 'base_estimator__max_depth': [3, 5, 7],
                 'learning_rate': [0.001]}

## running GridsearchCV
ada_grid_search = GridSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_grid = ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(x_train, y_train)

## extracting the best hyperparameter combinationsRF_param_grid
ada_grid_search.best_params_



{'base_estimator__max_depth': 7,
 'base_estimator__min_samples_leaf': 5,
 'base_estimator__min_samples_split': 10,
 'learning_rate': 0.001,
 'n_estimators': 100}

In [27]:
## building ada boost model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10,
                            min_samples_leaf = 5, max_depth = 7), n_estimators = 100,
                            learning_rate = 0.001).fit(x_train, y_train)

## predicting on the testing
ada_preds = ada_md.predict_proba(x_test)[:, 1]

## changing likelihoods to labes
ada_labels = precision_recall_cutoff(y_test, ada_preds)

## printing classification report
print(classification_report(y_test, ada_labels))



              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.95      0.92      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.96      3000
weighted avg       0.97      0.97      0.97      3000



### Randomized Search CV with AdaBoost
***

In [28]:
## defining hyperparameters to consider tuning
ada_param_grid2 = {'n_estimators': [100, 300],
                 'base_estimator__min_samples_split': [10, 15],
                 'base_estimator__min_samples_leaf': [5, 7],
                 'base_estimator__max_depth': [3, 5, 7],
                 'learning_rate': [0.01]}

## running GridsearchCV
ada_grid_search2 = RandomizedSearchCV(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier()),
                               param_distributions = ada_param_grid2, cv = 3, scoring = 'f1', n_jobs = -1, n_iter = 10).fit(x_train, y_train)

## extracting the best hyperparameter combinationsRF_param_grid
ada_grid_search2.best_params_



{'n_estimators': 100,
 'learning_rate': 0.01,
 'base_estimator__min_samples_split': 10,
 'base_estimator__min_samples_leaf': 7,
 'base_estimator__max_depth': 7}

In [29]:
## building ada boost model
ada_md2 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(min_samples_split = 10,
                            min_samples_leaf = 7, max_depth = 7), n_estimators = 100,
                            learning_rate = 0.01).fit(x_train, y_train)

## predicting on the testing
ada_preds2 = ada_md2.predict_proba(x_test)[:, 1]

## changing likelihoods to labes
ada_labels2 = precision_recall_cutoff(y_test, ada_preds2)

## printing classification report
print(classification_report(y_test, ada_labels2))



              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2286
           1       0.95      0.92      0.93       714

    accuracy                           0.97      3000
   macro avg       0.96      0.95      0.96      3000
weighted avg       0.97      0.97      0.97      3000



Based on these classification reports, I cannot make a certain conclusion on what model has the best performance. Both model perfomed exactly the same I would need to do further analysis to determine the preferred model.