In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score, classification_report

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'turnover.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
## changing 'sales' and 'salary' to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis = 1), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
## Engineering Features from the decision tree model (from in class 10)
turnover['interaction_1'] = np.where(((turnover['satisfaction_level'] >= 0.115) &
                                      (turnover['satisfaction_level'] <= 0.465) &
                                      (turnover['number_project'] > 2.5)), 1, 0)
                              
turnover['interaction_2'] = np.where(((turnover['satisfaction_level'] <= 0.465) &
                                      (turnover['satisfaction_level'] <= 2.5) &
                                      (turnover['number_project'] <= 0.575)), 1, 0)

turnover['interaction_3'] = np.where(((turnover['satisfaction_level'] > 0.465) &
                                      (turnover['time_spend_company'] <= 0.45) &
                                      (turnover['average_montly_hours'] <= 290.5)), 1, 0)

In [5]:
## defining input and target variables
x = turnover.drop(columns = 'left', axis = 1)
y = turnover['left']

## splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [8]:
## defining a list to store results (feature importances)
results = list()

for i in range(0,10):
    
    ## splitting the training data into train and test
    x_training, x_testing, y_training, y_testing = train_test_split(x_train, y_train, test_size = 0.2, stratify = y_train)
    
    ## building the RF model
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_training, y_training)
    
    ## extracting feature importances
    results.append(rf_md.feature_importances_)
    
## transforming list into dataframe
results = pd.DataFrame(results)
results.columns = x.columns

In [9]:
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2,interaction_3
0,0.308647,0.080279,0.201739,0.127367,0.158734,0.029995,0.003055,0.000102,0.000871,8.2e-05,0.000761,0.000957,8.2e-05,0.000169,0.000209,0.000158,0.000228,0.015072,0.016351,0.003221,0.051921,0.0,0.0
1,0.321073,0.068471,0.213418,0.124247,0.162055,0.025852,0.001788,0.000244,0.001273,0.000108,0.000675,0.000537,5.7e-05,8.5e-05,0.000209,0.000239,0.000289,0.01251,0.01253,0.004354,0.049986,0.0,0.0
2,0.302556,0.075657,0.202354,0.154424,0.148196,0.028438,0.001938,0.00024,0.00114,6.9e-05,0.000646,0.000739,3.2e-05,7.1e-05,0.000155,0.000316,0.000392,0.012098,0.018122,0.003342,0.049077,0.0,0.0
3,0.32331,0.075863,0.204801,0.127744,0.164237,0.023534,0.001905,9e-05,0.000978,9.6e-05,0.000282,0.000613,5.8e-05,9.6e-05,0.000359,7.4e-05,0.000117,0.013466,0.013078,0.002217,0.047081,0.0,0.0
4,0.310038,0.073431,0.240354,0.134528,0.146642,0.025264,0.002404,0.000198,0.000772,2.8e-05,0.000568,0.000299,6.7e-05,9.1e-05,0.000268,8.3e-05,0.000296,0.008574,0.013984,0.002254,0.039857,0.0,0.0
5,0.323535,0.065861,0.220749,0.131083,0.139351,0.030964,0.001823,0.000196,0.00127,0.000166,0.000622,0.000702,0.000175,0.000158,0.000297,0.000133,0.000242,0.018533,0.014846,0.003075,0.046221,0.0,0.0
6,0.330933,0.073101,0.195725,0.135466,0.145255,0.025799,0.002216,0.000127,0.00061,0.000133,0.000621,0.000678,5.4e-05,8.6e-05,0.000243,0.000255,0.000382,0.01488,0.014896,0.003378,0.055163,0.0,0.0
7,0.297578,0.069033,0.22065,0.1194,0.175384,0.028784,0.001928,0.000106,0.001277,4.9e-05,0.00038,0.000954,0.000177,0.000172,0.000125,8.8e-05,0.000313,0.012859,0.018327,0.004326,0.04809,0.0,0.0
8,0.308314,0.073551,0.222731,0.130608,0.1533,0.029807,0.002275,0.000154,0.001378,0.000169,0.000633,0.000589,2.8e-05,0.000165,0.00021,0.000295,0.000296,0.013428,0.013697,0.003171,0.0452,0.0,0.0
9,0.343273,0.073555,0.192842,0.128486,0.170864,0.025476,0.001578,0.000145,0.001059,0.000145,0.000524,0.000597,4e-05,8.5e-05,0.000164,0.000162,0.0004,0.00935,0.011897,0.002981,0.036378,0.0,0.0


In [10]:
## averaging the importances for each feature
results = pd.DataFrame(results.apply(np.mean, axis = 0))

In [11]:
results

Unnamed: 0,0
satisfaction_level,0.316926
last_evaluation,0.07288
number_project,0.211536
average_montly_hours,0.131335
time_spend_company,0.156402
Work_accident,0.027391
promotion_last_5years,0.002091
sales_IT,0.00016
sales_RandD,0.001063
sales_accounting,0.000104


In [12]:
## cleaning up the dataframe
results = pd.DataFrame({'Feature': results.index, 'Importances': results[0].values})

In [13]:
results

Unnamed: 0,Feature,Importances
0,satisfaction_level,0.316926
1,last_evaluation,0.07288
2,number_project,0.211536
3,average_montly_hours,0.131335
4,time_spend_company,0.156402
5,Work_accident,0.027391
6,promotion_last_5years,0.002091
7,sales_IT,0.00016
8,sales_RandD,0.001063
9,sales_accounting,0.000104


In [14]:
## sorting the importances
results = results.sort_values(by = 'Importances', ascending = False)

In [15]:
results

Unnamed: 0,Feature,Importances
0,satisfaction_level,0.316926
2,number_project,0.211536
4,time_spend_company,0.156402
3,average_montly_hours,0.131335
1,last_evaluation,0.07288
20,interaction_1,0.046897
5,Work_accident,0.027391
18,salary_low,0.014773
17,salary_high,0.013077
19,salary_medium,0.003232


In [17]:
## Building Random Forest Model . . .