In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, classification_report
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from precision_recall_cutoff import precision_recall_cutoff

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'turnover.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading data file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Matplotlib is building the font cache; this may take a moment.


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [2]:
# changing sales and salary dummies
turnover = pd.concat([turnover.drop(columns = ['sales', 'salary'], axis =  1 ), pd.get_dummies(turnover[['sales', 'salary']])], axis = 1)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [3]:
# engineering interactions
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] >= .115) & (turnover['satisfaction_level'] <= .465) & (turnover['number_project'] > 2.5), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= .465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= .575), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] > .465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

In [4]:
# defining imput and target
x = turnover.drop(columns = 'left', axis = 1)
y = turnover['left']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [17]:
# list to store importances
results = list()

for i in range(0,10):
    #splitting the train data
    x_training, x_testing, y_training, y_testing = train_test_split(x_train, y_train, test_size = 0.2, stratify = y_train)
    
    # building decision tree
    rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_training, y_training)
    
    # feature importances
    results.append(rf_md.feature_importances_)
    
# transforming ot dataframe
results = pd.DataFrame(results)
results.columns = x.columns
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium,interaction_1,interaction_2
0,0.254146,0.048957,0.17772,0.08684,0.098136,0.016163,0.00098,0.000175,0.000531,2.5e-05,0.000233,0.00067,3.9e-05,2.2e-05,8.8e-05,9.5e-05,8.4e-05,0.007152,0.008854,0.001166,0.046813,0.251111
1,0.239643,0.053303,0.145929,0.084947,0.105856,0.013316,0.001167,4.8e-05,0.000489,4.8e-05,0.00033,0.000397,0.000107,0.000133,0.000114,4.6e-05,9.7e-05,0.006851,0.005513,0.001304,0.058909,0.281454
2,0.24324,0.047063,0.147733,0.099336,0.10515,0.011906,0.000513,2.3e-05,0.000949,2.4e-05,0.000155,0.000471,1.3e-05,1.6e-05,9e-05,7.1e-05,0.000213,0.006313,0.006762,0.000754,0.056955,0.272251
3,0.201856,0.052568,0.151476,0.082756,0.103576,0.018523,0.000772,2.5e-05,0.000453,6e-05,0.000157,0.000328,1.8e-05,5.3e-05,0.000154,6.4e-05,0.000175,0.00668,0.008426,0.001636,0.062971,0.307274
4,0.224513,0.048165,0.160006,0.090621,0.106186,0.018635,0.000649,0.000107,0.000469,8.9e-05,0.000183,0.000588,3.1e-05,4.7e-05,9.6e-05,0.000107,0.00019,0.004958,0.012165,0.001913,0.059292,0.27099
5,0.217681,0.046907,0.139363,0.07895,0.105599,0.016823,0.000956,6.5e-05,0.00032,7.7e-05,6.2e-05,0.000531,9e-06,5.1e-05,6e-05,5.1e-05,5.7e-05,0.008604,0.009215,0.001951,0.062363,0.310304
6,0.205714,0.049932,0.15596,0.108119,0.101007,0.014557,0.001435,5.3e-05,0.000796,3.8e-05,5.3e-05,0.000409,1.2e-05,3.7e-05,0.00017,9.3e-05,0.000176,0.005613,0.006172,0.001153,0.057362,0.291139
7,0.219672,0.055924,0.167785,0.092134,0.117603,0.01634,0.000852,4.2e-05,0.000515,4.7e-05,0.000454,0.000359,2.6e-05,8.1e-05,3.5e-05,7.8e-05,0.00036,0.007118,0.010056,0.001334,0.05384,0.255346
8,0.239024,0.046397,0.151937,0.078537,0.111435,0.013002,0.000594,5.9e-05,0.000716,6.3e-05,0.000222,0.000341,2.2e-05,4.8e-05,9.7e-05,5e-05,0.000112,0.006914,0.00631,0.0025,0.057806,0.283814
9,0.228094,0.049795,0.14461,0.079295,0.105366,0.014215,0.000966,5.4e-05,0.000572,4.8e-05,0.00018,0.000496,4.6e-05,7e-06,0.000119,7.3e-05,5.2e-05,0.005996,0.009424,0.001776,0.052596,0.30622


In [18]:
# averaging results
results = pd.DataFrame(results.apply(np.mean, axis = 0))
results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance', ascending = False)

In [19]:
results

Unnamed: 0,Feature,Importance
21,interaction_2,0.28299
0,satisfaction_level,0.227358
2,number_project,0.154252
4,time_spend_company,0.105991
3,average_montly_hours,0.088153
20,interaction_1,0.056891
1,last_evaluation,0.049901
5,Work_accident,0.015348
18,salary_low,0.00829
17,salary_high,0.00662
