In [14]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import precision_recall_cutoff

### defining bucket###
s3=boto3.resource('s3')
bucket_name= 'dmw-448'
bucket = s3.Bucket(bucket_name)


file_key = 'turnover.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
turnover = pd.read_csv(file_content_stream)
turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [15]:
## Changing sales to dummy variables
turnover = pd.concat([turnover.drop(columns = ['sales'], axis = 1), pd.get_dummies(turnover['sales'])], axis = 1)

## Changing salary to dummy variables
turnover = pd.concat([turnover, pd.get_dummies(turnover['salary'])], axis = 1)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0


In [16]:
## creating interaction
turnover['interaction_1'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] <= 2.5) & (turnover['last_evaluation'] <= 0.575), 1, 0)
turnover['interaction_2'] = np.where((turnover['satisfaction_level'] <= 0.465) & (turnover['number_project'] >= 2.5) & (turnover['satisfaction_level'] >= 0.115), 1, 0)
turnover['interaction_3'] = np.where((turnover['satisfaction_level'] >= 0.465) & (turnover['time_spend_company'] <= 4.5) & (turnover['average_montly_hours'] <= 290.5), 1, 0)

turnover.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.38,0.53,2,157,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.72,0.87,5,223,5,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,0.37,0.52,2,159,3,0,1,0,low,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0


In [17]:
## Defining the input and target variables
X = turnover.drop(columns = ['left', 'salary'], axis = 1)
Y = turnover['left']

## Spliting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)



In [24]:
## defining list 
results = list()

for i in range(0,10):
    
    ## split data
    X_training, X_testing, Y_training, Y_testing  = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## building RF
    RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_training, Y_training)
    
    ## extract feature
    results.append(RF.feature_importances_)

    
results = pd.DataFrame(results)
results.columns = X.columns
results

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,high,low,medium,interaction_1,interaction_2,interaction_3
0,0.177818,0.043481,0.114042,0.074934,0.077868,0.011063,0.000835,2e-05,0.000446,4.3e-05,6.4e-05,0.000341,1.4e-05,3.3e-05,5.8e-05,7.4e-05,8.9e-05,0.004962,0.004804,0.000907,0.186936,0.04564,0.255527
1,0.214027,0.045439,0.113074,0.064508,0.082312,0.010426,0.000607,2.8e-05,0.000224,1.1e-05,0.000199,0.000512,1.1e-05,4.6e-05,5.3e-05,5.5e-05,0.000129,0.004738,0.006024,0.000719,0.181427,0.045528,0.229905
2,0.176748,0.038285,0.119712,0.063152,0.088979,0.012023,0.000893,2.2e-05,0.000362,7.8e-05,7.9e-05,0.000381,1.5e-05,5.8e-05,3.3e-05,4.2e-05,0.00011,0.00421,0.006301,0.000963,0.192812,0.039322,0.25542
3,0.187639,0.037908,0.116909,0.078364,0.086373,0.01103,0.000864,4.8e-05,0.000594,2.6e-05,6.9e-05,0.000485,8e-06,3.3e-05,0.000104,4.8e-05,6.9e-05,0.004838,0.00884,0.001145,0.180944,0.041447,0.242216
4,0.18866,0.045038,0.110985,0.065459,0.073765,0.012616,0.001152,2.3e-05,0.000253,7.3e-05,0.000161,0.000217,6e-06,4.2e-05,9e-05,4.1e-05,0.000114,0.006077,0.007805,0.000784,0.183128,0.044002,0.25951
5,0.203365,0.038418,0.107719,0.062119,0.084134,0.013176,0.000597,3e-05,0.000358,2.1e-05,0.00013,0.000263,2.1e-05,5.7e-05,7.8e-05,4.7e-05,6.2e-05,0.003166,0.008521,0.000769,0.190213,0.045729,0.241006
6,0.191996,0.0424,0.101724,0.059355,0.081958,0.011448,0.001141,3.4e-05,0.000175,4.4e-05,8.6e-05,0.000441,9e-06,5.7e-05,6.2e-05,3.8e-05,0.000117,0.004103,0.004748,0.001166,0.185943,0.043601,0.269355
7,0.165558,0.049362,0.123063,0.075427,0.08753,0.010433,0.000646,4.3e-05,0.000426,7.6e-05,7.9e-05,0.000645,5e-06,4e-06,4.3e-05,3.4e-05,0.00015,0.004004,0.004857,0.000652,0.206925,0.031226,0.238812
8,0.19636,0.042218,0.104074,0.073674,0.085964,0.012544,0.001169,2.8e-05,0.000402,5.1e-05,5.2e-05,0.000454,1e-05,3.6e-05,9.9e-05,3.3e-05,5.7e-05,0.004227,0.007381,0.001076,0.18334,0.037182,0.249568
9,0.187372,0.041784,0.120098,0.062909,0.08115,0.00856,0.000683,3.8e-05,0.000284,7e-06,4.3e-05,0.000352,1.9e-05,3.6e-05,7.3e-05,7.2e-05,0.00012,0.004123,0.005534,0.001116,0.177683,0.040144,0.267798


In [25]:
## computoing averages 
results = pd.DataFrame(results.apply(np.mean, axis = 0))

results = pd.DataFrame({'Feature': results.index, 'Importance': results[0].values})
results = results.sort_values(by = 'Importance', ascending = False)
results

Unnamed: 0,Feature,Importance
22,interaction_3,0.250912
0,satisfaction_level,0.188954
20,interaction_1,0.186935
2,number_project,0.11314
4,time_spend_company,0.083003
3,average_montly_hours,0.06799
1,last_evaluation,0.042433
21,interaction_2,0.041382
5,Work_accident,0.011332
18,low,0.006482


In [28]:
X_train_1 = X_train[['interaction_3','interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]
X_test_1 = X_test[['interaction_3','interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company']]

## Random forest 
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_1, Y_train)

## Predicting on test
RF_pred = RF.predict_proba(X_test_1)[:, 1]

## Predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

## Computing the classification report
print(classification_report(Y_test, RF_labels))



              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2286
           1       0.89      0.90      0.89       714

    accuracy                           0.95      3000
   macro avg       0.93      0.93      0.93      3000
weighted avg       0.95      0.95      0.95      3000



In [27]:
X_train_2 = X_train[['interaction_3','interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]
X_test_2 = X_test[['interaction_3','interaction_1', 'satisfaction_level', 'number_project', 'time_spend_company', 'average_montly_hours']]

## Random forest 
RF = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train_2, Y_train)

## Predicting on test
RF_pred = RF.predict_proba(X_test_2)[:, 1]

## Predicting the labels
RF_labels = precision_recall_cutoff.precision_recall_cutoff(Y_test, RF_pred)

## Computing the classification report
print(classification_report(Y_test, RF_labels))


              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2286
           1       0.92      0.92      0.92       714

    accuracy                           0.96      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.96      0.96      0.96      3000



## based on my results I would go with the second model that has the top 6 results 