In [126]:
#import pandas_profiling as pp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
#plt.style.use('default')

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, recall_score

from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_columns', 30)

In [127]:
df = pd.read_csv('df_for_preprocessing.csv')

In [128]:
df.shape

(14999, 24)

In [129]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,churn,promotion_last_5years,dept,salary,sat_level_cluster,last_eval_cluster,avg_month_hr_cluster,satisfaction_level^1xlast_evaluation^1,satisfaction_level^1xnumber_project^1,satisfaction_level^1xaverage_montly_hours^1,satisfaction_level^1xtime_spend_company^1,last_evaluation^1xnumber_project^1,last_evaluation^1xaverage_montly_hours^1,last_evaluation^1xtime_spend_company^1,number_project^1xaverage_montly_hours^1,number_project^1xtime_spend_company^1,average_montly_hours^1xtime_spend_company^1
0,0,0.38,0.53,2,157,3,0,1,0,sales,low,unsatisfied,low performance,low,0.2014,0.76,59.66,1.14,1.06,83.21,1.59,314.0,6.0,471.0
1,1,0.8,0.86,5,262,6,0,1,0,sales,medium,very satisfied,high performance,high,0.688,4.0,209.6,4.8,4.3,225.32,5.16,1310.0,30.0,1572.0
2,2,0.11,0.88,7,272,4,0,1,0,sales,medium,very unsatisfied,very high performance,high,0.0968,0.77,29.92,0.44,6.16,239.36,3.52,1904.0,28.0,1088.0
3,3,0.72,0.87,5,223,5,0,1,0,sales,low,satisfied,high performance,medium,0.6264,3.6,160.56,3.6,4.35,194.01,4.35,1115.0,25.0,1115.0
4,4,0.37,0.52,2,159,3,0,1,0,sales,low,unsatisfied,low performance,low,0.1924,0.74,58.83,1.11,1.04,82.68,1.56,318.0,6.0,477.0


In [130]:
df.columns

Index(['id', 'satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'churn',
       'promotion_last_5years', 'dept', 'salary', 'sat_level_cluster',
       'last_eval_cluster', 'avg_month_hr_cluster',
       'satisfaction_level^1xlast_evaluation^1',
       'satisfaction_level^1xnumber_project^1',
       'satisfaction_level^1xaverage_montly_hours^1',
       'satisfaction_level^1xtime_spend_company^1',
       'last_evaluation^1xnumber_project^1',
       'last_evaluation^1xaverage_montly_hours^1',
       'last_evaluation^1xtime_spend_company^1',
       'number_project^1xaverage_montly_hours^1',
       'number_project^1xtime_spend_company^1',
       'average_montly_hours^1xtime_spend_company^1'],
      dtype='object')

In [131]:
#https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

#For personal purposes

useless_var = ['id']

nominal_var = ['dept',
               'salary',
               'sat_level_cluster',
               'last_eval_cluster',
               'avg_month_hr_cluster']

binary_var = ['Work_accident',
              'promotion_last_5years']

target_var = ['churn']

time_var = []

count_var = ['number_project',
             'time_spend_company']

interval_var = ['satisfaction_level',
                'last_evaluation',
                'average_montly_hours',
                'satisfaction_level^1xlast_evaluation^1',
                'satisfaction_level^1xnumber_project^1',
                'satisfaction_level^1xaverage_montly_hours^1',
                'satisfaction_level^1xtime_spend_company^1',
                'last_evaluation^1xnumber_project^1',
                'last_evaluation^1xaverage_montly_hours^1',
                'last_evaluation^1xtime_spend_company^1',
                'number_project^1xaverage_montly_hours^1',
                'number_project^1xtime_spend_company^1',
                'average_montly_hours^1xtime_spend_company^1']

## Transform the data

In [132]:
num_var = count_var + interval_var

In [133]:
nominal_transformer = OneHotEncoder([['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'],
                                    ['low', 'medium', 'high'],
                                    ['unsatisfied', 'very satisfied', 'very unsatisfied', 'satisfied'],
                                    ['low performance', 'high performance', 'very high performance',
       'average performance'],
                                    ['low', 'high', 'medium']
                                    ])

num_transformer = StandardScaler()

In [134]:
preprocessor = ColumnTransformer(transformers = [
                                                ('scaler', num_transformer, num_var),
                                                ('onehot', nominal_transformer, nominal_var)
                                                ],
                                remainder = 'passthrough')

In [135]:
# Assign variables
X = df[num_var + nominal_var + binary_var]
y = df[target_var]

# Then split the remaining into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [136]:
X_train.head()

Unnamed: 0,number_project,time_spend_company,satisfaction_level,last_evaluation,average_montly_hours,satisfaction_level^1xlast_evaluation^1,satisfaction_level^1xnumber_project^1,satisfaction_level^1xaverage_montly_hours^1,satisfaction_level^1xtime_spend_company^1,last_evaluation^1xnumber_project^1,last_evaluation^1xaverage_montly_hours^1,last_evaluation^1xtime_spend_company^1,number_project^1xaverage_montly_hours^1,number_project^1xtime_spend_company^1,average_montly_hours^1xtime_spend_company^1,dept,salary,sat_level_cluster,last_eval_cluster,avg_month_hr_cluster,Work_accident,promotion_last_5years
12602,7,4,0.1,0.84,250,0.084,0.7,25.0,0.4,5.88,210.0,3.36,1750.0,28.0,1000.0,product_mng,low,very unsatisfied,high performance,high,0,0
4889,4,3,0.57,0.68,154,0.3876,2.28,87.78,1.71,2.72,104.72,2.04,616.0,12.0,462.0,management,medium,satisfied,average performance,low,1,0
1572,2,3,0.39,0.48,154,0.1872,0.78,60.06,1.17,0.96,73.92,1.44,308.0,6.0,462.0,technical,low,unsatisfied,low performance,low,0,0
13375,4,4,0.91,0.68,132,0.6188,3.64,120.12,3.64,2.72,89.76,2.72,528.0,16.0,528.0,IT,medium,very satisfied,average performance,low,0,0
879,5,5,0.82,0.97,263,0.7954,4.1,215.66,4.1,4.85,255.11,4.85,1315.0,25.0,1315.0,technical,medium,very satisfied,very high performance,high,0,0


In [137]:
# Fitting the preprocessor
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the input validation and test sets
X_test_transformed = preprocessor.transform(X_test)

In [138]:
# why we name the transformers
preprocessor.named_transformers_.keys()

dict_keys(['scaler', 'onehot', 'remainder'])

## Put all the transformed X data back into a dataframe

In [139]:
preprocessor.named_transformers_

{'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'onehot': OneHotEncoder(categories=[['sales', 'accounting', 'hr', 'technical', 'support',
                            'management', 'IT', 'product_mng', 'marketing',
                            'RandD'],
                           ['low', 'medium', 'high'],
                           ['unsatisfied', 'very satisfied', 'very unsatisfied',
                            'satisfied'],
                           ['low performance', 'high performance',
                            'very high performance', 'average performance'],
                           ['low', 'high', 'medium']],
               drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
               sparse=True),
 'remainder': 'passthrough'}

In [140]:
#Grab the column names for all of our nominal variables
one_hot_columns = preprocessor.named_transformers_.onehot.get_feature_names()

In [141]:
transformed_columns = num_var + list(one_hot_columns) + binary_var

In [142]:
len(transformed_columns)

41

In [143]:
X_train_transformed.shape # Great! They match

(10499, 41)

In [144]:
X_train_transform_df = pd.DataFrame(X_train_transformed, columns = transformed_columns)
X_test_transform_df = pd.DataFrame(X_test_transformed, columns = transformed_columns)

In [145]:
classifiers = [
    LogisticRegression(penalty = 'l1', solver='saga', class_weight='balanced',random_state=42, C=1),
    SVC(kernel="rbf", C=0.025, random_state=42, probability=True),
    DecisionTreeClassifier(random_state=42),
    DecisionTreeRegressor(random_state=42),
    #VotingClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    xgb.XGBClassifier(verbosity=0,random_state=42),
    CatBoostClassifier(random_seed=42)
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('classifier', classifier)])
    pipe.fit(X_train_transform_df, y_train)
    print(f'*****{classifier}*****')
    print(classification_report(y_train,pipe.predict(X_train_transform_df)))
    print(classification_report(y_test,pipe.predict(X_test_transform_df)))
    #print("model recall score: %.3f" % pipe.score(X_test_transform_df, y_test))

*****LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)*****
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      8000
           1       0.79      0.92      0.85      2499

    accuracy                           0.92     10499
   macro avg       0.88      0.92      0.90     10499
weighted avg       0.93      0.92      0.93     10499

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      3428
           1       0.77      0.90      0.83      1072

    accuracy                           0.91      4500
   macro avg       0.87      0.91      0.89      4500
weighted avg       0.92      0.91      0.91      4500

*****SVC(

Learning rate set to 0.028117
0:	learn: 0.6474009	total: 39.9ms	remaining: 39.9s
1:	learn: 0.6065504	total: 69.7ms	remaining: 34.8s
2:	learn: 0.5675137	total: 84.8ms	remaining: 28.2s
3:	learn: 0.5342410	total: 97.6ms	remaining: 24.3s
4:	learn: 0.5034254	total: 111ms	remaining: 22.1s
5:	learn: 0.4757161	total: 126ms	remaining: 20.9s
6:	learn: 0.4496665	total: 139ms	remaining: 19.7s
7:	learn: 0.4242752	total: 152ms	remaining: 18.9s
8:	learn: 0.4007833	total: 164ms	remaining: 18.1s
9:	learn: 0.3785277	total: 177ms	remaining: 17.5s
10:	learn: 0.3571645	total: 209ms	remaining: 18.8s
11:	learn: 0.3385943	total: 239ms	remaining: 19.7s
12:	learn: 0.3210804	total: 255ms	remaining: 19.3s
13:	learn: 0.3062765	total: 275ms	remaining: 19.4s
14:	learn: 0.2925361	total: 295ms	remaining: 19.4s
15:	learn: 0.2800812	total: 308ms	remaining: 19s
16:	learn: 0.2683438	total: 323ms	remaining: 18.7s
17:	learn: 0.2580532	total: 336ms	remaining: 18.3s
18:	learn: 0.2466207	total: 348ms	remaining: 18s
19:	learn: 

165:	learn: 0.0738346	total: 3.01s	remaining: 15.1s
166:	learn: 0.0736850	total: 3.04s	remaining: 15.2s
167:	learn: 0.0735656	total: 3.06s	remaining: 15.2s
168:	learn: 0.0734081	total: 3.1s	remaining: 15.3s
169:	learn: 0.0732109	total: 3.13s	remaining: 15.3s
170:	learn: 0.0730580	total: 3.17s	remaining: 15.4s
171:	learn: 0.0729009	total: 3.25s	remaining: 15.7s
172:	learn: 0.0727505	total: 3.34s	remaining: 16s
173:	learn: 0.0726248	total: 3.43s	remaining: 16.3s
174:	learn: 0.0724501	total: 3.51s	remaining: 16.6s
175:	learn: 0.0723180	total: 3.57s	remaining: 16.7s
176:	learn: 0.0721288	total: 3.65s	remaining: 17s
177:	learn: 0.0719948	total: 3.73s	remaining: 17.2s
178:	learn: 0.0718902	total: 3.76s	remaining: 17.3s
179:	learn: 0.0716622	total: 3.8s	remaining: 17.3s
180:	learn: 0.0715155	total: 3.82s	remaining: 17.3s
181:	learn: 0.0714024	total: 3.84s	remaining: 17.3s
182:	learn: 0.0712448	total: 3.89s	remaining: 17.4s
183:	learn: 0.0710271	total: 3.92s	remaining: 17.4s
184:	learn: 0.0708

329:	learn: 0.0540310	total: 7.25s	remaining: 14.7s
330:	learn: 0.0539387	total: 7.29s	remaining: 14.7s
331:	learn: 0.0538407	total: 7.31s	remaining: 14.7s
332:	learn: 0.0537955	total: 7.33s	remaining: 14.7s
333:	learn: 0.0537271	total: 7.35s	remaining: 14.7s
334:	learn: 0.0535701	total: 7.37s	remaining: 14.6s
335:	learn: 0.0535303	total: 7.38s	remaining: 14.6s
336:	learn: 0.0534725	total: 7.4s	remaining: 14.6s
337:	learn: 0.0534197	total: 7.41s	remaining: 14.5s
338:	learn: 0.0533521	total: 7.42s	remaining: 14.5s
339:	learn: 0.0533216	total: 7.43s	remaining: 14.4s
340:	learn: 0.0532379	total: 7.45s	remaining: 14.4s
341:	learn: 0.0531709	total: 7.48s	remaining: 14.4s
342:	learn: 0.0531218	total: 7.51s	remaining: 14.4s
343:	learn: 0.0530647	total: 7.52s	remaining: 14.3s
344:	learn: 0.0529600	total: 7.54s	remaining: 14.3s
345:	learn: 0.0529089	total: 7.56s	remaining: 14.3s
346:	learn: 0.0528159	total: 7.58s	remaining: 14.3s
347:	learn: 0.0526806	total: 7.59s	remaining: 14.2s
348:	learn: 0

495:	learn: 0.0433867	total: 11.2s	remaining: 11.4s
496:	learn: 0.0433632	total: 11.3s	remaining: 11.4s
497:	learn: 0.0433162	total: 11.3s	remaining: 11.4s
498:	learn: 0.0432992	total: 11.3s	remaining: 11.3s
499:	learn: 0.0432615	total: 11.3s	remaining: 11.3s
500:	learn: 0.0431831	total: 11.3s	remaining: 11.3s
501:	learn: 0.0431245	total: 11.3s	remaining: 11.2s
502:	learn: 0.0430754	total: 11.4s	remaining: 11.2s
503:	learn: 0.0430325	total: 11.4s	remaining: 11.2s
504:	learn: 0.0429633	total: 11.4s	remaining: 11.2s
505:	learn: 0.0429472	total: 11.4s	remaining: 11.1s
506:	learn: 0.0429221	total: 11.4s	remaining: 11.1s
507:	learn: 0.0428595	total: 11.5s	remaining: 11.1s
508:	learn: 0.0428023	total: 11.5s	remaining: 11.1s
509:	learn: 0.0427882	total: 11.5s	remaining: 11s
510:	learn: 0.0427187	total: 11.5s	remaining: 11s
511:	learn: 0.0426521	total: 11.5s	remaining: 11s
512:	learn: 0.0426028	total: 11.5s	remaining: 10.9s
513:	learn: 0.0425010	total: 11.5s	remaining: 10.9s
514:	learn: 0.0424

659:	learn: 0.0361681	total: 14.8s	remaining: 7.65s
660:	learn: 0.0361441	total: 14.9s	remaining: 7.65s
661:	learn: 0.0361111	total: 15s	remaining: 7.67s
662:	learn: 0.0360711	total: 15.1s	remaining: 7.68s
663:	learn: 0.0360071	total: 15.1s	remaining: 7.66s
664:	learn: 0.0359093	total: 15.2s	remaining: 7.65s
665:	learn: 0.0358427	total: 15.2s	remaining: 7.63s
666:	learn: 0.0358350	total: 15.2s	remaining: 7.6s
667:	learn: 0.0357822	total: 15.2s	remaining: 7.58s
668:	learn: 0.0357601	total: 15.3s	remaining: 7.55s
669:	learn: 0.0357415	total: 15.3s	remaining: 7.52s
670:	learn: 0.0357321	total: 15.3s	remaining: 7.5s
671:	learn: 0.0357099	total: 15.3s	remaining: 7.47s
672:	learn: 0.0356736	total: 15.3s	remaining: 7.46s
673:	learn: 0.0356248	total: 15.4s	remaining: 7.44s
674:	learn: 0.0355557	total: 15.4s	remaining: 7.41s
675:	learn: 0.0355216	total: 15.4s	remaining: 7.39s
676:	learn: 0.0355155	total: 15.4s	remaining: 7.37s
677:	learn: 0.0354758	total: 15.5s	remaining: 7.34s
678:	learn: 0.03

818:	learn: 0.0304512	total: 19.9s	remaining: 4.39s
819:	learn: 0.0304299	total: 19.9s	remaining: 4.37s
820:	learn: 0.0304122	total: 19.9s	remaining: 4.34s
821:	learn: 0.0304037	total: 19.9s	remaining: 4.32s
822:	learn: 0.0303639	total: 20s	remaining: 4.29s
823:	learn: 0.0303521	total: 20s	remaining: 4.26s
824:	learn: 0.0303404	total: 20s	remaining: 4.24s
825:	learn: 0.0303097	total: 20s	remaining: 4.21s
826:	learn: 0.0302897	total: 20s	remaining: 4.18s
827:	learn: 0.0302296	total: 20s	remaining: 4.16s
828:	learn: 0.0302184	total: 20s	remaining: 4.13s
829:	learn: 0.0301806	total: 20s	remaining: 4.11s
830:	learn: 0.0301724	total: 20.1s	remaining: 4.08s
831:	learn: 0.0301256	total: 20.1s	remaining: 4.06s
832:	learn: 0.0300501	total: 20.1s	remaining: 4.03s
833:	learn: 0.0300379	total: 20.1s	remaining: 4.01s
834:	learn: 0.0300309	total: 20.2s	remaining: 3.98s
835:	learn: 0.0300069	total: 20.2s	remaining: 3.96s
836:	learn: 0.0299873	total: 20.2s	remaining: 3.93s
837:	learn: 0.0299099	total:

977:	learn: 0.0262666	total: 23.6s	remaining: 532ms
978:	learn: 0.0262484	total: 23.7s	remaining: 509ms
979:	learn: 0.0262129	total: 23.8s	remaining: 485ms
980:	learn: 0.0261950	total: 23.9s	remaining: 462ms
981:	learn: 0.0261877	total: 23.9s	remaining: 439ms
982:	learn: 0.0261467	total: 23.9s	remaining: 414ms
983:	learn: 0.0261274	total: 24s	remaining: 390ms
984:	learn: 0.0260936	total: 24s	remaining: 366ms
985:	learn: 0.0260777	total: 24s	remaining: 341ms
986:	learn: 0.0260560	total: 24s	remaining: 317ms
987:	learn: 0.0260032	total: 24.1s	remaining: 293ms
988:	learn: 0.0259407	total: 24.1s	remaining: 268ms
989:	learn: 0.0259337	total: 24.1s	remaining: 244ms
990:	learn: 0.0258942	total: 24.2s	remaining: 220ms
991:	learn: 0.0258734	total: 24.2s	remaining: 195ms
992:	learn: 0.0258359	total: 24.2s	remaining: 171ms
993:	learn: 0.0258242	total: 24.2s	remaining: 146ms
994:	learn: 0.0258035	total: 24.2s	remaining: 122ms
995:	learn: 0.0257951	total: 24.3s	remaining: 97.4ms
996:	learn: 0.02578

In this case, we want to minimize the number of FN (Employees who are incorrectly predicted to not churn; they were actually supposed to churn), so we want a better recall score.