In [48]:
#import pandas_profiling as pp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
#plt.style.use('default')

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

from sklearn.feature_selection import chi2, SelectKBest, VarianceThreshold

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, recall_score, plot_confusion_matrix

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_columns', 30)

In [2]:
df = pd.read_csv('df_for_preprocessing_ord.csv')

In [3]:
df.shape

(14999, 24)

In [4]:
df.head()

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,churn,promotion_last_5years,dept,salary,sat_level_cluster,last_eval_cluster,avg_month_hr_cluster,satisfaction_level^1xlast_evaluation^1,satisfaction_level^1xnumber_project^1,satisfaction_level^1xaverage_montly_hours^1,satisfaction_level^1xtime_spend_company^1,last_evaluation^1xnumber_project^1,last_evaluation^1xaverage_montly_hours^1,last_evaluation^1xtime_spend_company^1,number_project^1xaverage_montly_hours^1,number_project^1xtime_spend_company^1,average_montly_hours^1xtime_spend_company^1
0,0,0.38,0.53,2,157,3,0,1,0,sales,0.0,1.0,0.0,0.0,0.2014,0.76,59.66,1.14,1.06,83.21,1.59,314.0,6.0,471.0
1,1,0.8,0.86,5,262,6,0,1,0,sales,1.0,3.0,3.0,2.0,0.688,4.0,209.6,4.8,4.3,225.32,5.16,1310.0,30.0,1572.0
2,2,0.11,0.88,7,272,4,0,1,0,sales,1.0,0.0,3.0,2.0,0.0968,0.77,29.92,0.44,6.16,239.36,3.52,1904.0,28.0,1088.0
3,3,0.72,0.87,5,223,5,0,1,0,sales,0.0,2.0,3.0,1.0,0.6264,3.6,160.56,3.6,4.35,194.01,4.35,1115.0,25.0,1115.0
4,4,0.37,0.52,2,159,3,0,1,0,sales,0.0,1.0,0.0,0.0,0.1924,0.74,58.83,1.11,1.04,82.68,1.56,318.0,6.0,477.0


In [5]:
df.columns

Index(['id', 'satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'churn',
       'promotion_last_5years', 'dept', 'salary', 'sat_level_cluster',
       'last_eval_cluster', 'avg_month_hr_cluster',
       'satisfaction_level^1xlast_evaluation^1',
       'satisfaction_level^1xnumber_project^1',
       'satisfaction_level^1xaverage_montly_hours^1',
       'satisfaction_level^1xtime_spend_company^1',
       'last_evaluation^1xnumber_project^1',
       'last_evaluation^1xaverage_montly_hours^1',
       'last_evaluation^1xtime_spend_company^1',
       'number_project^1xaverage_montly_hours^1',
       'number_project^1xtime_spend_company^1',
       'average_montly_hours^1xtime_spend_company^1'],
      dtype='object')

In [6]:
#https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

#For personal purposes

useless_var = ['id']

nominal_var = ['dept']

ordinal_var = ['salary',
               'sat_level_cluster',
               'last_eval_cluster',
               'avg_month_hr_cluster']

binary_var = ['Work_accident',
              'promotion_last_5years']

target_var = ['churn']

count_var = ['number_project',
             'time_spend_company']

interval_var = ['satisfaction_level',
                'last_evaluation',
                'average_montly_hours',
                'satisfaction_level^1xlast_evaluation^1',
                'satisfaction_level^1xnumber_project^1',
                'satisfaction_level^1xaverage_montly_hours^1',
                'satisfaction_level^1xtime_spend_company^1',
                'last_evaluation^1xnumber_project^1',
                'last_evaluation^1xaverage_montly_hours^1',
                'last_evaluation^1xtime_spend_company^1',
                'number_project^1xaverage_montly_hours^1',
                'number_project^1xtime_spend_company^1',
                'average_montly_hours^1xtime_spend_company^1']

num_var = count_var + interval_var
cat_var = nominal_var + ordinal_var + binary_var

## Transform the data

In [83]:
model_pipeline = Pipeline(steps=[
    
    ('features', FeatureUnion([
        
        ('numerical_features', ColumnTransformer([
            ('numerical', Pipeline(steps=[
                ('min_max_scale', MinMaxScaler()),
                ('num_selection', VarianceThreshold(threshold = 0.05))]),
            ['number_project', 'time_spend_company', 'satisfaction_level', 'last_evaluation',                                  'average_montly_hours', 'satisfaction_level^1xlast_evaluation^1',                                                  'satisfaction_level^1xnumber_project^1', 'satisfaction_level^1xaverage_montly_hours^1',                            'satisfaction_level^1xtime_spend_company^1', 'last_evaluation^1xnumber_project^1',                                'last_evaluation^1xaverage_montly_hours^1', 'last_evaluation^1xtime_spend_company^1',                              'number_project^1xaverage_montly_hours^1', 'number_project^1xtime_spend_company^1',                                'average_montly_hours^1xtime_spend_company^1'])
        ])),
        
        ('categorical_features', ColumnTransformer([
            ('categorical', Pipeline(steps=[
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ("cat_selection", SelectKBest(score_func = chi2, k = 10))]),
            ['dept'])
        ],
        remainder = 'passthrough'))
    ])),
    
    #('classifiers', xgb.XGBClassifier())
])

Helen Tip:
pipeline.named_steps['feature_selection_thing'] will show you your select K best thing.

Add onto that as if it's the feature selector to find what you need

In [80]:
model_pipeline.fit(X_train, y_train)
print(classification_report(y_test,model_pipeline.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3428
           1       0.97      0.91      0.94      1072

    accuracy                           0.97      4500
   macro avg       0.97      0.95      0.96      4500
weighted avg       0.97      0.97      0.97      4500



In [54]:
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.05]
}

In [55]:
num_pipeline = Pipeline(steps=[
    ('min_max_scale', MinMaxScaler()),
    ('num_selection', VarianceThreshold(threshold = 0.05)),
    ('xg_boost', xgb.XGBClassifier(scale_pos_weight=3.2,verbosity=0,random_state=42))
    ])


cv_pipeline = GridSearchCV(estimator=num_pipeline, param_grid=parameters)
cv_pipeline.fit(X=X_train, y=y_train)
final_pipeline = cv_pipeline.best_estimator_
final_classifier = final_pipeline.named_steps['xg_boost']

ValueError: Invalid parameter learning_rate for estimator Pipeline(memory=None,
         steps=[('min_max_scale',
                 MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('num_selection', VarianceThreshold(threshold=0.05)),
                ('xg_boost',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='binary:logistic', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=3.2,
                               seed=None, silent=None, subsample=1,
                               verbosity=0))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [58]:
xgb.XGBClassifier().get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

In [25]:
cat_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ("cat_selection", SelectKBest(score_func = chi2, k = 10))
])

In [26]:
preprocessor = ColumnTransformer(transformers = [
                                                ('numerical', num_pipeline, num_var),
                                                ('categorical', cat_pipeline, nominal_var)
                                                ],
                                 remainder = 'passthrough')

In [27]:
# Assign variables
X = df[num_var + cat_var]
y = df[target_var]

# Then split the remaining into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [11]:
nominal_transformer = OneHotEncoder([['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'],
                                    ['low', 'medium', 'high'],
                                    ['unsatisfied', 'very satisfied', 'very unsatisfied', 'satisfied'],
                                    ['low performance', 'high performance', 'very high performance',
       'average performance'],
                                    ['low', 'high', 'medium']
                                    ])

num_transformer = StandardScaler()

In [12]:
preprocessor = ColumnTransformer(transformers = [
                                                ('scaler', num_transformer, num_var),
                                                ('onehot', nominal_transformer, ordinal_var)
                                                ],
                                remainder = 'passthrough')

In [56]:
# Assign variables
X = df[num_var + cat_var]
y = df[target_var]

# Then split the remaining into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [32]:
X_train.shape

(10499, 22)

In [34]:
# Fitting the preprocessor
X_train_transformed = preprocessor.fit_transform(X_train,y_train)

# Transform the input validation and test sets
X_test_transformed = preprocessor.transform(X_test)

In [35]:
# why we name the transformers
preprocessor.named_transformers_.keys()

dict_keys(['numerical', 'categorical', 'remainder'])

## Put all the transformed X data back into a dataframe

In [36]:
preprocessor.named_transformers_

{'numerical': Pipeline(memory=None,
          steps=[('min_max_scale',
                  MinMaxScaler(copy=True, feature_range=(0, 1))),
                 ('num_selection', VarianceThreshold(threshold=0.05))],
          verbose=False),
 'categorical': Pipeline(memory=None,
          steps=[('onehot',
                  OneHotEncoder(categories='auto', drop=None,
                                dtype=<class 'numpy.float64'>,
                                handle_unknown='ignore', sparse=True)),
                 ('cat_selection',
                  SelectKBest(k=10,
                              score_func=<function chi2 at 0x7f885115f378>))],
          verbose=False),
 'remainder': 'passthrough'}

In [44]:
#Grab the column names for all of our nominal variables
preprocessor.named_transformers_.categorical.named_steps.onehot.get_feature_names

<bound method OneHotEncoder.get_feature_names of OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)>

In [41]:
one_hot_columns

<bound method OneHotEncoder.get_feature_names of OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)>

In [45]:
transformed_columns = num_var + list(one_hot_columns) + binary_var

TypeError: 'method' object is not iterable

In [20]:
len(transformed_columns)

41

In [21]:
X_train_transformed.shape # Great! They match

(10499, 41)

In [22]:
X_train_transform_df = pd.DataFrame(X_train_transformed, columns = transformed_columns)
X_test_transform_df = pd.DataFrame(X_test_transformed, columns = transformed_columns)

In [23]:
classifiers = [
    LogisticRegression(penalty = 'l1', solver='saga', class_weight='balanced',random_state=42, C=1),
    #SVC(kernel="rbf", C=0.025, random_state=42, probability=True, class_weight='balanced'),
    DecisionTreeClassifier(class_weight='balanced', random_state=42),
    DecisionTreeRegressor(random_state=42),
    #VotingClassifier(random_state=42),
    RandomForestClassifier(class_weight='balanced', random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    xgb.XGBClassifier(scale_pos_weight=3.2,verbosity=0,random_state=42),
    CatBoostClassifier(class_weights=[1,3.2],verbose=False,random_seed=42)
    ]

for classifier in classifiers:
    pipe = Pipeline(steps=[('classifier', classifier)])
    pipe.fit(X_train_transform_df, y_train)
    print(f'*****{classifier}*****')
    print(classification_report(y_train,pipe.predict(X_train_transform_df)))
    print(classification_report(y_test,pipe.predict(X_test_transform_df)))
    #print("model recall score: %.3f" % pipe.score(X_test_transform_df, y_test))

*****LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)*****
              precision    recall  f1-score   support

           0       0.97      0.92      0.95      8000
           1       0.79      0.92      0.85      2499

    accuracy                           0.92     10499
   macro avg       0.88      0.92      0.90     10499
weighted avg       0.93      0.92      0.93     10499

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      3428
           1       0.77      0.90      0.83      1072

    accuracy                           0.91      4500
   macro avg       0.87      0.91      0.89      4500
weighted avg       0.92      0.91      0.91      4500

*****Deci

In this case, we want to minimize the number of FN (Employees who are incorrectly predicted to not churn; they were actually supposed to churn), so we want a better recall score.