### IMPORTS

In [None]:
!pip install -q scikit-optimize

In [11]:
!pip install category_encoders



In [12]:
!pip install xgboost



In [13]:
import os
import math
import timeit
from collections import Counter
import gc
import pickle

import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import category_encoders as ce

from sklearn import metrics, tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from IPython.display import display, display_html
import xgboost as xgb

### UDFs

In [14]:
def my_print(s):
    """
    Custom print with timestamp.
    """
    print("[{}] {}".format(datetime.now().strftime('%d/%m/%Y %H:%M:%S'), s))

In [15]:
def get_missing_values(pdf):
        """
        Retorna um Pandas Series com as colunas que tem ao
        menos um missing value.
        """
        
        missing = pdf.isnull().sum()
        return missing[missing > 0]

### LOAD DATA

In [16]:
df = pd.read_csv("./data/Churn Modeling.csv")

In [17]:
df.shape

(10000, 14)

In [18]:
get_missing_values(df)

Series([], dtype: int64)

In [19]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [20]:
df = df.drop(['RowNumber','CustomerId','Surname'], axis=1)

In [21]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [22]:
var_cat = ['Geography','Gender']

In [23]:
var_num = ['CreditScore','Age', 'Tenure', 'Balance','NumOfProducts','EstimatedSalary','HasCrCard','IsActiveMember']

In [24]:
var_target = ['Exited']

In [25]:
features = list(var_cat + var_num)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [27]:
features

['Geography',
 'Gender',
 'CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'EstimatedSalary',
 'HasCrCard',
 'IsActiveMember']

In [28]:
X = df[features]
y = df[var_target]

### Spliting Data

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [30]:
len(X), len(X_train), len(X_test), len(y_train), len(y_test)

(10000, 7500, 2500, 7500, 2500)

In [31]:
X_train

Unnamed: 0,Geography,Gender,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,HasCrCard,IsActiveMember
2967,Germany,Female,579,39,5,117833.30,3,5831.00,0,0
700,France,Female,750,32,5,0.00,2,95611.47,1,0
3481,Spain,Female,729,34,9,53299.96,2,42855.97,1,1
1621,Spain,Male,689,38,5,75075.14,1,8651.92,1,1
800,France,Male,605,52,7,0.00,2,173952.50,1,1
...,...,...,...,...,...,...,...,...,...,...
9225,Germany,Female,594,32,4,120074.97,2,162961.79,1,1
4859,Spain,Female,794,22,4,114440.24,1,107753.07,1,1
3264,France,Male,738,35,5,161274.05,2,181429.87,1,0
9845,Spain,Female,590,38,9,0.00,2,148750.16,1,1


### Pipeline

In [34]:
OHE = OneHotEncoder()
scaler = StandardScaler()
XGB = xgb.XGBClassifier(max_depth=2, eta= 0.35)


transformer = ColumnTransformer([('cat_cols', OHE, var_cat),
                                ('num_cols', scaler, var_num)])

pipe = Pipeline([("preprocessing", transformer),
                ("classifier", XGB)])

pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('cat_cols', OneHotEncoder(),
                                                  ['Geography', 'Gender']),
                                                 ('num_cols', StandardScaler(),
                                                  ['CreditScore', 'Age',
                                                   'Tenure', 'Balance',
                                                   'NumOfProducts',
                                                   'EstimatedSalary',
                                                   'HasCrCard',
                                                   'IsActiveMember'])])),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, c...
                               eval_metric=None, gamma=0, gpu_id=-1,
                               grow_policy='depthwise', importance_type=

In [35]:
X_test.columns

Index(['Geography', 'Gender', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'EstimatedSalary', 'HasCrCard', 'IsActiveMember'],
      dtype='object')

In [36]:
predicoes = pipe.predict(X_test)
predicoes[:10]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [37]:
y_test.values[:10].flatten()

array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [38]:
accuracy_score(y_test, predicoes)

0.8656

In [39]:
f1_score(y_test, predicoes)

0.6137931034482759

In [40]:
roc_auc_score(y_test, predicoes)

0.7386727503628805

In [43]:

import pickle

pickle.dump(pipe, open('models/pipe.pkl','wb'))

#### Target Enconder
    

In [22]:
target_enconder = ce.TargetEncoder(
    cols=var_cat,
    verbose=10
)
target_enconder.fit(X,y)



TargetEncoder(cols=['Geography', 'Gender'], verbose=10)

In [23]:
X_train = target_enconder.transform(X_train)
X_test = target_enconder.transform(X_test)

#### OneHot Enconder

In [24]:
onehot_enconder = ce.OneHotEncoder(cols=var_cat)
onehot_enconder.fit(X_train)

OneHotEncoder(cols=['Geography', 'Gender'])

In [25]:
X_train_OHE = onehot_enconder.transform(X_train)
X_test_OHE = onehot_enconder.transform(X_test)