# Data

The dataset is related to direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe (1/0) to a term deposit (variable y).

In [1]:
# Carregar bibliotecas
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.pipeline import Pipeline

In [2]:
# Não ter problemas com os warnings dos pacotes
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Importa base
data = pd.read_csv("banking.csv")
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


# Input variables
1. age (numeric) <br> 
<br>
2. job : type of job (categorical: “admin”, “blue-collar”, “entrepreneur”, “housemaid”, “management”, “retired”, “self-employed”, “services”, “student”, “technician”, “unemployed”, “unknown”) <br>
<br>
3. marital : marital status (categorical: “divorced”, “married”, “single”, “unknown”) <br>
<br>
4. education (categorical: “basic.4y”, “basic.6y”, “basic.9y”, “high.school”, “illiterate”, “professional.course”, “university.degree”, “unknown”) <br>
<br>
5. default: has credit in default? (categorical: “no”, “yes”, “unknown”) <br>
<br>
6. housing: has housing loan? (categorical: “no”, “yes”, “unknown”) <br>
<br>
7. loan: has personal loan? (categorical: “no”, “yes”, “unknown”) <br>
<br>
8. contact: contact communication type (categorical: “cellular”, “telephone”) <br>
<br>
9. month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”) <br>
<br>
10. day_of_week: last contact day of the week (categorical: “mon”, “tue”, “wed”, “thu”, “fri”) <br>
<br>
11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=’no’). The duration is not known before a call is performed, also, after the end of the call, y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model <br>
<br>
12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) <br>
<br>
13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) <br>
<br>
14. previous: number of contacts performed before this campaign and for this client (numeric) <br>
<br>
15. poutcome: outcome of the previous marketing campaign (categorical: “failure”, “nonexistent”, “success”) <br>
<br>
16. emp.var.rate: employment variation rate — (numeric) <br>
<br>
17. cons.price.idx: consumer price index — (numeric) <br>
<br>
18. cons.conf.idx: consumer confidence index — (numeric) <br>
<br>
19. euribor3m: euribor 3 month rate — (numeric) <br>
<br>
20. nr.employed: number of employees — (numeric) <br>

### Predict variable (desired target):

y — has the client subscribed a term deposit? (binary: “1”, means “Yes”, “0” means “No”)

# Pipeline

Antes de iniciar os testes da pipeline, eu vou separar o modelo em treino e teste para conseguir validar.

In [4]:
# Separa as informações de preditoras
X = data.loc[:, data.columns != 'y']
y = data.loc[:, data.columns == 'y']

In [5]:
# Cria bases de teste e treino
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [6]:
# Import usefull libraries
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline

Vou criar uma etapa de seleção de variáveis. Daí, uma base "crua" será passada e teremos apenas as variáveis que vamos considerar no modelo.

In [7]:
# Custom Transformer that extracts columns passed as argument to its constructor
class FeatureSelector(BaseEstimator, TransformerMixin):
    # Class Constructor
    def __init__(self, feature_names):
        self._feature_names = feature_names

    # Return self nothing else to do here
    def fit(self, X, y=None):
        return self
    
    # Method that describes what we neeed this transformer to do
    def transform(self, X, y=None):
        return X[self._feature_names]

In [8]:
# Defining the steps in the categorical pipeline
categorical_features = ['marital', 'contact', 'job']

feature_selector = Pipeline(steps=[('feature_selector', FeatureSelector(categorical_features))])

Vou usar o método `.fit_transform()` para testar a pipeline.

In [9]:
X_temp = feature_selector.fit_transform(X_train)

Observe que eu não precisei colocar a informação das categorias para selecionar as variáveis que queremos, na pipeline já salvamos isso.

In [10]:
X_temp.head()

Unnamed: 0,marital,contact,job
31880,single,cellular,technician
38177,married,telephone,admin.
2459,married,telephone,management
756,married,cellular,blue-collar
11275,single,cellular,admin.


In [11]:
# Converts certain features to binary
class CategoricalBinary(TransformerMixin):
    import pandas as pd
    # Class Constructor
    def __init__(self):
        pass
    
    # Return self nothing else to do here
    def fit(self, X, y=None):
        return self
    
    # Faz as transformações com a função get_dummies
    def transform(self, X, y=None):
        X = pd.get_dummies(X, columns=X.columns.tolist())
        return X

In [12]:
# Defining the steps in the categorical pipeline
categorical_features = ['marital', 'contact', 'job']

categorical_transform = Pipeline(steps=[('feature_selector', FeatureSelector(categorical_features)), 
                                        ('categorical_dummy', CategoricalBinary())])

In [13]:
X_temp = categorical_transform.fit_transform(X_train)
X_temp.head()

Unnamed: 0,marital_divorced,marital_married,marital_single,marital_unknown,contact_cellular,contact_telephone,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
31880,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
38177,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2459,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
756,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
11275,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0


A seguir separamos as variáveis que entraram no modelo com as variáveis que estarão no basal. Além disso, garantimos que todas as variáveis que estão no modelo, estejam disponíveis para o cálculo.

In [14]:
# Custom Transformer that extracts columns passed as argument to its constructor
class ModelFeatureSelector(BaseEstimator, TransformerMixin):
    # Class Constructor
    def __init__(self, feature_names):
        self._feature_names = feature_names

    # Return self nothing else to do here
    def fit(self, X, y=None):
        return self
    
    # Method that describes what we neeed this transformer to do
    def transform(self, X, y=None):
        # Garante que as variáveis que não foram criadas na dummificação estejam disponíveis para o cálculo
        for i in self._feature_names:
            if i not in X.columns.tolist(): 
                X.loc[:, str(i)] = 0

        return X[self._feature_names]

In [15]:
# Defining the steps in the categorical pipeline
model_features = ['marital_divorced', 'marital_married', 'marital_single', 
                        'contact_cellular', 'job_admin.', 'job_blue-collar', 
                        'job_entrepreneur', 'job_housemaid', 'job_management', 
                        'job_retired', 'job_self-employed', 'job_services',
                        'job_student', 'job_technician', 'job_unemployed']

model_features_selector = Pipeline(steps=[('model_features', ModelFeatureSelector(model_features))])

In [16]:
model_features_selector.fit_transform(X_train)

Unnamed: 0,marital_divorced,marital_married,marital_single,contact_cellular,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed
31880,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
756,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11275,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
32103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Apenas ficamos com as variáveis selecionadas.

Por fim, adicionamos um modelo logístico para calcular o predito.

In [17]:
# Defining the steps in the categorical pipeline
categorical_features = ['marital', 'contact', 'job']

model_features = ['marital_divorced', 'marital_married', 'marital_single',
                  'contact_cellular', 'job_admin.', 'job_blue-collar',
                  'job_entrepreneur', 'job_housemaid', 'job_management',
                  'job_retired', 'job_self-employed', 'job_services',
                  'job_student', 'job_technician', 'job_unemployed']

categorical_transform = Pipeline(steps=[('feature_selector', FeatureSelector(categorical_features)),
                                        ('categorical_dummy', CategoricalBinary()),
                                        ('model_features', ModelFeatureSelector(model_features)),
                                        ('logreg', LogisticRegression(class_weight='balanced', solver='liblinear'))])

# Modelo em pipeline

In [18]:
#Observe a base treino
X_train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed
31880,31,technician,single,university.degree,no,no,no,cellular,nov,tue,...,0,0,0,0,0,0,0,0,0,0
38177,44,admin.,married,professional.course,no,yes,no,telephone,may,mon,...,0,0,0,0,0,0,0,0,0,0
2459,43,management,married,professional.course,no,yes,no,telephone,may,mon,...,0,0,0,0,0,0,0,0,0,0
756,35,blue-collar,married,basic.4y,no,no,no,cellular,nov,tue,...,0,0,0,0,0,0,0,0,0,0
11275,33,admin.,single,university.degree,no,yes,yes,cellular,may,thu,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Modelo pipeline
categorical_transform.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('feature_selector', FeatureSelector(feature_names=None)),
                ('categorical_dummy',
                 <__main__.CategoricalBinary object at 0x000001E57A1ABB08>),
                ('model_features', ModelFeatureSelector(feature_names=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [20]:
# Importa o pacote que serializa objetos. Ele vai nos auxiliar na portabilidade do nosso modelo. 
# É o equivalente do pickle para pipelines
import dill

# Objetivo final: salve seu modelo num arquivo .pkl
# Esse arquivo será salvo no diretório desse bloco de anotações
with open('model_pipe.pkl', 'wb') as f:
    dill.dump(categorical_transform, f)

# load
obj = dill.load(open('model_pipe.pkl', 'rb'))

In [21]:
# Predito classe
print(obj.predict(X_test))

# Predito probabilidade
print(obj.predict_proba(X_test))

[0 1 0 ... 0 0 0]
[[0.55401264 0.44598736]
 [0.49040043 0.50959957]
 [0.72977412 0.27022588]
 ...
 [0.74223654 0.25776346]
 [0.50944944 0.49055056]
 [0.72738996 0.27261004]]


In [22]:
# Observe que a base teste está intacta
X_test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed
15500,51,blue-collar,married,basic.9y,no,yes,no,cellular,apr,mon,104,4,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1
29034,31,technician,married,professional.course,no,no,no,cellular,apr,mon,551,3,999,1,failure,-1.8,93.075,-47.1,1.466,5099.1
11199,33,blue-collar,single,basic.9y,no,no,no,telephone,jul,tue,116,2,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1
22247,29,admin.,married,high.school,no,no,no,telephone,jun,mon,100,1,999,0,nonexistent,1.4,94.465,-41.8,4.961,5228.1
27180,57,housemaid,married,basic.4y,no,no,no,cellular,nov,fri,216,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8
