# Predicting Churn

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

np.random.seed(123)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=3)
pd.options.display.max_rows = 20
pd.options.display.max_columns = 30

## Perguntas:

1. Qual a taxa atual de Churn da TopBank? Como ela varia mensalmente?  
2. Qual a Performance do modelo em classificar os clientes como churns?  
3. Qual o retorno esperado, em termos de faturamento, se a empresa utilizar seu modelo para evitar o churn dos clientes?  
4. Para quais clientes você daria o incentivo financeiro e qual seria esse valor, de modo a maximizar o ROI (Retorno sobre o investimento).  
Lembrando que a soma dos incentivos para cada cliente não pode ultrapassar os  R$10.000,00

### Dataset

**RowNumber**: corresponds to the record (row) number and has no effect on the output.  
**CustomerId**: contains random values and has no effect on customer leaving the bank.  
**Surname**: the surname of a customer has no impact on their decision to leave the bank.  
**CreditScore**: can have an effect on customer churn, since a customer with a higher credit score is less likely to leave the bank.  
**Geography**: a customer’s location can affect their decision to leave the bank.  
**Gender**: it’s interesting to explore whether gender plays a role in a customer leaving the bank.  
**Age**: this is certainly relevant, since older customers are less likely to leave their bank than younger ones.  
**Tenure**: refers to the number of years that the customer has been a client of the bank. Normally, older clients are more loyal and less likely to leave a bank.  
**Balance**: also a very good indicator of customer churn, as people with a higher balance in their accounts are less likely to leave the bank compared to those with lower balances.  
**NumOfProducts**: refers to the number of products that a customer has purchased through the bank.  
**HasCrCard**: denotes whether or not a customer has a credit card. This column is also relevant, since people with a credit card are less likely to leave the bank.  
**IsActiveMember**: active customers are less likely to leave the bank.  
**EstimatedSalary**: as with balance, people with lower salaries are more likely to leave the bank compared to those with higher salaries.  
**Exited**: whether or not the customer left the bank. (0=No,1=Yes)

## Read Data Set:

In [36]:
df = pd.read_csv('data/churn.csv')
print(df.columns)
df.head(2)

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [37]:
cols = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
        'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Xraw = df[cols].copy()
yraw = df['Exited'].copy()

In [38]:
Xraw.sample(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2656,779,Spain,Female,34,5,0.0,2,0,1,111676.63
445,748,France,Female,26,1,77780.29,1,0,1,183049.41


In [39]:
dumb = train_test_split(Xraw, yraw, test_size=0.2, random_state=42)
X, X_test, y, y_test = dumb[0].copy(), dumb[1], dumb[2].copy(), dumb[3]
# check if proportion is stratified (it should):
print(f'prop of churn in train: {y.sum() / len(y)}')
print(f'prop of churn in test: {y_test.sum() / len(y_test)}')

prop of churn in train: 0.2055
prop of churn in test: 0.1965


# Feature Engineering:

In [40]:
#mean_sal = X.EstimatedSalary.mean()
#print(f'Mean Salary: {mean_sal:,.2f}')
#X['Return'] = X.loc[:, 'EstimatedSalary'].apply(lambda x: int(x > mean_sal))
#X_test['Return'] = X_test.loc[:, 'EstimatedSalary'].apply(lambda x: int(x > mean_sal))

# Pipeline

In [41]:
ohe = OneHotEncoder()
ct = make_column_transformer((ohe, ['Geography', 'Gender']), remainder='passthrough')
logreg = LogisticRegression(solver='liblinear', random_state=1)

In [42]:
pipe = make_pipeline(ct, logreg)
pipe.fit(X, y);

In [43]:
pipe.predict(X)

array([0, 0, 0, ..., 0, 0, 0])

In [44]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.792

In [45]:
params = {}
params['logisticregression__penalty'] = ['l1', 'l2']
params['logisticregression__C'] = [0.1, 1., 10.]
params

{'logisticregression__penalty': ['l1', 'l2'],
 'logisticregression__C': [0.1, 1.0, 10.0]}

In [46]:
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y);

In [47]:
results = pd.DataFrame(grid.cv_results_)
results.sort_values('rank_test_score', inplace=True)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_logisticregression__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.056967,0.019259,0.00302,5.5e-05,1.0,l1,"{'logisticregression__C': 1.0, 'logisticregres...",0.80375,0.796875,0.81875,0.813125,0.8225,0.811,0.009475,1
4,0.028746,0.004583,0.003022,8.3e-05,10.0,l1,"{'logisticregression__C': 10.0, 'logisticregre...",0.80375,0.795,0.819375,0.811875,0.823125,0.810625,0.010255,2
0,0.054586,0.014241,0.003653,0.000549,0.1,l1,"{'logisticregression__C': 0.1, 'logisticregres...",0.804375,0.79875,0.813125,0.81375,0.82,0.81,0.00751,3
1,0.025282,0.002935,0.003041,0.000107,0.1,l2,"{'logisticregression__C': 0.1, 'logisticregres...",0.79875,0.7825,0.793125,0.79375,0.791875,0.792,0.005294,4
3,0.025696,0.001905,0.00307,7.7e-05,1.0,l2,"{'logisticregression__C': 1.0, 'logisticregres...",0.79875,0.7825,0.793125,0.79375,0.791875,0.792,0.005294,4
5,0.024348,0.001582,0.002977,3e-05,10.0,l2,"{'logisticregression__C': 10.0, 'logisticregre...",0.79875,0.7825,0.793125,0.79375,0.791875,0.792,0.005294,4


In [71]:
rf = RandomForestClassifier(n_estimators=100)

In [72]:
pipe = make_pipeline(ct, rf)
pipe.fit(X, y);

In [68]:
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.85875

In [77]:
pipe.named_steps.keys()

dict_keys(['columntransformer', 'randomforestclassifier'])

In [80]:
params = {}
params['randomforestclassifier__min_samples_split'] = [2, 10, 20]
params['randomforestclassifier__min_samples_leaf'] = [1, 10, 50]
params['randomforestclassifier__class_weight'] = [None, 'balanced']
params

{'randomforestclassifier__min_samples_split': [2, 10, 20],
 'randomforestclassifier__min_samples_leaf': [1, 10, 50],
 'randomforestclassifier__class_weight': [None, 'balanced']}

In [81]:
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X, y);

In [82]:
results = pd.DataFrame(grid.cv_results_)
results.sort_values('rank_test_score', inplace=True)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__class_weight,param_randomforestclassifier__min_samples_leaf,param_randomforestclassifier__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.497173,0.024813,0.027072,0.001505,,1,10,"{'randomforestclassifier__class_weight': None,...",0.871875,0.863125,0.869375,0.860625,0.855625,0.864125,0.005884,1
5,0.495733,0.02569,0.028698,0.003235,,10,20,"{'randomforestclassifier__class_weight': None,...",0.871875,0.855,0.865,0.86125,0.8575,0.862125,0.005937,2
3,0.47827,0.035899,0.029224,0.003168,,10,2,"{'randomforestclassifier__class_weight': None,...",0.874375,0.854375,0.859375,0.8625,0.858125,0.86175,0.006828,3
4,0.471356,0.017378,0.027157,0.002603,,10,10,"{'randomforestclassifier__class_weight': None,...",0.873125,0.855,0.863125,0.86125,0.85625,0.86175,0.00644,3
2,0.470955,0.025574,0.026442,0.002273,,1,20,"{'randomforestclassifier__class_weight': None,...",0.875625,0.8525,0.86375,0.856875,0.859375,0.861625,0.007892,5
9,0.55574,0.039446,0.029971,0.001701,balanced,1,2,{'randomforestclassifier__class_weight': 'bala...,0.859375,0.858125,0.865625,0.85875,0.856875,0.85975,0.003052,6
0,0.524265,0.015461,0.028026,0.000315,,1,2,"{'randomforestclassifier__class_weight': None,...",0.85875,0.855,0.86625,0.859375,0.85875,0.859625,0.003657,7
10,0.48237,0.00433,0.027635,0.001025,balanced,1,10,{'randomforestclassifier__class_weight': 'bala...,0.843125,0.855625,0.86,0.845625,0.85,0.850875,0.006232,8
8,0.386335,0.037143,0.024576,0.003073,,50,20,"{'randomforestclassifier__class_weight': None,...",0.846875,0.83875,0.8425,0.8425,0.843125,0.84275,0.00258,9
7,0.387331,0.02501,0.023265,0.002356,,50,10,"{'randomforestclassifier__class_weight': None,...",0.839375,0.834375,0.8475,0.845625,0.8425,0.841875,0.00466,10


In [83]:
print(grid.best_score_)
grid.best_params_

0.8641249999999999


{'randomforestclassifier__class_weight': None,
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 10}