# Previsão de Churn

***
## 4. Pipeline de Inferência

### Objetivo:
Partindo do modelo em pickle salvo na etapa anterior:
- carregar base de dados de teste oferecida no Case
- carregar modelo pickle salvo
- realizar predição dos novos dados

****
### Carrega bibliotecas


In [1]:
# Bibliotecas
import pandas as pd
import pickle


### Inputação dos dados


In [17]:
# Buscar dados do arquivo CSV
csv_file = "../data/01_raw/database_test.csv"
df = pd.read_csv(csv_file, sep=";")

df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,10001,15798485,Copley,565,France,Male,31,1,0.00,1,0,1,20443.08
1,10002,15588959,T'ang,569,France,Male,34,4,0.00,1,0,1,4045.90
2,10003,15624896,Ku,669,France,Female,20,7,0.00,2,1,0,128838.67
3,10004,15639629,McConnan,694,France,Male,39,4,173255.48,1,1,1,81293.10
4,10005,15638852,Ts'ui,504,Spain,Male,28,10,109291.36,1,1,1,187593.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,10996,15730373,Starks,531,France,Female,34,10,118306.79,1,1,0,26493.05
996,10997,15716191,Dixon,575,Germany,Male,49,2,136822.70,1,1,0,2487.74
997,10998,15673900,Wilkinson,520,France,Female,74,4,0.00,1,0,0,26742.92
998,10999,15581432,Oatley,675,Spain,Male,23,8,0.00,2,0,0,162342.21


### Aplica pre-processamento da mesma forma que foi feita para o treinamento do modelo

In [18]:
# Remove colunas que não serão utilizadas
df1 = df.drop(columns = ['RowNumber', 'CustomerId', 'Surname'])
df1

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,565,France,Male,31,1,0.00,1,0,1,20443.08
1,569,France,Male,34,4,0.00,1,0,1,4045.90
2,669,France,Female,20,7,0.00,2,1,0,128838.67
3,694,France,Male,39,4,173255.48,1,1,1,81293.10
4,504,Spain,Male,28,10,109291.36,1,1,1,187593.15
...,...,...,...,...,...,...,...,...,...,...
995,531,France,Female,34,10,118306.79,1,1,0,26493.05
996,575,Germany,Male,49,2,136822.70,1,1,0,2487.74
997,520,France,Female,74,4,0.00,1,0,0,26742.92
998,675,Spain,Male,23,8,0.00,2,0,0,162342.21


In [4]:
# Separação das variáveis

# Variáveis Continuas
x_cont = ['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']
x_cont

['CreditScore', 'Balance', 'Age', 'NumOfProducts', 'EstimatedSalary', 'Tenure']

In [5]:
# Variáveis Categóricas
x_cat = list(set(df1)-set(x_cont))
print(x_cat)

x_dummies = df1[x_cat]
x_dummies

['Geography', 'HasCrCard', 'Gender', 'IsActiveMember']


Unnamed: 0,Geography,HasCrCard,Gender,IsActiveMember
0,France,0,Male,1
1,France,0,Male,1
2,France,1,Female,0
3,France,1,Male,1
4,Spain,1,Male,1
...,...,...,...,...
995,France,1,Female,0
996,Germany,1,Male,0
997,France,0,Female,0
998,Spain,0,Male,0


In [6]:
# Processamento das variaveis categoricas

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df1['Gender'] = le.fit_transform(df1['Gender'])

df1

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,565,France,1,31,1,0.00,1,0,1,20443.08
1,569,France,1,34,4,0.00,1,0,1,4045.90
2,669,France,0,20,7,0.00,2,1,0,128838.67
3,694,France,1,39,4,173255.48,1,1,1,81293.10
4,504,Spain,1,28,10,109291.36,1,1,1,187593.15
...,...,...,...,...,...,...,...,...,...,...
995,531,France,0,34,10,118306.79,1,1,0,26493.05
996,575,Germany,1,49,2,136822.70,1,1,0,2487.74
997,520,France,0,74,4,0.00,1,0,0,26742.92
998,675,Spain,1,23,8,0.00,2,0,0,162342.21


In [7]:
df_final = pd.get_dummies(data=df1, columns=['Geography'])

df_final

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,565,1,31,1,0.00,1,0,1,20443.08,1,0,0
1,569,1,34,4,0.00,1,0,1,4045.90,1,0,0
2,669,0,20,7,0.00,2,1,0,128838.67,1,0,0
3,694,1,39,4,173255.48,1,1,1,81293.10,1,0,0
4,504,1,28,10,109291.36,1,1,1,187593.15,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,531,0,34,10,118306.79,1,1,0,26493.05,1,0,0
996,575,1,49,2,136822.70,1,1,0,2487.74,0,1,0
997,520,0,74,4,0.00,1,0,0,26742.92,1,0,0
998,675,1,23,8,0.00,2,0,0,162342.21,0,0,1


In [8]:
# Feature Engineering

df_final['Salary_per_Age'] = df_final['EstimatedSalary'] / df_final['Age']
df_final['CreditScore_per_Products'] = df_final['CreditScore'] / df_final['NumOfProducts']
df_final['CreditScore_per_Salary'] = df_final['CreditScore'] / df_final['EstimatedSalary']


df_final

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Salary_per_Age,CreditScore_per_Products,CreditScore_per_Salary
0,565,1,31,1,0.00,1,0,1,20443.08,1,0,0,659.454194,565.0,0.027638
1,569,1,34,4,0.00,1,0,1,4045.90,1,0,0,118.997059,569.0,0.140636
2,669,0,20,7,0.00,2,1,0,128838.67,1,0,0,6441.933500,334.5,0.005193
3,694,1,39,4,173255.48,1,1,1,81293.10,1,0,0,2084.438462,694.0,0.008537
4,504,1,28,10,109291.36,1,1,1,187593.15,0,0,1,6699.755357,504.0,0.002687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,531,0,34,10,118306.79,1,1,0,26493.05,1,0,0,779.207353,531.0,0.020043
996,575,1,49,2,136822.70,1,1,0,2487.74,0,1,0,50.770204,575.0,0.231133
997,520,0,74,4,0.00,1,0,0,26742.92,1,0,0,361.390811,520.0,0.019444
998,675,1,23,8,0.00,2,0,0,162342.21,0,0,1,7058.356957,337.5,0.004158


In [9]:
new_cont = ['Salary_per_Age', 'CreditScore_per_Products', 'CreditScore_per_Salary']

for new_var_cont in new_cont:
    x_cont.append(new_var_cont)

x_cont

['CreditScore',
 'Balance',
 'Age',
 'NumOfProducts',
 'EstimatedSalary',
 'Tenure',
 'Salary_per_Age',
 'CreditScore_per_Products',
 'CreditScore_per_Salary']

In [10]:
# Carrega scaler no arquivo pickle
with open('../data/06_models/scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [11]:
# Aplica scaler
df_final[x_cont] = scaler.fit_transform(df_final[x_cont])

df_final

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Salary_per_Age,CreditScore_per_Products,CreditScore_per_Salary
0,0.411157,1,0.178082,0.1,0.000000,0.000000,0,1,0.101298,1,0,0,0.070530,0.603432,0.008799
1,0.419421,1,0.219178,0.4,0.000000,0.000000,0,1,0.019060,1,0,0,0.012171,0.608998,0.048133
2,0.626033,0,0.027397,0.7,0.000000,0.333333,1,0,0.644939,1,0,0,0.694920,0.282699,0.000986
3,0.677686,1,0.287671,0.4,0.819096,0.000000,1,1,0.406481,1,0,0,0.224399,0.782931,0.002150
4,0.285124,1,0.136986,1.0,0.516695,0.000000,1,1,0.939612,0,0,1,0.722760,0.518553,0.000114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.340909,0,0.219178,1.0,0.559317,0.000000,1,0,0.131640,1,0,0,0.083461,0.556122,0.006156
996,0.431818,1,0.424658,0.2,0.646854,0.000000,1,0,0.011246,0,1,0,0.004804,0.617347,0.079635
997,0.318182,0,0.767123,0.4,0.000000,0.000000,0,0,0.132894,1,0,0,0.038345,0.540816,0.005947
998,0.638430,1,0.068493,0.8,0.000000,0.333333,0,0,0.812970,0,0,1,0.761482,0.286874,0.000626


In [12]:
# Carrega modelo no arquivo pickle
with open('../data/06_models/rf_model.pkl', 'rb') as file:
    rf_model = pickle.load(file)

In [13]:
df['predictedValues'] = rf_model.predict(df_final)

df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,predictedValues
0,10001,15798485,Copley,565,France,Male,31,1,0.00,1,0,1,20443.08,0
1,10002,15588959,T'ang,569,France,Male,34,4,0.00,1,0,1,4045.90,0
2,10003,15624896,Ku,669,France,Female,20,7,0.00,2,1,0,128838.67,0
3,10004,15639629,McConnan,694,France,Male,39,4,173255.48,1,1,1,81293.10,0
4,10005,15638852,Ts'ui,504,Spain,Male,28,10,109291.36,1,1,1,187593.15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,10996,15730373,Starks,531,France,Female,34,10,118306.79,1,1,0,26493.05,0
996,10997,15716191,Dixon,575,Germany,Male,49,2,136822.70,1,1,0,2487.74,1
997,10998,15673900,Wilkinson,520,France,Female,74,4,0.00,1,0,0,26742.92,0
998,10999,15581432,Oatley,675,Spain,Male,23,8,0.00,2,0,0,162342.21,0


In [14]:
# Saida no formato solicitado no Case
df_out = df[['RowNumber', 'predictedValues']]

df_out

Unnamed: 0,RowNumber,predictedValues
0,10001,0
1,10002,0
2,10003,0
3,10004,0
4,10005,0
...,...,...
995,10996,0
996,10997,1
997,10998,0
998,10999,0


In [15]:
print(f'Quantidade de prediçães de Churn positivo: {len(df_out[df_out["predictedValues"] == 1])}')
print(f'Quantidade de prediçães de Churn negativo: {len(df_out[df_out["predictedValues"] == 0])}')

Quantidade de prediçães de Churn positivo: 89
Quantidade de prediçães de Churn negativo: 911


***
### Persistência dos dados de resposta


In [16]:
df_out.to_csv('../data/08_reporting/predicted_data.csv', index=False)

***
# Conclusão

Com o procedimento apresentado foi possível carregar os dados de teste, o scaler utilizado na normalização dos dados e o modelo treinado na etapa anterior; aplicar o mesmo processamento dos dados de treino do modelo e fazer a predição.