# 6. Modelo - Inferência

Inputa dados de teste, faz pre-processamento, carrega modelo salvo e faz predição.



In [1]:
# Bibliotecas
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle


In [2]:
# Não utilizar notação cientifica para melhorar a visualização
pd.set_option('display.float_format', lambda x: f'{x:.2f}')

# Carrega dados:
# Base de teste
df_teste = pd.read_csv('../data/1_raw/base_pagamentos_teste.csv')
print(f"Tamanho da base de teste: {len(df_teste)}")

# Feature PORCENTAGEM_INADIMPLENCIA_HIST (como são clientes recorrentes, a feature do historico de inadimplencia calculadas previamente e mantidas em banco)
df_features_info = pd.read_csv('../data/3_primary/df_features_info.csv')

# Base Info
df_base_info = pd.read_csv('../data/1_raw/base_info.csv')

# Merges
df = df_teste.merge(df_features_info, on='ID_CLIENTE', how='left')
df = df.merge(df_base_info, on=['ID_CLIENTE', 'SAFRA_REF'], how='left')

# Converte colunas com data para detetime
df['DATA_CADASTRO'] = pd.to_datetime(df['DATA_CADASTRO'])
df['DATA_EMISSAO_DOCUMENTO'] = pd.to_datetime(df['DATA_EMISSAO_DOCUMENTO'])
df['DATA_VENCIMENTO'] = pd.to_datetime(df['DATA_VENCIMENTO'])

df

Tamanho da base de teste: 12275


Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,SEGMENTO_INDUSTRIAL,DOMINIO_EMAIL,PORTE,CEP_2_DIG,DDD_1,DATA_CADASTRO,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS
0,5058298901476893676,2021-07,2021-07-14,2021-08-04,11204.75,4.99,0.00,54.00,PJ,Serviços,GMAIL,MEDIO,99.00,5.00,2014-03-30,467430.00,154.00
1,274692171162531764,2021-07,2021-07-08,2021-08-23,60718.50,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00
2,274692171162531764,2021-07,2021-07-11,2021-08-25,60718.50,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00
3,274692171162531764,2021-07,2021-07-16,2021-08-30,62250.00,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00
4,465309249432033993,2021-07,2021-07-05,2021-07-30,26593.95,6.99,0.00,69.00,PJ,Comércio,GMAIL,GRANDE,76.00,6.00,2014-07-23,873938.00,119.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,705648002974742140,2021-11,2021-11-25,2021-12-13,47010.00,6.99,14.29,67.00,PJ,Indústria,YAHOO,MEDIO,79.00,6.00,2020-01-26,,
12271,4993499380140734678,2021-11,2021-11-25,2021-12-13,122875.35,8.99,2.74,31.00,PJ,Serviços,GMAIL,PEQUENO,37.00,3.00,2015-11-24,311422.00,150.00
12272,4614484019183480654,2021-11,2021-11-26,2028-09-30,302200.00,5.99,1.82,51.00,PJ,Indústria,YAHOO,MEDIO,93.00,5.00,2011-02-14,,
12273,1299146298565441811,2021-11,2021-11-26,2021-12-13,143791.85,5.99,6.06,21.00,PJ,Serviços,HOTMAIL,MEDIO,21.00,2.00,2000-08-15,220341.00,122.00


In [3]:
df.columns

Index(['ID_CLIENTE', 'SAFRA_REF', 'DATA_EMISSAO_DOCUMENTO', 'DATA_VENCIMENTO',
       'VALOR_A_PAGAR', 'TAXA', 'PORCENTAGEM_INADIMPLENCIA_HIST', 'DDD',
       'FLAG_PF', 'SEGMENTO_INDUSTRIAL', 'DOMINIO_EMAIL', 'PORTE', 'CEP_2_DIG',
       'DDD_1', 'DATA_CADASTRO', 'RENDA_MES_ANTERIOR', 'NO_FUNCIONARIOS'],
      dtype='object')

#### Verificação de inconsistência na base de pagamentos de teste

Assim como ao verificar a base de pagamentos de desenvolvimento, verifiquei se haviam inconsistência entre as datas de vencimento e emissão do documento.

In [4]:
# Gera variável DIAS_EMISSAO_VENCIMENTO, que contém os dias corridos entre a data de emissão e a data de pagamento
df['DIAS_EMISSAO_VENCIMENTO'] = (df['DATA_VENCIMENTO'] - df['DATA_EMISSAO_DOCUMENTO']).dt.days

# Caso o DIAS_EMISSAO_VENCIMENTO seja menor que 0, será considerado um caso de inconsistencia, pois o documento teria sido pago antes da emissão, sinalizado como True ou False na variável INCONSISTENCIA
df['INCONSISTENCIA'] = df.apply(lambda x: x['DIAS_EMISSAO_VENCIMENTO'] < 0, axis=1)

df_inconsistencia = df[df['INCONSISTENCIA']]

print(f"Há {len(df_inconsistencia)} registros com data de vencimento antes da data de emissão")
df_inconsistencia

Há 2 registros com data de vencimento antes da data de emissão


Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,SEGMENTO_INDUSTRIAL,DOMINIO_EMAIL,PORTE,CEP_2_DIG,DDD_1,DATA_CADASTRO,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,DIAS_EMISSAO_VENCIMENTO,INCONSISTENCIA
6463,5503073374822479037,2021-09,2021-09-13,2021-04-26,25838.37,4.99,4.72,37.0,PJ,Indústria,GMAIL,MEDIO,21.0,3.0,2000-08-15,253199.0,118.0,-140,True
6527,4095739077267179172,2021-09,2021-09-13,2021-04-26,68695.44,5.99,13.85,21.0,PJ,Comércio,YAHOO,PEQUENO,27.0,2.0,2011-02-14,48865.0,134.0,-140,True


In [5]:
# Trata essa inconsistencia atribuindo a data de pagamento à data de emissão
df.loc[df['INCONSISTENCIA'], 'DATA_VENCIMENTO'] = df.loc[df['INCONSISTENCIA'], 'DATA_EMISSAO_DOCUMENTO']

# Verifica se o problema foi resolvido
df['DIAS_EMISSAO_VENCIMENTO'] = (df['DATA_VENCIMENTO'] - df['DATA_EMISSAO_DOCUMENTO']).dt.days
df['INCONSISTENCIA'] = df.apply(lambda x: x['DIAS_EMISSAO_VENCIMENTO'] < 0, axis=1)
df_inconsistencia = df[df['INCONSISTENCIA']]

print(f"Agora há {len(df_inconsistencia)} registros com data de vencimento antes da data de emissão")
df_inconsistencia

Agora há 0 registros com data de vencimento antes da data de emissão


Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,SEGMENTO_INDUSTRIAL,DOMINIO_EMAIL,PORTE,CEP_2_DIG,DDD_1,DATA_CADASTRO,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,DIAS_EMISSAO_VENCIMENTO,INCONSISTENCIA


In [6]:
# Dropa coluna auxiliar
df = df.drop(columns=['INCONSISTENCIA'])
df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,SEGMENTO_INDUSTRIAL,DOMINIO_EMAIL,PORTE,CEP_2_DIG,DDD_1,DATA_CADASTRO,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,DIAS_EMISSAO_VENCIMENTO
0,5058298901476893676,2021-07,2021-07-14,2021-08-04,11204.75,4.99,0.00,54.00,PJ,Serviços,GMAIL,MEDIO,99.00,5.00,2014-03-30,467430.00,154.00,21
1,274692171162531764,2021-07,2021-07-08,2021-08-23,60718.50,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00,46
2,274692171162531764,2021-07,2021-07-11,2021-08-25,60718.50,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00,45
3,274692171162531764,2021-07,2021-07-16,2021-08-30,62250.00,5.99,2.65,19.00,PJ,Serviços,GMAIL,PEQUENO,13.00,1.00,2005-08-03,417192.00,104.00,45
4,465309249432033993,2021-07,2021-07-05,2021-07-30,26593.95,6.99,0.00,69.00,PJ,Comércio,GMAIL,GRANDE,76.00,6.00,2014-07-23,873938.00,119.00,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,705648002974742140,2021-11,2021-11-25,2021-12-13,47010.00,6.99,14.29,67.00,PJ,Indústria,YAHOO,MEDIO,79.00,6.00,2020-01-26,,,18
12271,4993499380140734678,2021-11,2021-11-25,2021-12-13,122875.35,8.99,2.74,31.00,PJ,Serviços,GMAIL,PEQUENO,37.00,3.00,2015-11-24,311422.00,150.00,18
12272,4614484019183480654,2021-11,2021-11-26,2028-09-30,302200.00,5.99,1.82,51.00,PJ,Indústria,YAHOO,MEDIO,93.00,5.00,2011-02-14,,,2500
12273,1299146298565441811,2021-11,2021-11-26,2021-12-13,143791.85,5.99,6.06,21.00,PJ,Serviços,HOTMAIL,MEDIO,21.00,2.00,2000-08-15,220341.00,122.00,17


In [7]:
df.columns

Index(['ID_CLIENTE', 'SAFRA_REF', 'DATA_EMISSAO_DOCUMENTO', 'DATA_VENCIMENTO',
       'VALOR_A_PAGAR', 'TAXA', 'PORCENTAGEM_INADIMPLENCIA_HIST', 'DDD',
       'FLAG_PF', 'SEGMENTO_INDUSTRIAL', 'DOMINIO_EMAIL', 'PORTE', 'CEP_2_DIG',
       'DDD_1', 'DATA_CADASTRO', 'RENDA_MES_ANTERIOR', 'NO_FUNCIONARIOS',
       'DIAS_EMISSAO_VENCIMENTO'],
      dtype='object')

In [8]:
# Gera variável TEMPO_VIDA_CLIENTE, que contém os dias corridos entre a data de cadastro e a data de emissão
df['TEMPO_VIDA_CLIENTE'] = (df['DATA_EMISSAO_DOCUMENTO'] - df['DATA_CADASTRO']).dt.days
df['VALOR_PAGAR_POR_RENDA_ANTERIOR'] = df['VALOR_A_PAGAR'] / df['RENDA_MES_ANTERIOR']
df['VALOR_PAGAR_POR_NO_FUNCIONARIO'] = df['VALOR_A_PAGAR'] / df['NO_FUNCIONARIOS']

In [9]:
print("Quantidade de clientes sem histórico: ", len(df[df['PORCENTAGEM_INADIMPLENCIA_HIST'].isnull()]['ID_CLIENTE'].unique()))
print("Quantidade de clientes com informações completas: ", len(df['ID_CLIENTE'].unique()))

Quantidade de clientes sem histórico:  105
Quantidade de clientes com informações completas:  976


Após esse merge, nota-se então que, embora o enunciado do desafio cite que são clientes recorrentes, há na base de teste 105 clientes novos, que desconhecemos o histórico de pagamento.

Como o XGBoost lida com valores NaN, a variável "PORCENTAGEM_INADIMPLENCIA_HIST" desses clientes será mantida como NaN. 

In [10]:
# Criando features com dia do mês, dia da semana, mês e ano de cada data

# Converter colunas com data de str para datetime
df['DATA_CADASTRO'] = pd.to_datetime(df['DATA_CADASTRO'])
df['DATA_EMISSAO_DOCUMENTO'] = pd.to_datetime(df['DATA_EMISSAO_DOCUMENTO'])
df['DATA_VENCIMENTO'] = pd.to_datetime(df['DATA_VENCIMENTO'])
df['SAFRA_REF'] = pd.to_datetime(df['SAFRA_REF'])

# Extrai dia do mês de cada data
df['DIA_MES_EMISSAO'] = df['DATA_EMISSAO_DOCUMENTO'].dt.day
df['DIA_MES_VENCIMENTO'] = df['DATA_VENCIMENTO'].dt.day
df['DIA_MES_CADASTRO'] = df['DATA_CADASTRO'].dt.day
df['DIA_MES_SAFRA_REF'] = df['SAFRA_REF'].dt.day

# Extrai dia da semana de cada data
df['DIA_SEMANA_EMISSAO'] = df['DATA_EMISSAO_DOCUMENTO'].dt.day_name()
df['DIA_SEMANA_VENCIMENTO'] = df['DATA_VENCIMENTO'].dt.day_name()
df['DIA_SEMANA_CADASTRO'] = df['DATA_CADASTRO'].dt.day_name()
df['DIA_SEMANA_SAFRA_REF'] = df['SAFRA_REF'].dt.day_name()

# Extrai mês de cada data
df['MES_EMISSAO_DOCUMENTO'] = df['DATA_EMISSAO_DOCUMENTO'].dt.month
df['MES_VENCIMENTO'] = df['DATA_VENCIMENTO'].dt.month
df['MES_CADASTRO'] = df['DATA_CADASTRO'].dt.month
df['MES_SAFRA_REF'] = df['SAFRA_REF'].dt.month

# Extrai ano de cada data
df['ANO_EMISSAO_DOCUMENTO'] = df['DATA_EMISSAO_DOCUMENTO'].dt.year
df['ANO_VENCIMENTO'] = df['DATA_VENCIMENTO'].dt.year
df['ANO_CADASTRO'] = df['DATA_CADASTRO'].dt.year
df['ANO_SAFRA_REF'] = df['SAFRA_REF'].dt.year


df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,SEGMENTO_INDUSTRIAL,...,DIA_SEMANA_CADASTRO,DIA_SEMANA_SAFRA_REF,MES_EMISSAO_DOCUMENTO,MES_VENCIMENTO,MES_CADASTRO,MES_SAFRA_REF,ANO_EMISSAO_DOCUMENTO,ANO_VENCIMENTO,ANO_CADASTRO,ANO_SAFRA_REF
0,5058298901476893676,2021-07-01,2021-07-14,2021-08-04,11204.75,4.99,0.00,54.00,PJ,Serviços,...,Sunday,Thursday,7,8,3.00,7,2021,2021,2014.00,2021
1,274692171162531764,2021-07-01,2021-07-08,2021-08-23,60718.50,5.99,2.65,19.00,PJ,Serviços,...,Wednesday,Thursday,7,8,8.00,7,2021,2021,2005.00,2021
2,274692171162531764,2021-07-01,2021-07-11,2021-08-25,60718.50,5.99,2.65,19.00,PJ,Serviços,...,Wednesday,Thursday,7,8,8.00,7,2021,2021,2005.00,2021
3,274692171162531764,2021-07-01,2021-07-16,2021-08-30,62250.00,5.99,2.65,19.00,PJ,Serviços,...,Wednesday,Thursday,7,8,8.00,7,2021,2021,2005.00,2021
4,465309249432033993,2021-07-01,2021-07-05,2021-07-30,26593.95,6.99,0.00,69.00,PJ,Comércio,...,Wednesday,Thursday,7,7,7.00,7,2021,2021,2014.00,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,705648002974742140,2021-11-01,2021-11-25,2021-12-13,47010.00,6.99,14.29,67.00,PJ,Indústria,...,Sunday,Monday,11,12,1.00,11,2021,2021,2020.00,2021
12271,4993499380140734678,2021-11-01,2021-11-25,2021-12-13,122875.35,8.99,2.74,31.00,PJ,Serviços,...,Tuesday,Monday,11,12,11.00,11,2021,2021,2015.00,2021
12272,4614484019183480654,2021-11-01,2021-11-26,2028-09-30,302200.00,5.99,1.82,51.00,PJ,Indústria,...,Monday,Monday,11,9,2.00,11,2021,2028,2011.00,2021
12273,1299146298565441811,2021-11-01,2021-11-26,2021-12-13,143791.85,5.99,6.06,21.00,PJ,Serviços,...,Tuesday,Monday,11,12,8.00,11,2021,2021,2000.00,2021


In [11]:
variaveis_label = ['DDD', 'FLAG_PF', 'PORTE', 'CEP_2_DIG', 'DDD_1', 'DIA_MES_EMISSAO', 'DIA_MES_VENCIMENTO', 'DIA_MES_CADASTRO', 'DIA_MES_SAFRA_REF',
                   'DIA_SEMANA_EMISSAO', 'DIA_SEMANA_VENCIMENTO', 'DIA_SEMANA_CADASTRO', 'DIA_SEMANA_SAFRA_REF', 'MES_EMISSAO_DOCUMENTO', 'MES_VENCIMENTO',
                   'MES_CADASTRO', 'MES_SAFRA_REF', 'ANO_EMISSAO_DOCUMENTO', 'ANO_VENCIMENTO', 'ANO_CADASTRO', 'ANO_SAFRA_REF']
variaveis_one_hot = ['SEGMENTO_INDUSTRIAL', 'DOMINIO_EMAIL']

for var in variaveis_label:
    df[var] = LabelEncoder().fit_transform(df[var])

for var in variaveis_one_hot:
    # Redimensiona a variável para uma matriz com uma única coluna
    var_reshaped = df[var].values.reshape(-1, 1)
    
    # Aplica o OneHotEncoder na variável redimensionada
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(var_reshaped)
    
    # Obtém os nomes das colunas codificadas
    column_names = encoder.get_feature_names_out([var])
    
    # Cria um novo dataframe com as colunas codificadas
    encoded_df = pd.DataFrame(encoded_data.toarray(), columns=column_names)
    
    # Concatena o dataframe codificado com o dataframe original
    df = pd.concat([df, encoded_df], axis=1)
    
    # Remove a coluna original do dataframe
    df.drop(columns=[var], inplace=True)

df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,PORTE,...,SEGMENTO_INDUSTRIAL_Indústria,SEGMENTO_INDUSTRIAL_Serviços,SEGMENTO_INDUSTRIAL_nan,DOMINIO_EMAIL_AOL,DOMINIO_EMAIL_BOL,DOMINIO_EMAIL_GMAIL,DOMINIO_EMAIL_HOTMAIL,DOMINIO_EMAIL_OUTLOOK,DOMINIO_EMAIL_YAHOO,DOMINIO_EMAIL_nan
0,5058298901476893676,2021-07-01,2021-07-14,2021-08-04,11204.75,4.99,0.00,36,1,1,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
1,274692171162531764,2021-07-01,2021-07-08,2021-08-23,60718.50,5.99,2.65,11,1,2,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
2,274692171162531764,2021-07-01,2021-07-11,2021-08-25,60718.50,5.99,2.65,11,1,2,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
3,274692171162531764,2021-07-01,2021-07-16,2021-08-30,62250.00,5.99,2.65,11,1,2,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
4,465309249432033993,2021-07-01,2021-07-05,2021-07-30,26593.95,6.99,0.00,46,1,0,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,705648002974742140,2021-11-01,2021-11-25,2021-12-13,47010.00,6.99,14.29,45,1,1,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
12271,4993499380140734678,2021-11-01,2021-11-25,2021-12-13,122875.35,8.99,2.74,17,1,2,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
12272,4614484019183480654,2021-11-01,2021-11-26,2028-09-30,302200.00,5.99,1.82,33,1,1,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
12273,1299146298565441811,2021-11-01,2021-11-26,2021-12-13,143791.85,5.99,6.06,12,1,1,...,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00


In [12]:
# Dropa variáveis com data
df_to_pred = df.drop(columns=['DATA_EMISSAO_DOCUMENTO', 'DATA_VENCIMENTO', 'SAFRA_REF'])

df_to_pred = df_to_pred[['RENDA_MES_ANTERIOR', 'NO_FUNCIONARIOS', 'VALOR_A_PAGAR', 'TAXA', 'DIAS_EMISSAO_VENCIMENTO', 'DDD', 'FLAG_PF', 'PORTE', 'CEP_2_DIG',
                         'DDD_1', 'TEMPO_VIDA_CLIENTE', 'PORCENTAGEM_INADIMPLENCIA_HIST', 'VALOR_PAGAR_POR_RENDA_ANTERIOR', 'VALOR_PAGAR_POR_NO_FUNCIONARIO',
                         'DIA_MES_EMISSAO', 'DIA_MES_VENCIMENTO', 'DIA_MES_CADASTRO', 'DIA_MES_SAFRA_REF', 'DIA_SEMANA_EMISSAO', 'DIA_SEMANA_VENCIMENTO', 
                         'DIA_SEMANA_CADASTRO', 'DIA_SEMANA_SAFRA_REF', 'MES_EMISSAO_DOCUMENTO', 'MES_VENCIMENTO', 'MES_CADASTRO', 'MES_SAFRA_REF', 
                         'ANO_EMISSAO_DOCUMENTO', 'ANO_VENCIMENTO', 'ANO_CADASTRO', 'ANO_SAFRA_REF', 'SEGMENTO_INDUSTRIAL_Comércio', 'SEGMENTO_INDUSTRIAL_Indústria',
                         'SEGMENTO_INDUSTRIAL_Serviços', 'SEGMENTO_INDUSTRIAL_nan', 'DOMINIO_EMAIL_AOL', 'DOMINIO_EMAIL_BOL', 'DOMINIO_EMAIL_GMAIL',
                         'DOMINIO_EMAIL_HOTMAIL', 'DOMINIO_EMAIL_OUTLOOK', 'DOMINIO_EMAIL_YAHOO', 'DOMINIO_EMAIL_nan']]

df_to_pred

Unnamed: 0,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,VALOR_A_PAGAR,TAXA,DIAS_EMISSAO_VENCIMENTO,DDD,FLAG_PF,PORTE,CEP_2_DIG,DDD_1,...,SEGMENTO_INDUSTRIAL_Indústria,SEGMENTO_INDUSTRIAL_Serviços,SEGMENTO_INDUSTRIAL_nan,DOMINIO_EMAIL_AOL,DOMINIO_EMAIL_BOL,DOMINIO_EMAIL_GMAIL,DOMINIO_EMAIL_HOTMAIL,DOMINIO_EMAIL_OUTLOOK,DOMINIO_EMAIL_YAHOO,DOMINIO_EMAIL_nan
0,467430.00,154.00,11204.75,4.99,21,36,1,1,87,5,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
1,417192.00,104.00,60718.50,5.99,46,11,1,2,3,1,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
2,417192.00,104.00,60718.50,5.99,45,11,1,2,3,1,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
3,417192.00,104.00,62250.00,5.99,45,11,1,2,3,1,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
4,873938.00,119.00,26593.95,6.99,25,46,1,0,64,6,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,,,47010.00,6.99,18,45,1,1,67,6,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
12271,311422.00,150.00,122875.35,8.99,18,17,1,2,27,3,...,0.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
12272,,,302200.00,5.99,2500,33,1,1,81,5,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
12273,220341.00,122.00,143791.85,5.99,17,12,1,1,11,2,...,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00


In [13]:
# Carrega modelo no arquivo pickle
with open('../model/model.pkl', 'rb') as arquivo:
    model = pickle.load(arquivo)

In [14]:
df['PREDICT_PROBA'] = model.predict_proba(df_to_pred)[:, 1]

df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,DATA_EMISSAO_DOCUMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,PORCENTAGEM_INADIMPLENCIA_HIST,DDD,FLAG_PF,PORTE,...,SEGMENTO_INDUSTRIAL_Serviços,SEGMENTO_INDUSTRIAL_nan,DOMINIO_EMAIL_AOL,DOMINIO_EMAIL_BOL,DOMINIO_EMAIL_GMAIL,DOMINIO_EMAIL_HOTMAIL,DOMINIO_EMAIL_OUTLOOK,DOMINIO_EMAIL_YAHOO,DOMINIO_EMAIL_nan,PREDICT_PROBA
0,5058298901476893676,2021-07-01,2021-07-14,2021-08-04,11204.75,4.99,0.00,36,1,1,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
1,274692171162531764,2021-07-01,2021-07-08,2021-08-23,60718.50,5.99,2.65,11,1,2,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
2,274692171162531764,2021-07-01,2021-07-11,2021-08-25,60718.50,5.99,2.65,11,1,2,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
3,274692171162531764,2021-07-01,2021-07-16,2021-08-30,62250.00,5.99,2.65,11,1,2,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
4,465309249432033993,2021-07-01,2021-07-05,2021-07-30,26593.95,6.99,0.00,46,1,0,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12270,705648002974742140,2021-11-01,2021-11-25,2021-12-13,47010.00,6.99,14.29,45,1,1,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.01
12271,4993499380140734678,2021-11-01,2021-11-25,2021-12-13,122875.35,8.99,2.74,17,1,2,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
12272,4614484019183480654,2021-11-01,2021-11-26,2028-09-30,302200.00,5.99,1.82,33,1,1,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.07
12273,1299146298565441811,2021-11-01,2021-11-26,2021-12-13,143791.85,5.99,6.06,12,1,1,...,1.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00


In [15]:
df['INADIMPLENTE'] = df['PREDICT_PROBA']
df = df[['ID_CLIENTE', 'SAFRA_REF', 'INADIMPLENTE']]

df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,INADIMPLENTE
0,5058298901476893676,2021-07-01,0.00
1,274692171162531764,2021-07-01,0.00
2,274692171162531764,2021-07-01,0.00
3,274692171162531764,2021-07-01,0.00
4,465309249432033993,2021-07-01,0.00
...,...,...,...
12270,705648002974742140,2021-11-01,0.01
12271,4993499380140734678,2021-11-01,0.00
12272,4614484019183480654,2021-11-01,0.07
12273,1299146298565441811,2021-11-01,0.00


In [16]:
lista_teste = [1 if valor > 0.5 else 0 for valor in df['INADIMPLENTE']]

df['TESTE'] = lista_teste
df['TESTE'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TESTE'] = lista_teste


array([0, 1], dtype=int64)

In [17]:
df['TESTE'].value_counts()

0    11886
1      389
Name: TESTE, dtype: int64

In [18]:
df = df.drop(columns='TESTE')
df

Unnamed: 0,ID_CLIENTE,SAFRA_REF,INADIMPLENTE
0,5058298901476893676,2021-07-01,0.00
1,274692171162531764,2021-07-01,0.00
2,274692171162531764,2021-07-01,0.00
3,274692171162531764,2021-07-01,0.00
4,465309249432033993,2021-07-01,0.00
...,...,...,...
12270,705648002974742140,2021-11-01,0.01
12271,4993499380140734678,2021-11-01,0.00
12272,4614484019183480654,2021-11-01,0.07
12273,1299146298565441811,2021-11-01,0.00


****
### Persistência dos dados - Resultado


In [19]:
df.to_csv('../data/4_prediction/df_results.csv', index=False)