# 4. Pré-Processamento


Este arquivo contém o código responsável pelo pré-processamento dos dados antes de alimentar o modelo de machine learning.

Ele inclui etapas como a seleção de variáveis, a codificação de variáveis categóricas e o tratamento de valores ausentes.



In [1]:
# Bibliotecas
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Render floats with two fixed decimals instead of scientific notation,
# which makes the displayed tables easier to read
def _two_decimals(x):
    return f'{x:.2f}'


pd.set_option('display.float_format', _two_decimals)

# Load the intermediate dataset
eda_csv_path = '../data/2_intermediate/3_df_eda.csv'
df = pd.read_csv(eda_csv_path)


In [3]:
# These columns are stored as numbers but denote categories:
# fill missing values with 0, cast to int and then to string
categorical_code_columns = ('DDD', 'CEP_2_DIG', 'DDD_1')
for code_column in categorical_code_columns:
    filled = df[code_column].fillna(0)
    df[code_column] = filled.astype(int).astype(str)

# Drop the client identifier
id_columns = ['ID_CLIENTE']
df = df.drop(columns=id_columns)

# Drop the variables that are not available in the test data
unavailable_in_test = [
    'DIAS_ATRASO',
    'DIAS_EMISSAO_PAGAMENTO',
    'ANO_PAGAMENTO',
    'DIA_MES_PAGAMENTO',
    'DIA_SEMANA_PAGAMENTO',
    'MES_PAGAMENTO',
]
df = df.drop(columns=unavailable_in_test)


df

Unnamed: 0,SAFRA_REF,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,DATA_EMISSAO_DOCUMENTO,DATA_PAGAMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,INADIMPLENTE,DIAS_EMISSAO_VENCIMENTO,...,DIA_SEMANA_CADASTRO,DIA_SEMANA_SAFRA_REF,MES_EMISSAO_DOCUMENTO,MES_VENCIMENTO,MES_CADASTRO,MES_SAFRA_REF,ANO_EMISSAO_DOCUMENTO,ANO_VENCIMENTO,ANO_CADASTRO,ANO_SAFRA_REF
0,2018-09-01,16913.00,92.00,2018-09-16,2018-10-09,2018-10-08,22427.25,5.99,False,22,...,Thursday,Saturday,9,10,8,9,2018,2018,2013,2018
1,2018-09-01,16913.00,92.00,2018-09-23,2018-10-15,2018-10-15,35608.11,5.99,False,22,...,Thursday,Saturday,9,10,8,9,2018,2018,2013,2018
2,2018-10-01,236447.00,93.00,2018-10-08,2018-10-30,2018-10-29,17988.49,5.99,False,21,...,Thursday,Monday,10,10,8,10,2018,2018,2013,2018
3,2018-10-01,236447.00,93.00,2018-10-17,2018-11-07,2018-11-06,41998.20,6.99,False,20,...,Thursday,Monday,10,11,8,10,2018,2018,2013,2018
4,2018-10-01,236447.00,93.00,2018-10-21,2018-11-12,2018-11-12,35514.41,6.99,False,22,...,Thursday,Monday,10,11,8,10,2018,2018,2013,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73432,2021-06-01,343507.00,134.00,2021-06-13,2021-07-26,2021-07-28,17302.05,4.99,False,45,...,Sunday,Tuesday,6,7,12,6,2021,2021,2015,2021
73433,2021-06-01,392975.00,121.00,2021-06-14,2021-06-28,2021-06-29,19799.30,6.99,False,15,...,Tuesday,Tuesday,6,6,7,6,2021,2021,2019,2021
73434,2021-06-01,392975.00,121.00,2021-06-17,2021-07-01,2021-07-02,19913.15,11.99,False,15,...,Tuesday,Tuesday,6,7,7,6,2021,2021,2019,2021
73435,2021-06-01,70449.00,141.00,2021-06-14,2021-07-13,2021-07-09,2806.09,6.99,False,25,...,Monday,Tuesday,6,7,12,6,2021,2021,2019,2021


In [4]:
# Show the columns left after dropping the identifier and the variables
# that are unavailable in the test data
df.columns

Index(['SAFRA_REF', 'RENDA_MES_ANTERIOR', 'NO_FUNCIONARIOS',
       'DATA_EMISSAO_DOCUMENTO', 'DATA_PAGAMENTO', 'DATA_VENCIMENTO',
       'VALOR_A_PAGAR', 'TAXA', 'INADIMPLENTE', 'DIAS_EMISSAO_VENCIMENTO',
       'DDD', 'FLAG_PF', 'SEGMENTO_INDUSTRIAL', 'DOMINIO_EMAIL', 'PORTE',
       'CEP_2_DIG', 'DDD_1', 'TEMPO_VIDA_CLIENTE', 'DATA_CADASTRO',
       'PORCENTAGEM_INADIMPLENCIA_HIST', 'VALOR_PAGAR_POR_RENDA_ANTERIOR',
       'VALOR_PAGAR_POR_NO_FUNCIONARIO', 'DIA_MES_EMISSAO',
       'DIA_MES_VENCIMENTO', 'DIA_MES_CADASTRO', 'DIA_MES_SAFRA_REF',
       'DIA_SEMANA_EMISSAO', 'DIA_SEMANA_VENCIMENTO', 'DIA_SEMANA_CADASTRO',
       'DIA_SEMANA_SAFRA_REF', 'MES_EMISSAO_DOCUMENTO', 'MES_VENCIMENTO',
       'MES_CADASTRO', 'MES_SAFRA_REF', 'ANO_EMISSAO_DOCUMENTO',
       'ANO_VENCIMENTO', 'ANO_CADASTRO', 'ANO_SAFRA_REF'],
      dtype='object')

****
#### Conversão das variáveis categóricas



In [5]:
# Columns converted to integer labels (one integer code per distinct value)
variaveis_label = ['INADIMPLENTE', 'DDD', 'FLAG_PF', 'PORTE', 'CEP_2_DIG', 'DDD_1', 'DIA_MES_EMISSAO', 'DIA_MES_VENCIMENTO', 'DIA_MES_CADASTRO', 'DIA_MES_SAFRA_REF',
                   'DIA_SEMANA_EMISSAO', 'DIA_SEMANA_VENCIMENTO', 'DIA_SEMANA_CADASTRO', 'DIA_SEMANA_SAFRA_REF', 'MES_EMISSAO_DOCUMENTO', 'MES_VENCIMENTO',
                   'MES_CADASTRO', 'MES_SAFRA_REF', 'ANO_EMISSAO_DOCUMENTO', 'ANO_VENCIMENTO', 'ANO_CADASTRO', 'ANO_SAFRA_REF']
# Columns expanded into one indicator column per category
variaveis_one_hot = ['SEGMENTO_INDUSTRIAL', 'DOMINIO_EMAIL']

# Keep every fitted encoder, keyed by column name, so the exact same mapping
# can be re-applied to other data (e.g. the test set) instead of being lost
label_encoders = {}
one_hot_encoders = {}

for var in variaveis_label:
    label_encoders[var] = LabelEncoder()
    df[var] = label_encoders[var].fit_transform(df[var])

for var in variaveis_one_hot:
    # OneHotEncoder expects 2-D input: reshape the column to a single-column matrix
    var_reshaped = df[var].values.reshape(-1, 1)

    # Fit the encoder on the reshaped column and keep it for later reuse
    encoder = OneHotEncoder()
    encoded_data = encoder.fit_transform(var_reshaped)
    one_hot_encoders[var] = encoder

    # Names of the generated columns, prefixed with the original column name
    column_names = encoder.get_feature_names_out([var])

    # Build the encoded frame on df's own index: pd.concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and introduce
    # NaNs) whenever df does not carry the default 0..n-1 index
    encoded_df = pd.DataFrame(encoded_data.toarray(), columns=column_names, index=df.index)

    # Replace the original column with its encoded columns (appended at the end)
    df = pd.concat([df.drop(columns=[var]), encoded_df], axis=1)

df

Unnamed: 0,SAFRA_REF,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,DATA_EMISSAO_DOCUMENTO,DATA_PAGAMENTO,DATA_VENCIMENTO,VALOR_A_PAGAR,TAXA,INADIMPLENTE,DIAS_EMISSAO_VENCIMENTO,...,SEGMENTO_INDUSTRIAL_Indústria,SEGMENTO_INDUSTRIAL_Serviços,SEGMENTO_INDUSTRIAL_nan,DOMINIO_EMAIL_AOL,DOMINIO_EMAIL_BOL,DOMINIO_EMAIL_GMAIL,DOMINIO_EMAIL_HOTMAIL,DOMINIO_EMAIL_OUTLOOK,DOMINIO_EMAIL_YAHOO,DOMINIO_EMAIL_nan
0,2018-09-01,16913.00,92.00,2018-09-16,2018-10-09,2018-10-08,22427.25,5.99,0,22,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
1,2018-09-01,16913.00,92.00,2018-09-23,2018-10-15,2018-10-15,35608.11,5.99,0,22,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
2,2018-10-01,236447.00,93.00,2018-10-08,2018-10-30,2018-10-29,17988.49,5.99,0,21,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
3,2018-10-01,236447.00,93.00,2018-10-17,2018-11-07,2018-11-06,41998.20,6.99,0,20,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
4,2018-10-01,236447.00,93.00,2018-10-21,2018-11-12,2018-11-12,35514.41,6.99,0,22,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73432,2021-06-01,343507.00,134.00,2021-06-13,2021-07-26,2021-07-28,17302.05,4.99,0,45,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73433,2021-06-01,392975.00,121.00,2021-06-14,2021-06-28,2021-06-29,19799.30,6.99,0,15,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73434,2021-06-01,392975.00,121.00,2021-06-17,2021-07-01,2021-07-02,19913.15,11.99,0,15,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73435,2021-06-01,70449.00,141.00,2021-06-14,2021-07-13,2021-07-09,2806.09,6.99,0,25,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00


****
### 2.1. Data Leakage


In [6]:
# Verificação da data de termino do dataset de desenvolvimento
print(f"Última data no dataset de desenvolvimento: {df.sort_values('DATA_EMISSAO_DOCUMENTO').reset_index(drop=True)['DATA_EMISSAO_DOCUMENTO'].tolist()[-1]}")

# Verificação da data de inicio do dataset de teste
df_teste = pd.read_csv('../data/1_raw/base_pagamentos_teste.csv')
print(f"Primeira data no dataset de teste: {df_teste.sort_values('DATA_EMISSAO_DOCUMENTO').reset_index(drop=True)['DATA_EMISSAO_DOCUMENTO'][0]}")

Última data no dataset de desenvolvimento: 2021-06-30
Primeira data no dataset de teste: 2021-07-01


Como o dataset de teste contém dados a partir de julho/2021 e o dataset de treino inclui dados até junho/2021, não há sobreposição temporal entre os conjuntos de treino e teste. Isso significa que o modelo não terá acesso a informações futuras durante o treinamento, ou seja, não há data leakage, o que poderia levar a resultados superestimados e não realistas.

****
#### Exclusão de colunas com data

In [7]:
# Dropa variáveis com data
df = df.drop(columns=['DATA_CADASTRO', 'DATA_EMISSAO_DOCUMENTO', 'DATA_PAGAMENTO', 'DATA_VENCIMENTO', 'SAFRA_REF'])
df

Unnamed: 0,RENDA_MES_ANTERIOR,NO_FUNCIONARIOS,VALOR_A_PAGAR,TAXA,INADIMPLENTE,DIAS_EMISSAO_VENCIMENTO,DDD,FLAG_PF,PORTE,CEP_2_DIG,...,SEGMENTO_INDUSTRIAL_Indústria,SEGMENTO_INDUSTRIAL_Serviços,SEGMENTO_INDUSTRIAL_nan,DOMINIO_EMAIL_AOL,DOMINIO_EMAIL_BOL,DOMINIO_EMAIL_GMAIL,DOMINIO_EMAIL_HOTMAIL,DOMINIO_EMAIL_OUTLOOK,DOMINIO_EMAIL_YAHOO,DOMINIO_EMAIL_nan
0,16913.00,92.00,22427.25,5.99,0,22,68,1,2,55,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
1,16913.00,92.00,35608.11,5.99,0,22,68,1,2,55,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
2,236447.00,93.00,17988.49,5.99,0,21,68,1,2,55,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
3,236447.00,93.00,41998.20,6.99,0,20,68,1,2,55,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
4,236447.00,93.00,35514.41,6.99,0,22,68,1,2,55,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73432,343507.00,134.00,17302.05,4.99,0,45,0,1,1,26,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73433,392975.00,121.00,19799.30,6.99,0,15,0,1,2,3,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73434,392975.00,121.00,19913.15,11.99,0,15,0,1,2,3,...,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
73435,70449.00,141.00,2806.09,6.99,0,25,9,1,1,10,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00


In [8]:
# Show the column set after the encoding step and the removal of the date columns
df.columns

Index(['RENDA_MES_ANTERIOR', 'NO_FUNCIONARIOS', 'VALOR_A_PAGAR', 'TAXA',
       'INADIMPLENTE', 'DIAS_EMISSAO_VENCIMENTO', 'DDD', 'FLAG_PF', 'PORTE',
       'CEP_2_DIG', 'DDD_1', 'TEMPO_VIDA_CLIENTE',
       'PORCENTAGEM_INADIMPLENCIA_HIST', 'VALOR_PAGAR_POR_RENDA_ANTERIOR',
       'VALOR_PAGAR_POR_NO_FUNCIONARIO', 'DIA_MES_EMISSAO',
       'DIA_MES_VENCIMENTO', 'DIA_MES_CADASTRO', 'DIA_MES_SAFRA_REF',
       'DIA_SEMANA_EMISSAO', 'DIA_SEMANA_VENCIMENTO', 'DIA_SEMANA_CADASTRO',
       'DIA_SEMANA_SAFRA_REF', 'MES_EMISSAO_DOCUMENTO', 'MES_VENCIMENTO',
       'MES_CADASTRO', 'MES_SAFRA_REF', 'ANO_EMISSAO_DOCUMENTO',
       'ANO_VENCIMENTO', 'ANO_CADASTRO', 'ANO_SAFRA_REF',
       'SEGMENTO_INDUSTRIAL_Comércio', 'SEGMENTO_INDUSTRIAL_Indústria',
       'SEGMENTO_INDUSTRIAL_Serviços', 'SEGMENTO_INDUSTRIAL_nan',
       'DOMINIO_EMAIL_AOL', 'DOMINIO_EMAIL_BOL', 'DOMINIO_EMAIL_GMAIL',
       'DOMINIO_EMAIL_HOTMAIL', 'DOMINIO_EMAIL_OUTLOOK', 'DOMINIO_EMAIL_YAHOO',
       'DOMINIO_EMAIL_nan'],
      dtype='object')

****
#### Persistência dos dados


In [9]:
# Persist the preprocessed dataset without writing the index column
output_csv_path = '../data/3_primary/df_to_model.csv'
df.to_csv(output_csv_path, index=False)