# Análise exploratória dos dados
_EDA - Exploratory Data Analysis_

---

## Sumário

1. **Importação de bibliotecas**
2. **Carregamento da base**
3. **Divisão da base em treino, validação e teste**
    - 3.1. Salvando os dataframes em formato parquet
4. **Análise do dataframe _df_train_val_**


<br>

---

<br>

## 1. Importação de bibliotecas

In [1]:
# Importação de pacotes e definição de parâmetros globais

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import os

from sklearn.model_selection import train_test_split

In [2]:
# Display and plotting configuration for the Jupyter Notebook session

# Lift both pandas display limits: show every DataFrame column and the
# complete content of each column
for opcao_exibicao in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(opcao_exibicao, None)

# Suppress warning messages during execution
warnings.filterwarnings(action='ignore')

# Use the 'whitegrid' style for seaborn plots
sns.set_style(style='whitegrid')

## 2. Carregamento da base

In [3]:
# Run a garbage-collection pass before the data is loaded and print the count returned by gc.collect()
print(f'\nQuantidade de objetos removidos da memória: {gc.collect()}')


Quantidade de objetos removidos da memória: 0


In [4]:
# Location of the data to load (presumably a directory of parquet files — confirm)
caminho = 'dados/ABT'

# Build the base DataFrame from the parquet data found at that location
df = pd.read_parquet(path=caminho, engine='pyarrow')

In [5]:
# Preview the first 10 rows of the loaded base
# NOTE(review): the rendered output shows the card_number, cvv and address columns —
# confirm the data is synthetic/shareable before publishing the notebook with outputs.
df.head(10)

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,id_card,client_id_card,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web,id_client,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,code,description,transaction_id,is_fraud
0,7475806,2010-01-01 09:05:00,1840,4568,$2.02,Swipe Transaction,35451,Beaverton,OR,97005.0,5812,,4568,1840,Visa,Debit (Prepaid),4733359418335581,09/2021,67,YES,2,$4,09/2004,2008,No,1840,46,71,1974,2,Female,576 Martin Luther King Street,45.49,-122.8,$21702,$44249,$103229,706,5,5812,Eating Places and Restaurants,7475806.0,No
1,7477473,2010-01-01 15:08:00,538,4161,$7.48,Swipe Transaction,26810,Winterville,NC,28590.0,5541,,4161,538,Mastercard,Debit,5885105668024939,12/2014,750,YES,2,$6993,08/2005,2016,No,538,66,69,1954,2,Female,7888 Fourth Street,35.3,-77.15,$14844,$30265,$36789,814,4,5541,Service Stations,7477473.0,No
2,7477784,2010-01-01 16:18:00,724,2876,$1.70,Swipe Transaction,59935,Cushing,OK,74023.0,5499,,2876,724,Mastercard,Debit,5832356224925490,06/2024,245,YES,2,$16476,05/2005,2008,No,724,45,72,1974,5,Female,819 El Camino Boulevard,35.97,-96.76,$17237,$35142,$107898,731,4,5499,Miscellaneous Food Stores,,
3,7477811,2010-01-01 16:25:00,377,1175,$-53.00,Swipe Transaction,43293,Withee,WI,54498.0,5499,,1175,377,Mastercard,Debit,5009400051376027,11/2023,417,YES,1,$30403,02/2009,2014,No,377,80,67,1940,1,Female,305 Pine Avenue,47.39,-122.26,$24884,$39110,$363,750,5,5499,Miscellaneous Food Stores,,
4,7478410,2010-01-01 18:55:00,1362,2145,$-295.00,Swipe Transaction,96185,Bladensburg,MD,20710.0,7011,,2145,1362,Mastercard,Debit,5566695688917047,03/2017,309,NO,2,$29708,03/2007,2009,No,1362,58,67,1962,1,Male,3385 Hill Lane,38.78,-77.27,$35563,$72510,$44317,727,4,7011,"Lodging - Hotels, Motels, Resorts",,
5,7478830,2010-01-01 21:25:00,1466,5884,$17.59,Online Transaction,16798,ONLINE,,,4121,,5884,1466,Mastercard,Debit,5946854129119703,09/2020,405,YES,1,$1866,12/2007,2014,No,1466,36,75,1983,4,Female,3194 Norfolk Street,38.64,-75.61,$17624,$35933,$23451,812,2,4121,Taxicabs and Limousines,,
6,7479105,2010-01-01 23:02:00,1693,5940,$4.33,Online Transaction,85247,ONLINE,,,5815,,5940,1693,Mastercard,Debit,5128104617797218,03/2017,726,YES,1,$33506,12/2008,2011,No,1693,36,69,1983,4,Female,478 East Drive,33.61,-111.89,$36300,$74016,$85204,702,2,5815,"Digital Goods - Media, Books, Apps",7479105.0,No
7,7480284,2010-01-02 11:11:00,1674,2873,$27.78,Swipe Transaction,60569,Jonesboro,AR,72401.0,5300,,2873,1674,Amex,Credit,366520954874839,05/2022,447,YES,2,$8800,05/2005,2011,No,1674,70,64,1949,4,Male,5073 Wessex Avenue,35.49,-90.35,$14172,$26858,$11245,712,2,5300,Wholesale Clubs,7480284.0,No
8,7480339,2010-01-02 11:27:00,1070,4138,$35.20,Swipe Transaction,99256,Marion,IA,52302.0,5411,,4138,1070,Mastercard,Debit,5588241759620390,08/2022,902,YES,1,$28666,08/2004,2010,No,1070,61,65,1958,11,Male,841 Wessex Boulevard,42.03,-91.58,$25275,$51528,$58509,745,6,5411,"Grocery Stores, Supermarkets",,
9,7480412,2010-01-02 11:46:00,509,4588,$5.54,Swipe Transaction,60569,Charmco,WV,25958.0,5300,,4588,509,Visa,Debit,4262181069766792,07/2022,519,YES,1,$12721,09/2005,2015,No,509,33,66,1986,7,Male,239 Sussex Drive,38.41,-82.43,$21842,$44534,$107410,702,4,5300,Wholesale Clubs,7480412.0,No


## 3. Divisão da base em treino, validação e teste

In [6]:
# Partition the base according to the content of the target column 'is_fraud'

# Labelled rows ('Yes' / 'No'): used for training and validation
df_train_val = df.loc[df['is_fraud'].isin(['Yes', 'No'])].copy()

# Rows whose target is missing: kept apart as the test set
df_test = df.loc[df['is_fraud'].isna()].copy()

In [7]:
# Split the labelled data into train and validation sets, stratifying on
# 'is_fraud' so that both parts keep the class proportions of the target
df_train, df_val = train_test_split(
    df_train_val,
    stratify=df_train_val['is_fraud'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Show the number of rows and columns of each DataFrame

# Map each DataFrame's name to the DataFrame itself
dfs = {
    'df_train': df_train,
    'df_val': df_val,
    'df_test': df_test,
}

# Walk the dictionary and print each name with its dimensions.
# The loop variable is deliberately not called `df`: a notebook loop variable
# stays in the global namespace, so reusing `df` would leave it bound to the
# last frame (df_test) instead of the full base loaded earlier, and re-running
# the split cell would then operate on the wrong data.
print('\nVOLUMETRIA')
for nome, dataframe in dfs.items():
    qtd_linhas, qtd_colunas = dataframe.shape
    print(f'\n{nome}')
    print('-' * 45)
    print(f'Quantidade de linhas (registros):  {qtd_linhas:,}')
    print(f'Quantidade de colunas (variáveis): {qtd_colunas:,}')


VOLUMETRIA

df_train
---------------------------------------------
Quantidade de linhas (registros):  3,566,068
Quantidade de colunas (variáveis): 43

df_val
---------------------------------------------
Quantidade de linhas (registros):  891,518
Quantidade de colunas (variáveis): 43

df_test
---------------------------------------------
Quantidade de linhas (registros):  2,194,300
Quantidade de colunas (variáveis): 43


### 3.1. Salvando os dataframes em formato parquet

In [9]:
# Export every DataFrame in `dfs` to its own parquet file

# Output directory: defined and created once, since it is the same for every file
caminho = 'dados/dados_parquet/'
os.makedirs(caminho, exist_ok=True)

# The loop variable is not called `df`, so the notebook-global `df` (the full
# base) is not rebound to the last exported frame.
for nome, dataframe in dfs.items():
    try:
        # Write <caminho>/<nome>.parquet without the index
        dataframe.to_parquet(os.path.join(caminho, f'{nome}.parquet'), engine='pyarrow', index=False)

        # Report the dimensions of the frame that was written
        print(f'\nO {nome} possui {dataframe.shape[0]:,} linhas e {dataframe.shape[1]:,} colunas')
    except Exception as e:
        # Best effort: report the failure and continue with the next DataFrame
        print(f'\nErro ao exportar {nome}:\n {e}')


O df_train possui 3,566,068 linhas e 43 colunas

O df_val possui 891,518 linhas e 43 colunas

O df_test possui 2,194,300 linhas e 43 colunas


## 4. Análise do dataframe _df_train_val_

In [10]:
# Show the number of rows and columns of df_train_val

print(
    '\nVOLUMETRIA\n',
    f'Quantidade de linhas (registros):  {df_train_val.shape[0]:,}',
    f'Quantidade de colunas (variáveis): {df_train_val.shape[1]:,}',
    sep='\n',
)


VOLUMETRIA

Quantidade de linhas (registros):  4,457,586
Quantidade de colunas (variáveis): 43


In [11]:
# Preview the first 10 rows of the labelled (train + validation) DataFrame
df_train_val.head(10)

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,id_card,client_id_card,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web,id_client,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,code,description,transaction_id,is_fraud
0,7475806,2010-01-01 09:05:00,1840,4568,$2.02,Swipe Transaction,35451,Beaverton,OR,97005.0,5812,,4568,1840,Visa,Debit (Prepaid),4733359418335581,09/2021,67,YES,2,$4,09/2004,2008,No,1840,46,71,1974,2,Female,576 Martin Luther King Street,45.49,-122.8,$21702,$44249,$103229,706,5,5812,Eating Places and Restaurants,7475806.0,No
1,7477473,2010-01-01 15:08:00,538,4161,$7.48,Swipe Transaction,26810,Winterville,NC,28590.0,5541,,4161,538,Mastercard,Debit,5885105668024939,12/2014,750,YES,2,$6993,08/2005,2016,No,538,66,69,1954,2,Female,7888 Fourth Street,35.3,-77.15,$14844,$30265,$36789,814,4,5541,Service Stations,7477473.0,No
6,7479105,2010-01-01 23:02:00,1693,5940,$4.33,Online Transaction,85247,ONLINE,,,5815,,5940,1693,Mastercard,Debit,5128104617797218,03/2017,726,YES,1,$33506,12/2008,2011,No,1693,36,69,1983,4,Female,478 East Drive,33.61,-111.89,$36300,$74016,$85204,702,2,5815,"Digital Goods - Media, Books, Apps",7479105.0,No
7,7480284,2010-01-02 11:11:00,1674,2873,$27.78,Swipe Transaction,60569,Jonesboro,AR,72401.0,5300,,2873,1674,Amex,Credit,366520954874839,05/2022,447,YES,2,$8800,05/2005,2011,No,1674,70,64,1949,4,Male,5073 Wessex Avenue,35.49,-90.35,$14172,$26858,$11245,712,2,5300,Wholesale Clubs,7480284.0,No
9,7480412,2010-01-02 11:46:00,509,4588,$5.54,Swipe Transaction,60569,Charmco,WV,25958.0,5300,,4588,509,Visa,Debit,4262181069766792,07/2022,519,YES,1,$12721,09/2005,2015,No,509,33,66,1986,7,Male,239 Sussex Drive,38.41,-82.43,$21842,$44534,$107410,702,4,5300,Wholesale Clubs,7480412.0,No
11,7482356,2010-01-02 20:04:00,1936,5914,$7.45,Swipe Transaction,21739,Richmond,VT,5477.0,5300,,5914,1936,Visa,Debit,4653215018449189,09/2014,624,YES,1,$27006,12/2007,2007,No,1936,86,68,1933,7,Female,406 El Camino Boulevard,44.4,-73.0,$26951,$35685,$1135,714,5,5300,Wholesale Clubs,7482356.0,No
14,7482814,2010-01-02 23:36:00,1896,4974,$16.08,Swipe Transaction,60569,Plymouth,MI,48170.0,5300,,4974,1896,Visa,Credit,4468355695964457,12/2023,955,YES,1,$14000,10/2002,2008,No,1896,50,79,1969,9,Female,6695 River Lane,41.91,-83.38,$19736,$40246,$74352,641,5,5300,Wholesale Clubs,7482814.0,No
16,7484660,2010-01-03 13:14:00,1857,5089,$39.77,Swipe Transaction,91128,Morris Plains,NJ,7950.0,5411,,5089,1857,Mastercard,Credit,5571571366314376,07/2024,126,YES,1,$27700,10/2007,2013,No,1857,32,66,1987,8,Male,4063 Burns Boulevard,40.77,-74.39,$47698,$97248,$197100,775,5,5411,"Grocery Stores, Supermarkets",7484660.0,No
18,7486009,2010-01-03 18:19:00,1079,5826,$188.80,Swipe Transaction,5373,Rockville Centre,NY,11570.0,4900,,5826,1079,Amex,Credit,362822137135948,10/2022,44,YES,1,$13400,12/2005,2010,No,1079,65,60,1954,11,Female,422 Madison Lane,40.66,-73.63,$48994,$103294,$39076,831,3,4900,"Utilities - Electric, Gas, Water, Sanitary",7486009.0,No
20,7487202,2010-01-04 07:35:00,1786,5463,$10.28,Swipe Transaction,60354,Louisville,OH,44641.0,5411,,5463,1786,Mastercard,Credit,5639561447744152,11/2020,247,YES,1,$18400,11/2006,2014,No,1786,48,63,1971,3,Female,7554 Sixth Street,40.83,-81.26,$18936,$38611,$93255,755,5,5411,"Grocery Stores, Supermarkets",7487202.0,No


In [12]:
# Função para geração de um dataframe de metadados

def gerar_metadados(dataframe: pd.DataFrame) -> pd.DataFrame:
    '''
    Build a summary table with one row of metadata per column of `dataframe`.

    :param dataframe: DataFrame
        DataFrame whose columns are described.
    :return: DataFrame
        One row per column with: 'Variável' (column name), 'Tipo' (dtype),
        'Qtde de nulos' (number of nulls), '% de nulos' (share of nulls as a
        percentage, rounded to two decimals) and 'Cardinalidade' (result of
        `nunique()`). Rows are ordered by 'Qtde de nulos', largest first;
        columns with the same null count keep the order they have in
        `dataframe`.
    '''

    # Compute the null mask once and reuse it for both the count and the
    # percentage, instead of scanning the whole frame twice
    nulos = dataframe.isnull()

    metadados = pd.DataFrame({
        'Variável': dataframe.columns,
        'Tipo': dataframe.dtypes,
        'Qtde de nulos': nulos.sum(),
        '% de nulos': nulos.mean() * 100,
        'Cardinalidade': dataframe.nunique()
    })

    # Sort by null count, descending. A stable algorithm is requested so that
    # tied columns come out in a reproducible order (the default quicksort
    # gives no such guarantee)
    metadados = metadados \
        .sort_values(by='Qtde de nulos', ascending=False, kind='mergesort') \
        .reset_index(drop=True)

    # Round the percentage of nulls to two decimal places
    metadados['% de nulos'] = metadados['% de nulos'].round(2)

    return metadados

In [13]:
# Display the metadata of df_train_val

gerar_metadados(df_train_val)

Unnamed: 0,Variável,Tipo,Qtde de nulos,% de nulos,Cardinalidade
0,errors,object,4386447,98.4,22
1,zip,float64,554327,12.44,23127
2,merchant_state,object,524586,11.77,191
3,client_id,int32,0,0.0,1219
4,date,datetime64[ns],0,0.0,2685814
5,card_id,int32,0,0.0,4070
6,amount,object,0,0.0,55649
7,merchant_id,int32,0,0.0,53024
8,use_chip,object,0,0.0,3
9,merchant_city,object,0,0.0,11528


In [18]:
# Função para limpar e converter colunas monetárias

def limpar_e_converter_monetarios(dataframe: pd.DataFrame, colunas) -> pd.DataFrame:
    '''
    Strip currency formatting from the given columns and convert them to float.

    For every column in `colunas`, the characters '$' and ',' are removed from
    string values and the result is converted to float; values that cannot be
    parsed become NaN.

    The columns are overwritten on the DataFrame that was passed in, so the
    caller's object is modified, and that same object is returned.

    :param dataframe: DataFrame
        DataFrame holding the monetary columns.
    :param colunas: List
        Names of the columns to clean and convert. A single column name given
        as a string is also accepted.
    :return: DataFrame
        The same `dataframe` object, with the listed columns converted.
    '''

    # A bare string would otherwise be iterated character by character
    if isinstance(colunas, str):
        colunas = [colunas]

    for coluna in colunas:
        # Remove '$' and ','. The '$' pattern is a raw string because '\$' in a
        # regular literal is an invalid escape sequence, which recent Python
        # versions warn about.
        sem_formatacao = dataframe[coluna].replace({r'\$': '', ',': ''}, regex=True)
        # errors='coerce' turns unparsable values into NaN instead of raising
        dataframe[coluna] = pd.to_numeric(sem_formatacao, errors='coerce').astype(float)

    return dataframe

In [19]:
# Monetary columns (held as strings such as '$2.02' before the conversion)
colunas_monetarias = ['amount', 'credit_limit', 'per_capita_income', 'yearly_income', 'total_debt']

# Clean and convert the monetary columns of df_train_val
# NOTE(review): df_train, df_val and df_test were split off and exported to parquet before
# this step, and only df_train_val is converted here — confirm whether those frames/files
# should receive the same conversion.
df_train_val = limpar_e_converter_monetarios(df_train_val, colunas_monetarias)

In [20]:
# Display the metadata of df_train_val again, after the monetary conversion

gerar_metadados(df_train_val)

Unnamed: 0,Variável,Tipo,Qtde de nulos,% de nulos,Cardinalidade
0,errors,object,4386447,98.4,22
1,zip,float64,554327,12.44,23127
2,merchant_state,object,524586,11.77,191
3,client_id,int32,0,0.0,1219
4,date,datetime64[ns],0,0.0,2685814
5,card_id,int32,0,0.0,4070
6,amount,float64,0,0.0,55648
7,merchant_id,int32,0,0.0,53024
8,use_chip,object,0,0.0,3
9,merchant_city,object,0,0.0,11528


In [21]:
# Preview the first 10 rows of df_train_val after the monetary conversion
df_train_val.head(10)

Unnamed: 0,id,date,client_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,id_card,client_id_card,card_brand,card_type,card_number,expires,cvv,has_chip,num_cards_issued,credit_limit,acct_open_date,year_pin_last_changed,card_on_dark_web,id_client,current_age,retirement_age,birth_year,birth_month,gender,address,latitude,longitude,per_capita_income,yearly_income,total_debt,credit_score,num_credit_cards,code,description,transaction_id,is_fraud
0,7475806,2010-01-01 09:05:00,1840,4568,2.02,Swipe Transaction,35451,Beaverton,OR,97005.0,5812,,4568,1840,Visa,Debit (Prepaid),4733359418335581,09/2021,67,YES,2,4.0,09/2004,2008,No,1840,46,71,1974,2,Female,576 Martin Luther King Street,45.49,-122.8,21702.0,44249.0,103229.0,706,5,5812,Eating Places and Restaurants,7475806.0,No
1,7477473,2010-01-01 15:08:00,538,4161,7.48,Swipe Transaction,26810,Winterville,NC,28590.0,5541,,4161,538,Mastercard,Debit,5885105668024939,12/2014,750,YES,2,6993.0,08/2005,2016,No,538,66,69,1954,2,Female,7888 Fourth Street,35.3,-77.15,14844.0,30265.0,36789.0,814,4,5541,Service Stations,7477473.0,No
6,7479105,2010-01-01 23:02:00,1693,5940,4.33,Online Transaction,85247,ONLINE,,,5815,,5940,1693,Mastercard,Debit,5128104617797218,03/2017,726,YES,1,33506.0,12/2008,2011,No,1693,36,69,1983,4,Female,478 East Drive,33.61,-111.89,36300.0,74016.0,85204.0,702,2,5815,"Digital Goods - Media, Books, Apps",7479105.0,No
7,7480284,2010-01-02 11:11:00,1674,2873,27.78,Swipe Transaction,60569,Jonesboro,AR,72401.0,5300,,2873,1674,Amex,Credit,366520954874839,05/2022,447,YES,2,8800.0,05/2005,2011,No,1674,70,64,1949,4,Male,5073 Wessex Avenue,35.49,-90.35,14172.0,26858.0,11245.0,712,2,5300,Wholesale Clubs,7480284.0,No
9,7480412,2010-01-02 11:46:00,509,4588,5.54,Swipe Transaction,60569,Charmco,WV,25958.0,5300,,4588,509,Visa,Debit,4262181069766792,07/2022,519,YES,1,12721.0,09/2005,2015,No,509,33,66,1986,7,Male,239 Sussex Drive,38.41,-82.43,21842.0,44534.0,107410.0,702,4,5300,Wholesale Clubs,7480412.0,No
11,7482356,2010-01-02 20:04:00,1936,5914,7.45,Swipe Transaction,21739,Richmond,VT,5477.0,5300,,5914,1936,Visa,Debit,4653215018449189,09/2014,624,YES,1,27006.0,12/2007,2007,No,1936,86,68,1933,7,Female,406 El Camino Boulevard,44.4,-73.0,26951.0,35685.0,1135.0,714,5,5300,Wholesale Clubs,7482356.0,No
14,7482814,2010-01-02 23:36:00,1896,4974,16.08,Swipe Transaction,60569,Plymouth,MI,48170.0,5300,,4974,1896,Visa,Credit,4468355695964457,12/2023,955,YES,1,14000.0,10/2002,2008,No,1896,50,79,1969,9,Female,6695 River Lane,41.91,-83.38,19736.0,40246.0,74352.0,641,5,5300,Wholesale Clubs,7482814.0,No
16,7484660,2010-01-03 13:14:00,1857,5089,39.77,Swipe Transaction,91128,Morris Plains,NJ,7950.0,5411,,5089,1857,Mastercard,Credit,5571571366314376,07/2024,126,YES,1,27700.0,10/2007,2013,No,1857,32,66,1987,8,Male,4063 Burns Boulevard,40.77,-74.39,47698.0,97248.0,197100.0,775,5,5411,"Grocery Stores, Supermarkets",7484660.0,No
18,7486009,2010-01-03 18:19:00,1079,5826,188.8,Swipe Transaction,5373,Rockville Centre,NY,11570.0,4900,,5826,1079,Amex,Credit,362822137135948,10/2022,44,YES,1,13400.0,12/2005,2010,No,1079,65,60,1954,11,Female,422 Madison Lane,40.66,-73.63,48994.0,103294.0,39076.0,831,3,4900,"Utilities - Electric, Gas, Water, Sanitary",7486009.0,No
20,7487202,2010-01-04 07:35:00,1786,5463,10.28,Swipe Transaction,60354,Louisville,OH,44641.0,5411,,5463,1786,Mastercard,Credit,5639561447744152,11/2020,247,YES,1,18400.0,11/2006,2014,No,1786,48,63,1971,3,Female,7554 Sixth Street,40.83,-81.26,18936.0,38611.0,93255.0,755,5,5411,"Grocery Stores, Supermarkets",7487202.0,No
