# Processamento de Dados

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
load_dotenv()

print(os.environ['NOTEBOOKS_RAW_DATA_PATH'])
print(os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'])

../data/raw/
../data/processed/


## Carregando os dados

In [4]:
df = pd.read_csv(
    os.environ['NOTEBOOKS_RAW_DATA_PATH'] + \
        'creditcard.csv'
)

print(df.shape)
print(df.size)

(284807, 31)
8829017


In [5]:
df = df.rename(columns={
    'Time': 'Tempo',
    'Amount': 'Valor',
    'Class': 'Fraude'
})

In [6]:
df.dtypes

Tempo     float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Valor     float64
Fraude      int64
dtype: object

## Separação em Treino e Teste

In [7]:
df = df.sort_values(by='Tempo', ascending=True)

In [8]:
df_train, df_test = train_test_split(
    df,
    test_size=.2,
    shuffle=False
)

df_train.shape, df_test.shape

((227845, 31), (56962, 31))

In [9]:
df_train.head()

Unnamed: 0,Tempo,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Valor,Fraude
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [10]:
df_test.head()

Unnamed: 0,Tempo,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Valor,Fraude
227845,145248.0,1.914027,-0.490068,-0.326111,0.604711,-0.850136,-0.736319,-0.524058,-0.088614,1.091125,...,0.210958,0.639338,0.147523,0.073654,-0.318378,0.350612,-0.023843,-0.037139,50.0,0
227846,145249.0,2.152696,-0.036161,-2.231811,0.091766,0.537612,-1.368103,0.613327,-0.455252,0.291814,...,0.017153,0.063242,-0.034561,-0.626866,0.249213,0.773931,-0.137115,-0.090611,14.95,0
227847,145249.0,-4.034795,2.305079,-1.461693,-0.729887,-1.52875,-1.225679,-0.893354,1.622522,1.291998,...,-0.392557,-0.787599,0.343468,-0.090331,0.248287,-0.238524,0.266484,-0.062236,7.7,0
227848,145249.0,-1.668741,1.168055,0.249642,-1.268497,0.785923,-0.663959,0.859433,0.068111,-0.144183,...,-0.247544,-0.592537,-0.286694,-0.378856,-0.077429,0.067608,-0.278962,-0.064193,6.99,0
227849,145250.0,-0.550678,-0.429004,-1.291893,-0.414409,-0.292229,0.071843,2.426068,-0.21273,0.412374,...,0.003032,-0.645783,0.877016,-1.228529,-0.036281,-0.11061,-0.09838,0.095985,460.71,0


## Salvando em `.parquet`

In [11]:
# conjunto completo
df.to_parquet(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'] + \
        'creditcard.parquet'
)

# conjunto de treino
df_train.to_parquet(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'] + \
        'creditcard_train_major.parquet'
)

# conjunto de teste
df_test.to_parquet(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'] + \
        'creditcard_test.parquet'
)

## Separando o Treino novamente em Treino e Validação

In [12]:
X = df_train.drop('Fraude', axis=1)
y = df_train['Fraude']

In [13]:
x_train, x_val, y_train, y_val = train_test_split(
    X, y,
    test_size=.2,
    shuffle=True
)

x_train.shape, x_val.shape

((182276, 30), (45569, 30))

In [14]:
print(x_train.shape, y_train.shape)

df_train = pd.concat([x_train, y_train], axis=1)
print(df_train.shape)

(182276, 30) (182276,)
(182276, 31)


In [15]:
print(x_val.shape, y_val.shape)

df_val = pd.concat([x_val, y_val], axis=1)
print(df_val.shape)

(45569, 30) (45569,)
(45569, 31)


## Validando a estratificação das classes

In [16]:
counts = df_train['Fraude'].value_counts()

display(counts)
counts.values[-1] / sum(counts)

0    181958
1       318
Name: Fraude, dtype: int64

0.001744607079374136

In [17]:
counts = df_val['Fraude'].value_counts()

display(counts)
counts.values[-1] / sum(counts)

0    45470
1       99
Name: Fraude, dtype: int64

0.0021725295705413766

## Salvando em `.parquet`

In [18]:
df_train.to_parquet(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'] + \
        'creditcard_train_minor.parquet'
)

df_val.to_parquet(
    os.environ['NOTEBOOKS_PROCESSED_DATA_PATH'] + \
        'creditcard_val.parquet'
)