In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import pickle

# Display preferences for rendered DataFrames: never truncate columns,
# and cap the number of rows shown at 30.
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

import sys
import os

# Project directory, taken to be the parent of the current working directory.
# It is normalised to an absolute path so that paths built from it do not
# carry a literal '..' segment.
PROJECT_DIRECTORY = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the project importable (needed for `src.utils`). The membership guard
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
if PROJECT_DIRECTORY not in sys.path:
    sys.path.append(PROJECT_DIRECTORY)

# Import by name rather than with a wildcard, so it is clear where each helper
# comes from. `gerar_metadados` is the only src.utils name the visible cells use;
# NOTE(review): add further names here if cells outside this view need them.
from src.utils import gerar_metadados

In [2]:
# Load the raw spreadsheet and shape it in a single chain.
raw_data_path = PROJECT_DIRECTORY + '/data/raw/default of credit card clients.xls'

df = (
    pd.read_excel(raw_data_path)
    # Keep rows from index label 1 onwards, i.e. drop the row labelled 0
    # (presumably a second header row in the sheet - TODO confirm against the file).
    .loc[1:]
    # The first column is read without a header name; expose it as 'ID'.
    .rename(columns={'Unnamed: 0': 'ID'})
    # Cast every column, including ID and Y, to float64.
    .astype(np.float64)
)

In [3]:
# Class balance check: relative frequency of each value of the target column Y.
df['Y'].value_counts(normalize=True)

Y
0.0    0.7788
1.0    0.2212
Name: proportion, dtype: float64

In [4]:
# Build the per-column metadata table with the project helper gerar_metadados
# (imported from src.utils). 'ID' and 'Y' are presumably the identifier and
# target column names, and orderby the column to sort the table by - the
# helper's signature is not visible here, so confirm in src.utils.
metadados = gerar_metadados(df, 'ID', 'Y', orderby = 'PC_NULOS')
# Display the table; later cells read its FEATURE, USO_FEATURE and CARDINALIDADE columns.
metadados

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,ID,ID,0,0.0,30000,float64
1,X1,Explicativa,0,0.0,81,float64
2,X2,Explicativa,0,0.0,2,float64
3,X3,Explicativa,0,0.0,7,float64
4,X4,Explicativa,0,0.0,4,float64
5,X5,Explicativa,0,0.0,56,float64
6,X6,Explicativa,0,0.0,11,float64
7,X7,Explicativa,0,0.0,11,float64
8,X8,Explicativa,0,0.0,11,float64
9,X9,Explicativa,0,0.0,11,float64


In [5]:
# Column roles that are fixed by hand.
TARGET = ['Y']
AUXILIARES = ['ID']

# Cardinality threshold: explanatory features with at most this many distinct
# values are treated as categorical, the rest as numerical.
THRESHOLD_CARDINALIDADE = 25

# Restrict to explanatory features once, then split them by cardinality.
explanatory = metadados[metadados['USO_FEATURE'] == 'Explicativa']
cardinality = explanatory['CARDINALIDADE']

NUM_VARS = explanatory.loc[cardinality > THRESHOLD_CARDINALIDADE, 'FEATURE'].to_list()
CAT_VARS = explanatory.loc[cardinality <= THRESHOLD_CARDINALIDADE, 'FEATURE'].to_list()

print(f'Quantidade de Variáveis Númericas: {len(NUM_VARS)}')
print(f'Quantidade de variáveis categóricas: {len(CAT_VARS)}')

Quantidade de Variáveis Númericas: 14
Quantidade de variáveis categóricas: 9


In [6]:
# Destination of the pickled variable lists.
ARTEFACTS_DIR = PROJECT_DIRECTORY + '/models/artefacts'

NUM_VARS_PATH = ARTEFACTS_DIR + '/num_vars.pickle'
CAT_VARS_PATH = ARTEFACTS_DIR + '/cat_vars.pickle'
TARGET_VARS_PATH = ARTEFACTS_DIR + '/target_vars.pickle'
AUX_VARS_PATH = ARTEFACTS_DIR + '/aux_vars.pickle'

# Create the artefacts directory if it is missing; opening a file for writing
# inside a non-existent directory raises FileNotFoundError.
os.makedirs(ARTEFACTS_DIR, exist_ok=True)

# Persist each list of column names to its own pickle file.
for artefact_path, variable_list in (
    (NUM_VARS_PATH, NUM_VARS),
    (CAT_VARS_PATH, CAT_VARS),
    (TARGET_VARS_PATH, TARGET),
    (AUX_VARS_PATH, AUXILIARES),
):
    with open(artefact_path, 'wb') as f:
        pickle.dump(variable_list, f)

In [7]:
# Hold out 20% of the rows for testing. The split is stratified on Y so both
# parts keep the same target proportion, and seeded so it is reproducible.
TEST_SIZE = 0.2
RANDOM_STATE = 42

df_train, df_test = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df['Y'],
)

In [8]:
# Sanity check of the split: print the shape and the mean of Y for each part,
# with a single-space line between the two reports.
for position, (title, frame) in enumerate(
    [("=====TREINO=====", df_train), ("=====TESTE=====", df_test)]
):
    if position > 0:
        print(" ")

    print(title)
    print(frame.shape)
    print(frame['Y'].mean())

=====TREINO=====
(24000, 25)
0.22120833333333334
 
=====TESTE=====
(6000, 25)
0.22116666666666668


In [9]:
# Destination of the train/test splits.
PROCESSED_DIR = PROJECT_DIRECTORY + '/data/processed'

# Create the output directory if it is missing, so that the writes below do
# not fail on a checkout where it has not been created yet.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Persist both splits as parquet files.
df_train.to_parquet(PROCESSED_DIR + '/df_train.parquet')
df_test.to_parquet(PROCESSED_DIR + '/df_test.parquet')