In [1]:
import pandas as pd
from pathlib import Path

# Caminho do CSV
RAW_PATH = Path("../data/raw/creditcard.csv")

# Ler CSV
df = pd.read_csv(RAW_PATH)

# Verificar as primeiras linhas
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Valores nulos
print("Valores nulos por coluna:")
print(df.isnull().sum())

# Linhas duplicadas
print("\nNúmero de linhas duplicadas:", df.duplicated().sum())

# Remover duplicados
df = df.drop_duplicates()
print("Número de linhas após remover duplicados:", len(df))


Valores nulos por coluna:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

Número de linhas duplicadas: 1081
Número de linhas após remover duplicados: 283726


In [5]:
# Converter coluna 'Class' para inteiro (se não estiver)
df['Class'] = df['Class'].astype(int)

# Mostrar tipos de dados
print(df.dtypes)


Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int32
dtype: object


In [7]:
# Separar features e target
X = df.drop('Class', axis=1)
y = df['Class']

print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (283726, 30)
Target shape: (283726,)


In [None]:
# Caminho para salvar dataset limpo
CLEAN_PATH = Path("../data/clean/creditcard_clean.csv")

# Criar pasta clean se não existir
CLEAN_PATH.parent.mkdir(parents=True, exist_ok=True)

# Salvar CSV limpo
df.to_csv(CLEAN_PATH, index=False)
print("Dataset limpo salvo em:", CLEAN_PATH)
