# Data Preprocessing for Fraud Detection

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle
import os

In [None]:
# Load data
# Read the transactions CSV into `df`, then report its shape and how many
# rows are flagged in the 'Class' column (count and percentage).
csv_path = '../data/creditcard.csv'
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")
print(f"Fraud cases: {df['Class'].sum()} ({df['Class'].mean()*100:.2f}%)")
# Trailing expression: rendered as the cell output when run in a notebook.
df.head()

In [None]:
# Feature engineering
#
# Fit ONE RobustScaler on both raw columns at once. RobustScaler centers and
# scales every column independently, so the resulting values are the same as
# two separate fits would give, but the single fitted object keeps the
# statistics for both 'Amount' and 'Time'. Calling fit_transform twice on the
# same instance refits it, which would leave only the 'Time' statistics in
# `scaler` and make the saved scaler unable to reproduce 'Amount_scaled'.
#
# Consequence for consumers: `scaler.transform` now expects the two columns
# ['Amount', 'Time'] (in that order) and returns the two scaled columns.
#
# NOTE(review): the scaler is fit on the full frame, before the train/test
# split, so held-out rows influence the scaling statistics. Consider fitting
# on the training rows only.
scaler = RobustScaler()
scaled_amount_time = scaler.fit_transform(df[['Amount', 'Time']])
df['Amount_scaled'] = scaled_amount_time[:, 0]
df['Time_scaled'] = scaled_amount_time[:, 1]

# Hour-of-day as a float in [0, 24).
# NOTE(review): this treats 'Time' as elapsed seconds; confirm against the
# dataset description.
df['Hour'] = (df['Time'] / 3600) % 24

# log(1 + Amount) compresses the range of 'Amount' and is defined at 0.
df['Amount_log'] = np.log1p(df['Amount'])

# Model inputs: every column whose name starts with 'V', followed by the
# engineered columns in a fixed order.
feature_cols = [col for col in df.columns if col.startswith('V')] + ['Amount_scaled', 'Time_scaled', 'Hour', 'Amount_log']
X = df[feature_cols]
y = df['Class']

In [None]:
# Train-test split
# Hold out 20% of the rows for testing. `stratify=y` asks scikit-learn to keep
# the class proportions of `y` in both partitions; the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Apply SMOTE
# Only the training partition is resampled; the test partition is untouched.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print(f"Original: {y_train.value_counts()}")
print(f"Balanced: {pd.Series(y_train_balanced).value_counts()}")

In [None]:
# Save data
# Everything is written under ../models; create the directory if needed.
os.makedirs('../models', exist_ok=True)

# (destination, object) pairs written with np.save, in this order.
array_outputs = [
    ('../models/X_train.npy', X_train.values),
    ('../models/X_test.npy', X_test.values),
    ('../models/y_train.npy', y_train.values),
    ('../models/y_test.npy', y_test.values),
    ('../models/X_train_balanced.npy', X_train_balanced),
    ('../models/y_train_balanced.npy', y_train_balanced),
]
for npy_path, payload in array_outputs:
    np.save(npy_path, payload)

# (destination, object) pairs written with pickle: the ordered feature-name
# list and the fitted scaler object.
pickle_outputs = [
    ('../models/feature_names.pkl', feature_cols),
    ('../models/scaler.pkl', scaler),
]
for pkl_path, obj in pickle_outputs:
    with open(pkl_path, 'wb') as f:
        pickle.dump(obj, f)

print("Preprocessing complete.")