In [1]:
# Imports & Load Clean Copy
# Core
import pandas as pd
import numpy as np
from pathlib import Path

# ML preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Paths
DATA_DIR = Path("../data")
df = pd.read_csv(DATA_DIR / "Base.csv")

print(f"Shape before cleaning: {df.shape}")
df.head(3)

Shape before cleaning: (1000000, 32)


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0


In [2]:
# Replace -1 with np.nan for all numeric columns
df.replace(-1, np.nan, inplace=True)

# Confirm
df.isna().sum().sort_values(ascending=False).head(10)

prev_address_months_count       712920
bank_months_count               253635
current_address_months_count      4254
session_length_in_minutes         2015
credit_risk_score                  488
device_distinct_emails_8w          359
keep_alive_session                   0
device_os                            0
email_is_free                        0
source                               0
dtype: int64

In [8]:
# Separate feature types
num_cols = df.select_dtypes(include=[np.number]).columns.drop("fraud_bool")
cat_cols = df.select_dtypes(include=["object"]).columns.tolist() + [
    "employment_status", "housing_status", "payment_type", "device_os", "source"
]

# Numeric: fill with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical: fill with mode
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Recheck
df.isna().sum().sum()

np.int64(0)

In [9]:
# Encode Categorical Variables
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

df[cat_cols].head()

Unnamed: 0,payment_type,employment_status,housing_status,source,device_os,employment_status.1,housing_status.1,payment_type.1,device_os.1,source.1
0,0,1,2,0,0,1,2,0,0,0
1,3,0,2,0,2,0,2,3,2,0
2,1,0,2,0,3,0,2,1,3,0
3,1,0,2,0,0,0,2,1,0,0
4,0,0,2,0,2,0,2,0,2,0


In [10]:
# Feature Scaling
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df[num_cols].head(3)

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,-0.904778,1.704497,-0.212215,-0.700493,0.524782,-0.189335,4.634885,-0.510946,2.469193,2.082983,...,0.352143,-0.592153,-0.535706,2.01852,-0.160921,1.080115,0.856307,-0.107716,0.0,-1.488092
1,0.817326,0.427954,-0.212215,0.02473,-1.13831,-0.188711,-0.470003,0.084852,1.182299,0.659452,...,0.352143,-1.295009,-0.535706,2.01852,-0.160921,-0.522567,0.856307,-0.107716,0.0,-1.488092
2,0.817326,1.739779,-0.90385,-0.825141,0.524782,-0.188298,-0.501671,-0.475139,-0.396701,0.474717,...,0.352143,1.516417,-0.535706,-0.64782,-0.160921,1.89083,-1.167806,-0.107716,0.0,-1.488092


In [11]:
# Handle Class Imbalance
from sklearn.model_selection import train_test_split

X = df.drop(columns="fraud_bool")
y = df["fraud_bool"]

# Split before SMOTE to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Before SMOTE:")
print(y_train.value_counts(normalize=True))

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print(y_train_res.value_counts(normalize=True))

Before SMOTE:
fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64





After SMOTE:
fraud_bool
0    0.5
1    0.5
Name: proportion, dtype: float64


In [12]:
# Save Prepared Data
PREPARED_DIR = DATA_DIR / "processed"
PREPARED_DIR.mkdir(exist_ok=True)

X_train_res.to_csv(PREPARED_DIR / "X_train.csv", index=False)
y_train_res.to_csv(PREPARED_DIR / "y_train.csv", index=False)
X_test.to_csv(PREPARED_DIR / "X_test.csv", index=False)
y_test.to_csv(PREPARED_DIR / "y_test.csv", index=False)

# Data Preparation Summary

All features were cleaned and standardized for modeling.  
- Replaced -1 placeholders with NaN and imputed missing values (median for numeric, mode for categorical).  
- Encoded categorical variables using LabelEncoder.  
- Applied StandardScaler to numeric columns to normalize magnitudes.  
- Addressed severe class imbalance (fraud ≈ 1.1%) using SMOTE, achieving a balanced training set.  
- Exported ready-to-model datasets to `data/processed/`.