In [1]:
# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration.
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None
sns.set(style="whitegrid")

# Input and output locations, relative to the notebook's working directory.
# Edit these two constants if the data lives somewhere else.
RAW_DATA_PATH = "data/raw/fraud_simulated.csv"
PROCESSED_DATA_PATH = "data/processed/fraud_cleaned.csv"


In [2]:
# 2. Load and Inspect Raw Data

# Read the raw transactions from the path configured in the setup cell.
df_raw = pd.read_csv(RAW_DATA_PATH)

# Show first few rows
print("Preview of raw data:")
display(df_raw.head())

# Check structure and types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
print("\nData types and null values:")
df_raw.info()
print(df_raw.isnull().sum())


Preview of raw data:


Unnamed: 0,timestamp,amount,user_id,country,channel,merchant_category,is_fraud
0,2024-01-03 16:15:00,44.02,1422,FR,app,travel,0
1,2024-01-24 05:14:00,85.59,1612,AT,pos,electronics,0
2,2024-01-20 15:17:00,76.67,1750,GB,web,fashion,0
3,2024-01-14 03:59:00,129.83,1753,FR,app,electronics,0
4,2024-01-13 23:46:00,32.56,1339,PL,pos,travel,0



Data types and null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          2000 non-null   object 
 1   amount             2000 non-null   float64
 2   user_id            2000 non-null   int64  
 3   country            2000 non-null   object 
 4   channel            2000 non-null   object 
 5   merchant_category  2000 non-null   object 
 6   is_fraud           2000 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 109.5+ KB
None
timestamp            0
amount               0
user_id              0
country              0
channel              0
merchant_category    0
is_fraud             0
dtype: int64


In [3]:
# 3. Handle Missing or Invalid Values

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df_cleaned = df_raw.copy()

# Ensure an event_type column exists; rows default to 'transaction' when the
# input does not provide one.
if 'event_type' not in df_cleaned.columns:
    df_cleaned['event_type'] = 'transaction'

# Normalise identifier-like columns to stripped strings.
# astype(str) turns genuinely missing values into the literal text 'nan',
# which dropna() further down would not recognise. Record which entries were
# missing before the cast and blank them out again afterwards, together with
# empty strings and textual placeholders for "no value" (case-insensitive).
missing_tokens = {'', 'nan', 'none', 'null'}
str_cols = ['user_id', 'country', 'event_type']
for col in str_cols:
    was_missing = df_cleaned[col].isna()
    as_text = df_cleaned[col].astype(str).str.strip()
    is_placeholder = as_text.str.lower().isin(missing_tokens)
    df_cleaned[col] = as_text.mask(was_missing | is_placeholder, np.nan)

# Drop rows whose amount is not strictly positive (zero or negative).
# NaN amounts also fail the comparison and are removed here.
df_cleaned = df_cleaned[df_cleaned["amount"] > 0]

# Drop rows with remaining missing critical fields
df_cleaned = df_cleaned.dropna(subset=["timestamp", "user_id", "amount"])

print("Remaining rows after cleaning:", len(df_cleaned))


Remaining rows after cleaning: 2000


In [4]:
# 4. Convert timestamp to datetime

# Parse the raw timestamp text; values that cannot be parsed become NaT
# (errors='coerce') and the corresponding rows are discarded right after.
parsed_timestamps = pd.to_datetime(df_cleaned["timestamp"], errors='coerce')
df_cleaned = (
    df_cleaned
    .assign(timestamp=parsed_timestamps)
    .dropna(subset=["timestamp"])
)

# Derive calendar features from the parsed timestamp.
timestamp_parts = df_cleaned["timestamp"].dt
df_cleaned = df_cleaned.assign(
    hour=timestamp_parts.hour,
    day=timestamp_parts.day,
    weekday=timestamp_parts.weekday,  # 0 = Monday, 6 = Sunday
)

# Quick visual check of the new columns next to their source.
print("Preview with temporal features:")
temporal_preview_cols = ["timestamp", "hour", "day", "weekday"]
display(df_cleaned[temporal_preview_cols].head())


Preview with temporal features:


Unnamed: 0,timestamp,hour,day,weekday
0,2024-01-03 16:15:00,16,3,2
1,2024-01-24 05:14:00,5,24,2
2,2024-01-20 15:17:00,15,20,5
3,2024-01-14 03:59:00,3,14,6
4,2024-01-13 23:46:00,23,13,5


In [5]:
# 5. Normalize/Scale amount for ML

from sklearn.preprocessing import MinMaxScaler

# Rescale `amount` with MinMaxScaler's default settings and keep the result in
# a separate column so the original values stay available.
# NOTE(review): the scaler is fit on every row of df_cleaned; if a train/test
# split happens later, consider fitting on the training rows only.
scaler = MinMaxScaler()
amount_frame = df_cleaned[["amount"]]
df_cleaned["amount_scaled"] = scaler.fit_transform(amount_frame)

# Side-by-side look at raw and scaled values.
print("Amount before and after scaling:")
scaling_preview_cols = ["amount", "amount_scaled"]
display(df_cleaned[scaling_preview_cols].head())


Amount before and after scaling:


Unnamed: 0,amount,amount_scaled
0,44.02,0.082887
1,85.59,0.16248
2,76.67,0.145401
3,129.83,0.247185
4,32.56,0.060944


In [6]:
# 6. Remove Duplicate Rows (if any)

# Row counts before and after de-duplication show how many fully identical
# rows were dropped.
initial_count = df_cleaned.shape[0]
df_cleaned = df_cleaned.drop_duplicates()
final_count = df_cleaned.shape[0]

n_removed = initial_count - final_count
print(f"Removed {n_removed} duplicate rows. Remaining: {final_count}")


Removed 0 duplicate rows. Remaining: 2000


In [7]:
# 7. Save Cleaned Data

# Create target folder if it doesn't exist.
# os.path.dirname() returns '' when PROCESSED_DATA_PATH is a bare filename,
# and os.makedirs('') raises FileNotFoundError, so only create the folder
# when there is a directory component.
import os
output_dir = os.path.dirname(PROCESSED_DATA_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the cleaned frame without the row index (index=False).
df_cleaned.to_csv(PROCESSED_DATA_PATH, index=False)

# Summary of what was written.
print(f"Saved cleaned dataset to {PROCESSED_DATA_PATH}")
print(f"Final dataset shape: {df_cleaned.shape}")
print(f"Columns: {list(df_cleaned.columns)}")


Saved cleaned dataset to data/processed/fraud_cleaned.csv
Final dataset shape: (2000, 12)
Columns: ['timestamp', 'amount', 'user_id', 'country', 'channel', 'merchant_category', 'is_fraud', 'event_type', 'hour', 'day', 'weekday', 'amount_scaled']
