In [1]:
# Core Libraries
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Input / output locations (adjust if needed)
RAW_DATA_PATH = "../data/raw/fraud_simulated.csv"
PROCESSED_DATA_PATH = "../data/processed/fraud_cleaned.csv"

# Notebook display settings: never truncate wide frames, white-grid seaborn theme
pd.options.display.max_columns = None
sns.set(style="whitegrid")


In [2]:
# 2. Load and Inspect Raw Data

# Load CSV file
df_raw = pd.read_csv(RAW_DATA_PATH)

# Show first few rows
print("Preview of raw data:")
display(df_raw.head())

# Check structure and types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nData types and null values:")
df_raw.info()
print(df_raw.isnull().sum())


Preview of raw data:


Unnamed: 0,timestamp,user_id,amount,country,event_type
0,2025-07-24 10:15:00,USER123,199.5,DE,login



Data types and null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   timestamp   1 non-null      object 
 1   user_id     1 non-null      object 
 2   amount      1 non-null      float64
 3   country     1 non-null      object 
 4   event_type  1 non-null      object 
dtypes: float64(1), object(4)
memory usage: 172.0+ bytes
None
timestamp     0
user_id       0
amount        0
country       0
event_type    0
dtype: int64


In [3]:
# 3. Handle Missing or Invalid Values

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df_cleaned = df_raw.copy()

# Normalise string columns: trim whitespace and turn blanks / textual
# placeholders into real NaN.
str_cols = ['user_id', 'country', 'event_type']
# Compared case-insensitively after stripping. 'NA' is deliberately not listed
# so that a literal "NA" value (e.g. a country code) is never discarded here.
missing_tokens = ['', 'nan', 'none', 'null']
for col in str_cols:
    # Remember genuine NaN first: astype(str) would turn them into the
    # literal string 'nan', which dropna() below could no longer detect.
    was_missing = df_cleaned[col].isna()
    stripped = df_cleaned[col].astype(str).str.strip()
    is_placeholder = stripped.str.lower().isin(missing_tokens)
    df_cleaned[col] = stripped.mask(was_missing | is_placeholder)

# Coerce amount to numeric so unparseable entries become NaN instead of making
# the comparison below raise on a non-numeric column.
df_cleaned["amount"] = pd.to_numeric(df_cleaned["amount"], errors="coerce")

# Drop rows with invalid amounts (zero, negative or NaN; NaN > 0 is False),
# then rows still missing a critical field. The final copy() detaches the
# result from the filtered view so later column assignments are unambiguous.
df_cleaned = (
    df_cleaned[df_cleaned["amount"] > 0]
    .dropna(subset=["timestamp", "user_id", "amount"])
    .copy()
)

print("Remaining rows after cleaning:", len(df_cleaned))


Remaining rows after cleaning: 1


In [4]:
# 4. Convert timestamp to datetime
# Unparseable values are coerced to NaT and the affected rows are discarded.
df_cleaned = df_cleaned.assign(
    timestamp=pd.to_datetime(df_cleaned["timestamp"], errors='coerce')
).dropna(subset=["timestamp"])

# Derive calendar features from the parsed timestamp
df_cleaned = df_cleaned.assign(
    hour=lambda frame: frame["timestamp"].dt.hour,
    day=lambda frame: frame["timestamp"].dt.day,
    weekday=lambda frame: frame["timestamp"].dt.weekday,  # 0 = Monday, 6 = Sunday
)

# Optional: eyeball the new columns next to their source
print("Preview with temporal features:")
temporal_cols = ["timestamp", "hour", "day", "weekday"]
display(df_cleaned[temporal_cols].head())


Preview with temporal features:


Unnamed: 0,timestamp,hour,day,weekday
0,2025-07-24 10:15:00,10,24,3


In [5]:
# 5. Normalize/Scale amount for ML

# Min-max scaling maps the observed amount range onto [0, 1].
# MinMaxScaler is imported with the other libraries at the top of the notebook.
scaler = MinMaxScaler()

if df_cleaned.empty:
    # fit_transform raises ValueError when given zero rows, which can happen
    # if the earlier cleaning steps filtered everything out. Add an empty
    # float column instead so the following cells still run.
    df_cleaned = df_cleaned.assign(amount_scaled=pd.Series(dtype="float64"))
else:
    # fit_transform returns an (n_rows, 1) array; flatten it to one value per row.
    scaled_amount = scaler.fit_transform(df_cleaned[["amount"]]).ravel()
    df_cleaned = df_cleaned.assign(amount_scaled=scaled_amount)

# Optional: Check distribution
print("Amount before and after scaling:")
display(df_cleaned[["amount", "amount_scaled"]].head())


Amount before and after scaling:


Unnamed: 0,amount,amount_scaled
0,199.5,0.0


In [6]:
# 6. Remove Duplicate Rows (if any)

# Flag every row that repeats an earlier, fully identical row; first occurrences are kept.
repeated_rows = df_cleaned.duplicated()

initial_count = len(df_cleaned)
df_cleaned = df_cleaned.loc[~repeated_rows]
final_count = len(df_cleaned)

print(f"Removed {initial_count - final_count} duplicate rows. Remaining: {final_count}")


Removed 0 duplicate rows. Remaining: 1


In [7]:
# 7. Save Cleaned Data

# Create the target folder if it doesn't exist.
# Path.parent of a bare file name is ".", so this also works when
# PROCESSED_DATA_PATH has no directory component; os.path.dirname would
# return "" in that case and os.makedirs("") raises FileNotFoundError.
output_path = Path(PROCESSED_DATA_PATH)
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save cleaned data (without the index, so it is not written as an extra column)
df_cleaned.to_csv(output_path, index=False)

# Optional: Logging
print(f"Saved cleaned dataset to {PROCESSED_DATA_PATH}")
print(f"Final dataset shape: {df_cleaned.shape}")
print(f"Columns: {list(df_cleaned.columns)}")


Saved cleaned dataset to ../data/processed/fraud_cleaned.csv
Final dataset shape: (1, 9)
Columns: ['timestamp', 'user_id', 'amount', 'country', 'event_type', 'hour', 'day', 'weekday', 'amount_scaled']
