In [1]:
# 📓 01_missing_value_handling.ipynb

# === 1. Imports and Setup ===
import pandas as pd
import numpy as np
import sys
import os

# Make the project root (the parent of the current working directory) importable
# so that the `src` package (src.config, src.utils) can be imported below.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath("../")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.config import FRAUD_DATA_PATH, CREDITCARD_DATA_PATH, IP_COUNTRY_PATH
from src.utils import load_csv, check_missing_values

# === 2. Load Data ===
# Read the three raw CSVs in a fixed order (fraud data, credit-card data,
# IP-to-country mapping) and bind each resulting frame to its own name.
fraud_df, credit_df, ip_map_df = [
    load_csv(raw_path)
    for raw_path in (FRAUD_DATA_PATH, CREDITCARD_DATA_PATH, IP_COUNTRY_PATH)
]


[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\raw\Fraud_Data.csv | Shape: (151112, 11)
[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\raw\creditcard.csv | Shape: (284807, 31)
[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\raw\IpAddress_to_Country.csv | Shape: (138846, 3)


In [2]:
# === 3. Missing Value Inspection ===
# Print a heading, then call the project helper as the cell's last expression so
# that its report is rendered directly under the heading.
# NOTE(review): the rendered report lists 10 columns, but fraud_df was loaded
# with 11 (there is no row for 'class'). Presumably the helper or the display
# truncates the table; confirm that 'class' is actually being checked.
print("Missing Values in Fraud_Data.csv")
check_missing_values(fraud_df)


Missing Values in Fraud_Data.csv


Unnamed: 0,Missing Count,Missing %
user_id,0,0.0
signup_time,0,0.0
purchase_time,0,0.0
purchase_value,0,0.0
device_id,0,0.0
source,0,0.0
browser,0,0.0
sex,0,0.0
age,0,0.0
ip_address,0,0.0


In [3]:
# Same inspection for the credit-card frame: heading first, helper call last so
# its report is rendered under the heading.
# NOTE(review): only 10 rows (Time, V1..V9) appear in the rendered report while
# credit_df was loaded with 31 columns; presumably truncated; verify that the
# remaining columns are covered by the check.
print("Missing Values in creditcard.csv")
check_missing_values(credit_df)


Missing Values in creditcard.csv


Unnamed: 0,Missing Count,Missing %
Time,0,0.0
V1,0,0.0
V2,0,0.0
V3,0,0.0
V4,0,0.0
V5,0,0.0
V6,0,0.0
V7,0,0.0
V8,0,0.0
V9,0,0.0


In [4]:
# Same inspection for the IP-to-country mapping frame: heading first, helper
# call last so its report is rendered under the heading.
print("Missing Values in IpAddress_to_Country.csv")
check_missing_values(ip_map_df)


Missing Values in IpAddress_to_Country.csv


Unnamed: 0,Missing Count,Missing %
lower_bound_ip_address,0,0.0
upper_bound_ip_address,0,0.0
country,0,0.0


In [5]:
# === 4. Handle Missing Values ===
# The inspection cells report zero missing values for the current files, so each
# step below acts as a safeguard for refreshed or changed inputs.

# Fraud_Data.csv: impute instead of dropping, so the row count is unchanged.
#   - 'age' (numeric): fill with the column median.
#   - 'browser', 'source' (categorical): fill with the literal 'unknown'.
# The filled column is assigned back rather than filled in place on a column
# selection, which is the pattern recent pandas versions warn about.
fraud_df['age'] = fraud_df['age'].fillna(fraud_df['age'].median())
fraud_df['browser'] = fraud_df['browser'].fillna('unknown')
fraud_df['source'] = fraud_df['source'].fillna('unknown')

# Only the three columns above are imputed; flag any other fraud_df column that
# still contains NaN so it cannot slip unnoticed into the saved "cleaned" file.
fraud_cols_still_missing = fraud_df.columns[fraud_df.isnull().any()].tolist()
if fraud_cols_still_missing:
    print(f"[WARN] fraud_df still has missing values in: {fraud_cols_still_missing}")

# creditcard.csv and the IP mapping: drop every row that contains a missing
# value. The result is reassigned instead of using inplace=True, so each line
# reads as a plain "new frame from old frame" step.
credit_df = credit_df.dropna()
ip_map_df = ip_map_df.dropna()


In [6]:
# === 5. Save Cleaned Outputs ===
from src.config import FRAUD_CLEANED_PATH, CREDITCARD_CLEANED_PATH

# DataFrame.to_csv does not create missing parent directories, so make sure each
# destination folder exists first (e.g. on a fresh clone where the output folder
# has not been created yet). abspath() guarantees a non-empty directory part even
# when a path is a bare file name.
for cleaned_path in (FRAUD_CLEANED_PATH, CREDITCARD_CLEANED_PATH):
    os.makedirs(os.path.dirname(os.path.abspath(cleaned_path)), exist_ok=True)

# index=False keeps the pandas row index out of the written files.
fraud_df.to_csv(FRAUD_CLEANED_PATH, index=False)
credit_df.to_csv(CREDITCARD_CLEANED_PATH, index=False)

# NOTE(review): ip_map_df also goes through dropna in the cleaning cell but is
# not written out here; confirm that is intended.
print("[] Cleaned data saved to data/processed/")


[] Cleaned data saved to data/processed/


In [7]:
# Per-column count of nulls still present in the fraud frame after cleaning.
print(" Remaining missing values in fraud_df:", fraud_df.isnull().sum(), sep="\n")

# First rows of the cleaned fraud frame, as a quick visual check.
print("\n Preview of cleaned fraud_df:", fraud_df.head(), sep="\n")

# Dimensions of the fraud frame once missing values have been handled.
print(f"\n fraud_df shape after missing value handling: {fraud_df.shape}")

# Final shapes of all three frames, one per line.
print(
    f"Fraud Data Shape: {fraud_df.shape}",
    f"Credit Card Data Shape: {credit_df.shape}",
    f"IP Mapping Shape: {ip_map_df.shape}",
    sep="\n",
)


 Remaining missing values in fraud_df:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

 Preview of cleaned fraud_df:
   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  
3  ATGTX