In [22]:
# 📓 05_geolocation_merge.ipynb

# === 1. Imports and Setup ===
import pandas as pd
import numpy as np
import ipaddress
import os
import sys

# Append the parent directory (resolved to an absolute path) to the module
# search path. Presumably this is what makes the `src` package importable when
# the notebook is run from a subdirectory — confirm against the repo layout.
sys.path.append(os.path.abspath("../"))

# Project-local path constants and the CSV loading helper used below.
from src.config import FRAUD_CLEANED_PATH, IP_COUNTRY_PATH, FRAUD_WITH_GEO_PATH
from src.utils import load_csv


In [23]:
# === 2. Load Data ===
# Load the fraud CSV first and the IP-to-country CSV second, in the same
# order as two separate load_csv calls would.
fraud_df, ip_map_df = (
    load_csv(csv_path) for csv_path in (FRAUD_CLEANED_PATH, IP_COUNTRY_PATH)
)


[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\processed\fraud_cleaned.csv | Shape: (151112, 11)
[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\raw\IpAddress_to_Country.csv | Shape: (138846, 3)


In [24]:
# Check sample IPs
# Show the first ten ip_address values followed by the column's dtype.
ip_column = fraud_df['ip_address']
print("Sample IP values:", ip_column.head(10), sep="\n")
print("Dtype:", ip_column.dtype)


Sample IP values:
0    7.327584e+08
1    3.503114e+08
2    2.621474e+09
3    3.840542e+09
4    4.155831e+08
5    2.809315e+09
6    3.987484e+09
7    1.692459e+09
8    3.719094e+09
9    3.416747e+08
Name: ip_address, dtype: float64
Dtype: float64


In [25]:

# Check if all IPs are numeric
# A value counts as numeric when it is a Python int or float. NaN is itself a
# float, so missing IPs are NOT reported by this check.
is_numeric_ip = fraud_df['ip_address'].apply(
    lambda value: isinstance(value, (int, float))
)
non_numeric_ips = fraud_df.loc[~is_numeric_ip]

print(f"❗ Non-numeric IP entries: {len(non_numeric_ips)}")
print(non_numeric_ips.head())


❗ Non-numeric IP entries: 0
Empty DataFrame
Columns: [user_id, signup_time, purchase_time, purchase_value, device_id, source, browser, sex, age, ip_address, class]
Index: []


In [29]:
# Drop rows where ip_address is NaN.
# .copy() makes the result an independent frame, so adding the ip_int column
# below cannot raise SettingWithCopyWarning or write into a view.
fraud_df = fraud_df.dropna(subset=["ip_address"]).copy()

# Guard the cast: a float outside [0, 2**32) does not fit in uint32, and numpy
# would wrap it (or produce an undefined value) instead of failing, which would
# silently assign the row to the wrong IP range later on.
ip_out_of_range = ~((fraud_df['ip_address'] >= 0) & (fraud_df['ip_address'] < 2**32))
if ip_out_of_range.any():
    raise ValueError(
        f"{int(ip_out_of_range.sum())} ip_address value(s) fall outside the "
        "IPv4 integer range [0, 2**32) and cannot be converted to uint32."
    )

# Convert to uint32 (ensure match with IP range); any fractional part is
# truncated by the cast.
fraud_df['ip_int'] = fraud_df['ip_address'].astype(np.uint32)

print("Cleaned and converted ip_address to ip_int.")


Cleaned and converted ip_address to ip_int.


In [30]:
# === Range Join Function ===
def find_country(ip):
    """Return the country whose IP range contains ``ip``.

    Scans ``ip_map_df`` for rows where
    ``lower_bound_ip_address <= ip <= upper_bound_ip_address`` and returns the
    ``country`` of the first matching row, or ``"Unknown"`` when no row matches.

    This is a full-table scan per call; use ``_find_countries`` to look up a
    whole column at once.
    """
    match = ip_map_df[
        (ip_map_df['lower_bound_ip_address'] <= ip) &
        (ip_map_df['upper_bound_ip_address'] >= ip)
    ]
    return match['country'].values[0] if not match.empty else "Unknown"


def _find_countries(ips):
    """Look up the country for every value in ``ips`` with one binary search.

    Parameters
    ----------
    ips : pandas.Series
        Integer IP values to resolve against ``ip_map_df``.

    Returns
    -------
    pandas.Series
        Country per input value (``"Unknown"`` where no range contains it),
        aligned to ``ips.index``.

    Notes
    -----
    The binary search is only equivalent to ``find_country`` when the ranges,
    sorted by lower bound, do not overlap. If they do (or a bound is NaN), this
    falls back to the row-wise ``find_country`` scan so results never change.
    """
    ranges = ip_map_df.sort_values('lower_bound_ip_address', kind='stable')
    lowers = ranges['lower_bound_ip_address'].to_numpy(dtype=np.float64)
    uppers = ranges['upper_bound_ip_address'].to_numpy(dtype=np.float64)
    countries = ranges['country'].to_numpy(dtype=object)

    # Each range must end strictly before the next one starts; NaN bounds make
    # this comparison False and therefore also take the safe fallback.
    if len(lowers) > 1 and not (uppers[:-1] < lowers[1:]).all():
        return ips.apply(find_country)

    ip_values = ips.to_numpy(dtype=np.float64)

    # Index of the last range whose lower bound is <= ip (-1 if there is none).
    pos = np.searchsorted(lowers, ip_values, side='right') - 1

    # A candidate only counts when the ip is also within its upper bound.
    in_range = pos >= 0
    in_range[in_range] = ip_values[in_range] <= uppers[pos[in_range]]

    result = np.full(len(ip_values), "Unknown", dtype=object)
    result[in_range] = countries[pos[in_range]]
    return pd.Series(result, index=ips.index, name=ips.name)


fraud_df['country'] = _find_countries(fraud_df['ip_int'])


In [31]:
# === Preview + Save ===
# Show the raw IP, its integer form and the resolved country for the first rows.
print(fraud_df.loc[:, ['ip_address', 'ip_int', 'country']].head())

# Write the enriched frame to the configured path, without the index column.
fraud_df.to_csv(FRAUD_WITH_GEO_PATH, index=False)
print(f" Saved merged dataset to {FRAUD_WITH_GEO_PATH}")


     ip_address      ip_int        country
0  7.327584e+08   732758368          Japan
1  3.503114e+08   350311387  United States
2  2.621474e+09  2621473820  United States
3  3.840542e+09  3840542443        Unknown
4  4.155831e+08   415583117  United States
 Saved merged dataset to C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\processed\fraud_with_geo.csv


Sample IP values:
Series([], Name: ip_address, dtype: float64)
Dtype: float64
