In [1]:
# 📓 02_cleaning_and_type_conversion.ipynb

# === 1. Imports and Setup ===
import pandas as pd
import numpy as np
import sys
import os

# Make the parent of the current working directory importable so that the
# `from src.<module> import ...` statements below can resolve.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-directory of the project root (the directory containing src/) - confirm.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
PROJECT_ROOT = os.path.abspath("../")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.config import FRAUD_CLEANED_PATH, FRAUD_WITH_GEO_PATH
from src.utils import load_csv, convert_to_datetime

# === 2. Load Pre-Cleaned Data ===
# Read the file at FRAUD_CLEANED_PATH (from src.config) into fraud_df using
# the project's load_csv helper (from src.utils).
# NOTE(review): load_csv presumably returns a pandas DataFrame, since later
# cells call drop_duplicates/head/to_csv on the result - confirm in src.utils.
# The same path is written back to by the save step at the end of this
# notebook, so this input file is overwritten on every full run.
fraud_df = load_csv(FRAUD_CLEANED_PATH)


[INFO] Loaded data: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\processed\fraud_cleaned.csv | Shape: (151112, 11)


In [2]:
# === 3. Remove Duplicates ===
# Capture the row count on either side of the de-duplication so the number
# of dropped rows can be reported. Only the first occurrence of each fully
# identical row is kept.
before = len(fraud_df)
fraud_df = fraud_df.drop_duplicates(keep="first")
after = len(fraud_df)

print(f" Removed {before - after} duplicate rows.")


 Removed 0 duplicate rows.


In [3]:
# === 4. Convert to Datetime ===
# Hand both timestamp columns to the project helper in one call and rebind
# fraud_df to whatever it returns.
fraud_df = convert_to_datetime(fraud_df, ["signup_time", "purchase_time"])

# Confirm conversion by reading each column's dtype from the frame-level
# dtype listing.
column_dtypes = fraud_df.dtypes
print("signup_time dtype:", column_dtypes["signup_time"])
print("purchase_time dtype:", column_dtypes["purchase_time"])


signup_time dtype: datetime64[ns]
purchase_time dtype: datetime64[ns]


In [4]:
# === 5. Check Data Types ===
# Print the heading and the per-column dtype listing with a single call;
# the newline separator places each on its own line.
print("\n Data Types:", fraud_df.dtypes, sep="\n")



 Data Types:
user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
dtype: object


In [5]:
# === 6. Preview Final Cleaned Data ===
# Take the first five rows and print them beneath a heading.
preview = fraud_df.head(n=5)
print("Preview of cleaned fraud_df:")
print(preview)


Preview of cleaned fraud_df:
   user_id         signup_time       purchase_time  purchase_value  \
0    22058 2015-02-24 22:55:49 2015-04-18 02:47:11              34   
1   333320 2015-06-07 20:39:50 2015-06-08 01:38:54              16   
2     1359 2015-01-01 18:52:44 2015-01-01 18:52:45              15   
3   150084 2015-04-28 21:13:25 2015-05-04 13:54:50              44   
4   221365 2015-07-21 07:09:52 2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0  
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0  


In [6]:
# === 7. Save the Cleaned Output Again (overwriting fraud_cleaned.csv) ===
# Write the frame back to the path it was loaded from, leaving the row index
# out of the file, then report where it went.
fraud_df.to_csv(path_or_buf=FRAUD_CLEANED_PATH, index=False)
print(f"Saved updated cleaned data to: {FRAUD_CLEANED_PATH}")


Saved updated cleaned data to: C:\Users\yitbie\Desktop\10A\project\fraud-detection-project\data\processed\fraud_cleaned.csv
