## Import Dependencies

In [1]:
import pandas as pd
import numpy as np

## Load Dataset

In [3]:
# Read the merged fraud CSV into `fraud_data`. The path is relative to the
# notebook's working directory.
try:
    fraud_data = pd.read_csv('../data/Fraud_Data_merged.csv')
    print("Dataset loaded for EDA.")
except FileNotFoundError as e:
    print(f"Error loading file for EDA: {e}. Please ensure the CSV files are in the data directory.")
    # Re-raise rather than calling exit(): exit() is an interactive-shell
    # helper and is not meant for use in programs. Propagating the original
    # exception stops this cell with a traceback, so later cells cannot
    # silently run without `fraud_data` being defined.
    raise

Dataset loaded for EDA.


## Feature Engineering

In [7]:
# Make sure purchase_time and signup_time are datetime columns.
# A column that is not already datetime-typed is parsed with errors='coerce'
# (unparseable values become NaT) and the rows where it ended up missing are
# then removed from fraud_data in place.
for ts_col in ('purchase_time', 'signup_time'):
    already_datetime = pd.api.types.is_datetime64_any_dtype(fraud_data[ts_col])
    if already_datetime:
        continue
    fraud_data[ts_col] = pd.to_datetime(fraud_data[ts_col], errors='coerce')
    fraud_data.dropna(subset=[ts_col], inplace=True)

print("purchase_time and signup_time are converted to datetime objects")

purchase_time and signup_time are converted to datetime objects


## Time-Based Features

In [8]:
print("Creating time-based features...")

SECONDS_PER_DAY = 60 * 60 * 24
purchase_ts = fraud_data['purchase_time']
signup_ts = fraud_data['signup_time']

# Hour of the purchase and its weekday (Monday=0, Sunday=6).
fraud_data['hour_of_day'] = purchase_ts.dt.hour
fraud_data['day_of_week'] = purchase_ts.dt.dayofweek

# Elapsed time between signup and purchase, expressed in (fractional) days.
signup_to_purchase = purchase_ts - signup_ts
fraud_data['time_since_signup'] = signup_to_purchase.dt.total_seconds() / SECONDS_PER_DAY

print("Time-based features 'hour_of_day', 'day_of_week', 'time_since_signup' created.")

Creating time-based features...
Time-based features 'hour_of_day', 'day_of_week', 'time_since_signup' created.


## Transaction Frequency and Velocity

In [9]:
# Transaction-count features: for each grouping key, attach to every row the
# number of transactions that share that key.
print("Creating transaction frequency features...")

# (grouping column, name of the count feature derived from it)
count_feature_specs = (
    ('user_id', 'user_transaction_count'),
    ('device_id', 'device_transaction_count'),
    ('ip_address_int', 'ip_transaction_count'),
)
for group_key, feature_name in count_feature_specs:
    # transform('count') broadcasts each group's count of non-null
    # purchase_time values back onto every row belonging to that group.
    fraud_data[feature_name] = fraud_data.groupby(group_key)['purchase_time'].transform('count')
    print(f"Feature '{feature_name}' created.")

# Summarise the frame after the new columns have been added.
print("\n--- Fraud Data Info After Feature Engineering ---")
fraud_data.info()

print("\nFirst 5 rows of Fraud_Data.csv with new engineered features:")
print(fraud_data.head())

# NOTE(review): this message mentions 'Time', 'Amount' and V1-V28, none of
# which are created or used in this cell; confirm it belongs in this notebook.
print("The 'Time' and 'Amount' features, along with V1-V28, are already quite rich.")

print("\n--- Feature Engineering Complete ---")


Creating transaction frequency features...
Feature 'user_transaction_count' created.
Feature 'device_transaction_count' created.
Feature 'ip_transaction_count' created.

--- Fraud Data Info After Feature Engineering ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   user_id                   151112 non-null  int64         
 1   signup_time               151112 non-null  datetime64[ns]
 2   purchase_time             151112 non-null  datetime64[ns]
 3   purchase_value            151112 non-null  int64         
 4   device_id                 151112 non-null  object        
 5   source                    151112 non-null  object        
 6   browser                   151112 non-null  object        
 7   sex                       151112 non-null  object        
 8   age                       151112 n