# Feature Engineering

This notebook creates behavior-based, time-based, and geolocation features to improve fraud detection performance.


In [11]:
import pandas as pd
import numpy as np
import ipaddress
from sklearn.preprocessing import StandardScaler

# Load the raw transactions and parse both timestamp columns into datetimes
# in a single chained expression.
df = pd.read_csv("../data/raw/Fraud_Data.csv").assign(
    signup_time=lambda raw: pd.to_datetime(raw['signup_time']),
    purchase_time=lambda raw: pd.to_datetime(raw['purchase_time']),
)


In [12]:
# Time-based features derived from the two timestamps:
#   time_since_signup - hours elapsed between signup and purchase
#   hour_of_day       - hour component of the purchase timestamp
#   day_of_week       - weekday of the purchase (pandas: Monday=0 ... Sunday=6)
df = df.assign(
    time_since_signup=(df['purchase_time'] - df['signup_time']).dt.total_seconds().div(3600),
    hour_of_day=df['purchase_time'].dt.hour,
    day_of_week=df['purchase_time'].dt.weekday,
)


Fraudsters often perform transactions shortly after account creation.  
Time-based features help capture this behavioral signal.


In [13]:
# Order each user's purchases chronologically so diff() compares a purchase
# with the same user's previous one.
df = df.sort_values(by=['user_id', 'purchase_time'])

# Seconds elapsed since the user's previous purchase.
df['transaction_velocity'] = df.groupby('user_id')['purchase_time'].diff().dt.total_seconds()

# A user's first purchase has no predecessor (diff is NaN); it is replaced
# with the sentinel 999999.
df['transaction_velocity'] = df['transaction_velocity'].fillna(999999)


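High transaction velocity is a known fraud indicator, because fraudsters often attempt several transactions in quick succession before they are detected. Here it is measured as the number of seconds since the same user's previous purchase, so smaller values mean higher velocity.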


In [14]:
# Map every transaction's IP address to a country with a range lookup.

# 1. Build an int64 join key on the transactions.
#    merge_asof needs both keys to share a dtype and to be sorted.
#    Values that cannot be parsed as numbers become NaN and are then set to 0.
df['ip_int'] = pd.to_numeric(df['ip_address'], errors='coerce').fillna(0).astype('int64')
df = df.sort_values('ip_int')

# 2. Load the IP-range -> country table and cast its bounds to int64 so they
#    match the dtype of df['ip_int'].
ip_map = pd.read_csv("../data/raw/IpAddress_to_Country.csv")
ip_map['lower_bound_ip_address'] = ip_map['lower_bound_ip_address'].astype('int64')
ip_map['upper_bound_ip_address'] = ip_map['upper_bound_ip_address'].astype('int64')
ip_map = ip_map.sort_values('lower_bound_ip_address')

# 3. Backward as-of join: each transaction is paired with the last range
#    whose lower bound is <= its ip_int.
df = pd.merge_asof(
    df,
    ip_map,
    left_on='ip_int',
    right_on='lower_bound_ip_address'
)

# 4. Cleanup.
#    (a) merge_asof only looks at the lower bound, so an IP that falls past
#        the matched range's upper bound is not really inside that range.
df.loc[df['ip_int'] > df['upper_bound_ip_address'], 'country'] = "Unknown"
#    (b) Rows with no matching range at all come back with NaN bounds and a
#        NaN country; the comparison above is False for NaN, so they must be
#        labelled explicitly. Left as NaN they would be one-hot encoded as
#        all zeros, which is indistinguishable from the dropped baseline
#        category.
df['country'] = df['country'].fillna("Unknown")

# NOTE(review): ip_int, lower_bound_ip_address and upper_bound_ip_address stay
# in df after this cell and are not in the later drop list, so they flow into
# the feature matrix - confirm that is intended.

In [15]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, which becomes the implicit baseline.
categorical_cols = ['browser', 'source', 'sex', 'country']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# No scaling here: 'purchase_value' and 'time_since_signup' are standardised
# together with the other numeric columns once the feature matrix X has been
# built. Standardising them in this cell as well was redundant (re-scaling an
# already standardised column is a no-op) and left behind a scaler that was
# immediately overwritten.


In [22]:
# Remove columns that SMOTE or the models cannot handle: the two original
# datetime columns, the two ID columns, and the raw IP address (already
# mapped to a country). errors='ignore' skips any column that is absent, so
# the cell can be re-run safely.
df = df.drop(
    ['signup_time', 'purchase_time', 'user_id', 'device_id', 'ip_address'],
    axis=1,
    errors='ignore',
)


In [16]:
# Split the frame: X holds every column except the label, y holds the label.
X = df.drop('class', axis=1)
y = df.loc[:, 'class']


In [17]:
# Numeric columns to standardise.
numerical_cols = [
    'purchase_value', 'time_since_signup', 'transaction_velocity',
    'hour_of_day', 'day_of_week', 'age',
]

# Fit the scaler on the numeric columns, then overwrite them with their
# standardised values.
# NOTE(review): the scaler is fitted on all rows of X; if a train/test split
# happens downstream, the test rows will have influenced the scaling - confirm.
scaler = StandardScaler().fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])


In [18]:
# Copy the feature matrix and append the label as the last column. y.values
# is used so the labels are attached by position rather than by index.
processed_df = X.assign(**{'class': y.values})


In [19]:
# Raw timestamp, identifier and IP columns that must not reach the model or
# the saved dataset.
columns_to_drop = [
    'signup_time',
    'purchase_time',
    'ip_address',
    'device_id',
    'user_id'
]

# processed_df was copied from X before this cell, so dropping from X alone
# never affected the frame that gets written to disk. Apply the same drop to
# both. errors='ignore' makes this a no-op when the columns are already gone.
X = X.drop(columns=columns_to_drop, errors='ignore')
processed_df = processed_df.drop(columns=columns_to_drop, errors='ignore')


In [20]:
# Persist the engineered dataset; index=False keeps the row index out of the file.
processed_df.to_csv(path_or_buf="../data/processed/fraud_data_processed.csv", index=False)


In [21]:
# Only the last expression of a cell is rendered automatically, so the shape
# and the preview are shown explicitly with display().
display(processed_df.shape)
display(processed_df.head())

# Class balance of the target, as proportions.
processed_df['class'].value_counts(normalize=True)


class
0    0.906354
1    0.093646
Name: proportion, dtype: float64