# Feature Engineering

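This notebook applies and sanity-checks the feature-engineering pipeline for the fraud detection models: it loads and cleans the raw data, adds time and velocity features, encodes and scales them, and balances the classes with SMOTE.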

In [2]:
import sys
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

sys.path.append('../src')
from preprocessing import load_data, clean_data, map_ip_to_country
from feature_engineering import create_time_features, create_velocity_features
from imbalance_handler import handle_imbalance

## 1. Load and Preprocess Raw Data

In [3]:
# Read the two raw CSVs: the fraud transactions and the IP-address-to-country table.
fraud_df, ip_df = load_data(
    '../data/raw/Fraud_Data.csv',
    '../data/raw/IpAddress_to_Country.csv',
)

# Clean the transactions, then map each row's IP address to a country
# (the preview below shows the resulting `country` column).
df = map_ip_to_country(clean_data(fraud_df), ip_df)
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093.496895,0,Unknown
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447.138961,0,Unknown
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818.501505,0,Unknown
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566.664867,0,Unknown
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423.789042,0,Unknown


## 2. Apply Feature Engineering

In [4]:
# Time features are created first; velocity features are then computed on that result.
# Together the two helpers add hour_of_day, day_of_week, time_since_signup,
# device_usage_count and ip_usage_count (see the preview below).
df = create_velocity_features(create_time_features(df))
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,hour_of_day,day_of_week,time_since_signup,device_usage_count,ip_usage_count
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093.496895,0,Unknown,10,6,1763014.0,1,1
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447.138961,0,Unknown,17,4,1084823.0,1,1
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818.501505,0,Unknown,8,1,749320.0,1,1
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566.664867,0,Unknown,21,3,7434634.0,1,1
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423.789042,0,Unknown,7,6,1407619.0,1,1


## 3. Scale and Encode

In [5]:
# One-hot encode the small categorical columns (first level dropped as the baseline).
# `df` is overwritten with the encoded frame, so on a re-run of this cell the
# original columns no longer exist; encoding only the ones still present keeps
# the cell re-runnable instead of raising a KeyError.
cat_cols = ['source', 'browser', 'sex']
cols_to_encode = [col for col in cat_cols if col in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)

# Standardize the numerical columns (zero mean, unit variance). Re-running this on
# already standardized columns changes nothing beyond floating-point error.
# NOTE(review): the scaler is fit on the full dataset. If a train/test split happens
# downstream, fit it on the training portion only to avoid leaking test statistics.
# NOTE(review): get_dummies emits boolean columns (see preview below) - confirm that
# handle_imbalance treats them as features rather than dropping them as non-numeric.
scaler = StandardScaler()
num_cols = ['purchase_value', 'age', 'time_since_signup', 'device_usage_count', 'ip_usage_count']
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,country,hour_of_day,...,time_since_signup,device_usage_count,ip_usage_count,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,0.494721,ZCLZTAJPCRAQX,0.331793,52093.496895,0,Unknown,10,...,-1.013679,-0.261514,-0.232151,True,False,False,False,False,True,True
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,-0.214781,YFGYOALADBHLT,-0.364448,93447.138961,0,Unknown,17,...,-1.230613,-0.261514,-0.232151,False,False,False,True,False,False,False
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,-0.214781,QZNVQTUITFTHH,-0.132367,105818.501505,0,Unknown,8,...,-1.337931,-0.261514,-0.232151,True,False,True,False,False,False,False
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,-0.214781,PIBUQMBIELMMG,0.795954,117566.664867,0,Unknown,21,...,0.800513,-0.261514,-0.232151,False,False,False,True,False,False,True
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,0.985915,WFIIFCPIOGMHT,0.563874,131423.789042,0,Unknown,7,...,-1.127359,-0.261514,-0.232151,False,False,False,False,False,True,True


## 4. Handle Class Imbalance

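We use SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset: synthetic fraud samples are generated until the minority class (14,151 rows) matches the majority class (136,961 rows). The resampled data is intended for model training only; evaluation should always use original, un-resampled records.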

In [6]:
print("Class distribution before SMOTE:")
print(df['class'].value_counts())

# Balance the classes with the project's SMOTE helper
# (per the original note, only numeric data is used in this example).
# NOTE(review): oversampling happens here before any train/test split. If the saved
# file is split later, synthetic rows derived from test records can leak into
# training - confirm the downstream notebooks evaluate on un-resampled data.
balanced_df = handle_imbalance(df, 'class')

print("\nClass distribution after SMOTE:")
print(balanced_df['class'].value_counts())

# DataFrame.to_csv does not create missing parent directories, so make sure the
# processed folder exists; otherwise a fresh checkout fails only after the
# expensive resampling step has already run.
output_path = Path('../data/processed/balanced_fraud_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
balanced_df.to_csv(output_path, index=False)
print('\nBalanced data saved to processed folder.')

Class distribution before SMOTE:
class
0    136961
1     14151
Name: count, dtype: int64

Class distribution after SMOTE:
class
0    136961
1    136961
Name: count, dtype: int64

Balanced data saved to processed folder.
