<a href="https://colab.research.google.com/github/fikrifaizz/Real-Time-Fraud-Detection-System/blob/main/notebooks/02_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('../data/processed/cleaned_data.csv')
print(f"Loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")

Loaded: 590,540 rows, 424 columns


In [4]:
print("\nCreating Time Features...")

# Basic time features
df['hour'] = (df['TransactionDT'] % (24*3600)) // 3600
df['day'] = df['TransactionDT'] // (24*3600)
df['day_of_week'] = df['day'] % 7

# Time buckets
df['is_night'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype(int)
df['is_weekend'] = (df['day_of_week'].isin([5, 6])).astype(int)

print(f"Created 5 time features")


Creating Time Features...
Created 5 time features


In [5]:
print("\nCreating Amount Features...")

# Amount statistics
df['amount_log'] = np.log1p(df['TransactionAmt'])
df['amount_decimal'] = df['TransactionAmt'] - df['TransactionAmt'].astype(int)

# Round numbers (fraud often uses round numbers)
df['is_round_amount'] = (df['TransactionAmt'] % 10 == 0).astype(int)

print(f"Created 3 amount features")


Creating Amount Features...
Created 3 amount features


In [6]:
print("\nCreating Card Aggregation Features...")

# Group by card1 (main card identifier)
card_agg = df.groupby('card1').agg({
    'TransactionAmt': ['count', 'mean', 'std', 'min', 'max'],
    'TransactionDT': lambda x: x.max() - x.min()  # card usage duration
}).reset_index()

# Flatten column names
card_agg.columns = ['card1', 'card_txn_count', 'card_txn_mean', 
                    'card_txn_std', 'card_txn_min', 'card_txn_max',
                    'card_usage_duration']

# Merge back
df = df.merge(card_agg, on='card1', how='left')

# Fill NaN in std (happens when count=1)
df['card_txn_std'].fillna(0, inplace=True)

print(f"Created 6 card aggregation features")


Creating Card Aggregation Features...
Created 6 card aggregation features


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['card_txn_std'].fillna(0, inplace=True)


In [7]:
print("\nCreating Email Domain Features...")

# Email domain counts
for email_col in ['P_emaildomain', 'R_emaildomain']:
    if email_col in df.columns:
        email_counts = df[email_col].value_counts().to_dict()
        df[f'{email_col}_count'] = df[email_col].map(email_counts)
        
        # Popular email flag
        top_emails = df[email_col].value_counts().head(10).index.tolist()
        df[f'{email_col}_is_popular'] = df[email_col].isin(top_emails).astype(int)

print(f"Created 4 email features")


Creating Email Domain Features...
Created 4 email features


In [8]:
print("\nCreating Device Features...")

# Device type consistency
if 'DeviceType' in df.columns and 'DeviceInfo' in df.columns:
    # Device usage count
    device_counts = df['DeviceInfo'].value_counts().to_dict()
    df['device_usage_count'] = df['DeviceInfo'].map(device_counts)
    df['device_usage_count'].fillna(0, inplace=True)

print(f"Created device features")


Creating Device Features...
Created device features


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['device_usage_count'].fillna(0, inplace=True)


In [9]:
print("\nCreating Address Match Features...")

# Check if purchaser and recipient addresses match
if 'addr1' in df.columns and 'addr2' in df.columns:
    df['addr_match'] = (df['addr1'] == df['addr2']).astype(int)

print(f"Created address match feature")


Creating Address Match Features...
Created address match feature


In [10]:
print("\nCreating Velocity Features...")

# Sort by time
df = df.sort_values('TransactionDT').reset_index(drop=True)

# Card velocity: time since last transaction
df['time_since_last_txn'] = df.groupby('card1')['TransactionDT'].diff()
df['time_since_last_txn'].fillna(df['time_since_last_txn'].median(), inplace=True)

# Fast transactions flag (< 1 hour)
df['is_fast_transaction'] = (df['time_since_last_txn'] < 3600).astype(int)

print(f"Created 2 velocity features")


Creating Velocity Features...
Created 2 velocity features


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_since_last_txn'].fillna(df['time_since_last_txn'].median(), inplace=True)


In [11]:
print("\nCreating Missing Indicators...")

# Identity missing flag
identity_cols = [col for col in df.columns if col.startswith('id_')]
if identity_cols:
    df['identity_missing_count'] = df[identity_cols].isnull().sum(axis=1)
    df['has_identity_info'] = (df['identity_missing_count'] < len(identity_cols)).astype(int)

print(f"Created missing indicators")


Creating Missing Indicators...
Created missing indicators


In [12]:
print("\nEncoding Categorical Features...")

# Select categorical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = [col for col in cat_cols if col not in ['TransactionID']]

# Label encode
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

print(f"Encoded {len(cat_cols)} categorical features")


Encoding Categorical Features...
Encoded 29 categorical features


In [13]:
print("\nSaving Featured Data...")

df.to_csv('../data/processed/featured_data.csv', index=False)
print(f"Saved: ../data/processed/featured_data.csv")
print(f"   Shape: {df.shape}")
print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")


Saving Featured Data...
Saved: ../data/processed/featured_data.csv
   Shape: (590540, 446)
   Memory: 2009.4 MB


In [14]:
print("SPLITTING DATA (Time-Based)")

df = df.sort_values('TransactionDT').reset_index(drop=True)

# Calculate split indices
n = len(df)
train_end = int(n * 0.6)
val_end = int(n * 0.8)

# Split
train_df = df.iloc[:train_end].copy()
val_df = df.iloc[train_end:val_end].copy()
test_df = df.iloc[val_end:].copy()

print(f"\nSplit Complete:")
print(f"   Train: {len(train_df):,} rows ({len(train_df)/n*100:.1f}%)")
print(f"   Val:   {len(val_df):,} rows ({len(val_df)/n*100:.1f}%)")
print(f"   Test:  {len(test_df):,} rows ({len(test_df)/n*100:.1f}%)")

# Check fraud distribution
print(f"\nFraud Distribution:")
print(f"   Train: {train_df['isFraud'].mean()*100:.2f}%")
print(f"   Val:   {val_df['isFraud'].mean()*100:.2f}%")
print(f"   Test:  {test_df['isFraud'].mean()*100:.2f}%")

# Save splits
train_df.to_csv('../data/processed/train_set.csv', index=False)
val_df.to_csv('../data/processed/val_set.csv', index=False)
test_df.to_csv('../data/processed/test_set.csv', index=False)

print(f"\nSaved all splits to data/processed/")

SPLITTING DATA (Time-Based)

Split Complete:
   Train: 354,324 rows (60.0%)
   Val:   118,108 rows (20.0%)
   Test:  118,108 rows (20.0%)

Fraud Distribution:
   Train: 3.38%
   Val:   3.90%
   Test:  3.44%

Saved all splits to data/processed/
