### Case Study 1: Fraud Detection in Financial Transactions
**Goal: Build a pipeline to detect suspicious transactions using tabular features.**

In [28]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [29]:
df = pd.read_csv('transactions.csv')
df

Unnamed: 0,transaction_id,user_id,amount,location,time,device,is_fraud
0,T1001,U001,850.0,LA,2024-05-10 12:34,Galaxy S21,0
1,T1002,U002,,NY,2024-05-11 09:20,iPhone 14,1
2,T1003,U003,0.0,TX,2024-06-01 23:59,Pixel 6,0
3,T1004,U004,1299.5,SF,2024-06-13 03:45,iPhone 13,1
4,T1005,U005,450.75,NY,2024-06-14 18:20,OnePlus 9,0


In [30]:
df.isnull().sum()

transaction_id    0
user_id           0
amount            1
location          0
time              0
device            0
is_fraud          0
dtype: int64

In [31]:
df.shape[0]

5

In [32]:
# Data cleaning
# Remove rows where amount is missing or ≤ 0
df_clean = df[df['amount'].notna() & (df['amount']>0)].copy()
df_clean

Unnamed: 0,transaction_id,user_id,amount,location,time,device,is_fraud
0,T1001,U001,850.0,LA,2024-05-10 12:34,Galaxy S21,0
3,T1004,U004,1299.5,SF,2024-06-13 03:45,iPhone 13,1
4,T1005,U005,450.75,NY,2024-06-14 18:20,OnePlus 9,0


In [33]:
# Feature Engineering
# Create hour, is_night, adn amount_zscore
df_clean['hour'] = pd.to_datetime(df_clean['time']).dt.hour
df_clean['is_night'] = df_clean['hour'].apply(lambda h:1 if h < 6 or h<22 else 0)
df_clean['amount_zscore'] = (df_clean['amount'] - df_clean['amount'].mean()) / df_clean['amount'].std()
df_clean

Unnamed: 0,transaction_id,user_id,amount,location,time,device,is_fraud,hour,is_night,amount_zscore
0,T1001,U001,850.0,LA,2024-05-10 12:34,Galaxy S21,0,12,1,-0.039447
3,T1004,U004,1299.5,SF,2024-06-13 03:45,iPhone 13,1,3,1,1.01914
4,T1005,U005,450.75,NY,2024-06-14 18:20,OnePlus 9,0,18,1,-0.979693


In [None]:
# Group-Based Aggregates
# For each user, compute avg amount, max amount

# user_stats = df_clean.groupby('user_id')['amount'].agg(
#     mean_user='mean',
#     max_user='max'
# ).reset_index()
# df_grop = df_clean.merge(user_stats, on='user_id')

user_stats = df_clean.groupby('user_id')['amount'].agg(['mean','max']).reset_index()
df_grop = df_clean.merge(user_stats, on='user_id', suffixes=('', '_user'))
df_grop

Unnamed: 0,transaction_id,user_id,amount,location,time,device,is_fraud,hour,is_night,amount_zscore,mean,max
0,T1001,U001,850.0,LA,2024-05-10 12:34,Galaxy S21,0,12,1,-0.039447,850.0,850.0
1,T1004,U004,1299.5,SF,2024-06-13 03:45,iPhone 13,1,3,1,1.01914,1299.5,1299.5
2,T1005,U005,450.75,NY,2024-06-14 18:20,OnePlus 9,0,18,1,-0.979693,450.75,450.75


In [None]:
# Device Type One-Hot Encoding
df_clean = pd.get_dummies(df_clean, columns=['device'], drop_first=True)
df_clean

Unnamed: 0,transaction_id,user_id,amount,location,time,is_fraud,hour,is_night,amount_zscore,device_OnePlus 9,device_iPhone 13
0,T1001,U001,850.0,LA,2024-05-10 12:34,0,12,1,-0.039447,False,False
3,T1004,U004,1299.5,SF,2024-06-13 03:45,1,3,1,1.01914,False,True
4,T1005,U005,450.75,NY,2024-06-14 18:20,0,18,1,-0.979693,True,False


In [38]:
# Build Feature Matrix (X) and Labels (y)
feature_cols = ['amount', 'hour', 'is_night', 'amount_zscore', 'mean', 'max']
X = df_grop[feature_cols].values
y = df_grop['is_fraud'].values

In [None]:
from scipy.spatial.distance import cdist

query = X[-1:]       # Last transaction
others = X[:-1]      # All previous
dists = cdist(query, others, metric='euclidean')
dists


array([[ 691.54795319, 1470.15600629]])