# Fraud Detection with Machine Learning
This notebook demonstrates how to build a fraud detection system using a Random Forest Classifier. It includes data loading, feature engineering, model training, and evaluation steps.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import os
import numpy as np
from collections import deque
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## 2. Load and Combine Dataset

In [None]:
# Folder that holds the daily transaction pickles.
folder_path = "dataset"

# NOTE(review): pd.read_pickle unpickles arbitrary objects, so only point
# this at files that come from a trusted source.
# Collect the .pkl file names in sorted order, then read one DataFrame each.
pkl_names = [
    name for name in sorted(os.listdir(folder_path)) if name.endswith(".pkl")
]
all_data = [
    pd.read_pickle(os.path.join(folder_path, name)) for name in pkl_names
]

# Stack the daily frames into a single table with a fresh 0..n-1 index.
df = pd.concat(all_data, ignore_index=True)
print(df.shape)
df.head()

## 3. Feature Engineering

In [None]:
# Parse TX_DATETIME and derive simple calendar features from it.
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])
df['TX_DAY'] = df['TX_DATETIME'].dt.day          # day of the month
df['TX_HOUR'] = df['TX_DATETIME'].dt.hour
df['TX_WEEKDAY'] = df['TX_DATETIME'].dt.weekday  # Monday=0 ... Sunday=6

# Order rows chronologically within each customer so the rolling window below
# covers that customer's most recent transactions (current one included).
df = df.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])
df['CUSTOMER_AVG_AMOUNT_5'] = (
    df.groupby('CUSTOMER_ID')['TX_AMOUNT']
    .rolling(window=5, min_periods=1)  # mean over up to the last 5 amounts
    .mean()
    # Drop the CUSTOMER_ID index level so the result aligns back onto df's
    # own row index when it is assigned as a column.
    .reset_index(level=0, drop=True)
)

# UNIX timestamp in whole seconds.
# Subtracting the epoch and floor-dividing by one second gives the right
# answer whatever resolution the datetime column is stored in (ns, us, ms, s);
# casting to int64 and dividing by 10**9 is only correct for nanoseconds.
# Building the epoch with the column's own tz keeps this valid for both naive
# and tz-aware data.
epoch = pd.Timestamp(0, tz=df['TX_DATETIME'].dt.tz)
df['TX_UNIX'] = (df['TX_DATETIME'] - epoch) // pd.Timedelta(seconds=1)

### Customer Feature: Transactions in Last 24 Hours

In [None]:
# Number of transactions each customer made in the trailing 24 hours
# (the current transaction counts as one of them).
#
# Sort inside this cell: the sliding window needs each customer's rows in time
# order, and the resulting list is written back to df by position, so df must
# be ordered exactly the way groupby will walk it. Without this, running the
# cell after df has been re-sorted elsewhere would silently misalign values.
df = df.sort_values(['CUSTOMER_ID', 'TX_DATETIME'])

customer_tx_count_1d = []
time_window = 86400  # 1 day in seconds

for _, group in df.groupby('CUSTOMER_ID'):
    q = deque()  # timestamps of this customer's transactions still in the window
    for t in group['TX_UNIX']:
        # Evict timestamps that are more than one day older than the current one.
        while q and t - q[0] > time_window:
            q.popleft()
        q.append(t)
        customer_tx_count_1d.append(len(q))

df['CUSTOMER_TX_COUNT_1D'] = customer_tx_count_1d

### Terminal Feature: Fraud Count in Last 28 Days

In [None]:
# Number of fraudulent transactions seen at each terminal in the trailing
# 28 days, counting only transactions that came BEFORE the current row.
#
# The current row's own TX_FRAUD label is deliberately excluded: including it
# would put the prediction target inside a feature (label leakage), making the
# model look far better in evaluation than it can be on unseen data.
df = df.sort_values(['TERMINAL_ID', 'TX_DATETIME'])
terminal_fraud_count_28d = []
time_window = 28 * 86400  # 28 days in seconds

for _, group in df.groupby('TERMINAL_ID'):
    q = deque()     # (timestamp, fraud label) pairs still inside the window
    fraud_sum = 0   # running total of fraud labels currently held in q
    for t, is_fraud in zip(group['TX_UNIX'], group['TX_FRAUD']):
        # Evict entries older than the window, keeping the running total in
        # step instead of re-summing the whole window for every row.
        while q and t - q[0][0] > time_window:
            fraud_sum -= q.popleft()[1]
        # Record the count before adding the current transaction.
        terminal_fraud_count_28d.append(fraud_sum)
        q.append((t, is_fraud))
        fraud_sum += is_fraud

# Positional assignment is safe here because df was sorted by TERMINAL_ID
# above, which is the order groupby iterates the groups in.
df['TERMINAL_FRAUD_COUNT_28D'] = terminal_fraud_count_28d

## 4. Model Training

In [None]:
# Columns fed to the classifier: raw amount, calendar features, and the
# customer/terminal aggregates built in the feature-engineering cells.
features = [
    'TX_AMOUNT',
    'TX_DAY',
    'TX_HOUR',
    'TX_WEEKDAY',
    'CUSTOMER_AVG_AMOUNT_5',
    'CUSTOMER_TX_COUNT_1D',
    'TERMINAL_FRAUD_COUNT_28D',
]

X, y = df[features], df['TX_FRAUD']

# Hold out 20% of the rows, keeping the fraud ratio equal in both parts.
# NOTE(review): this is a random split; since several features are derived
# from transaction history, a time-based split may be a fairer evaluation.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Random forest with 100 trees and a fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

## 5. Model Evaluation

In [None]:
# Predict on the held-out rows and summarize how the predictions compare
# with the true labels.
y_pred = model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

# Overall accuracy, shown as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = 100 * accuracy
print(f"Accuracy: {accuracy_percentage:.2f}%")