In [1]:
# 📦 Import libraries
!pip install hmmlearn
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from hmmlearn import hmm
import warnings

warnings.filterwarnings("ignore")

# 📍 Define Bengaluru localities
localities = {
    'Indiranagar': {'center': (12.9716, 77.6412), 'population': 50000, 'atm_count': 30},
    'Whitefield': {'center': (12.9698, 77.7499), 'population': 70000, 'atm_count': 25},
    'Koramangala': {'center': (12.9352, 77.6245), 'population': 45000, 'atm_count': 35},
    'Jayanagar': {'center': (12.9250, 77.5938), 'population': 40000, 'atm_count': 40},
    'BTM Layout': {'center': (12.9166, 77.6101), 'population': 42000, 'atm_count': 30},
    'Rajajinagar': {'center': (12.9915, 77.5560), 'population': 48000, 'atm_count': 28},
    'Malleshwaram': {'center': (13.0092, 77.5695), 'population': 46000, 'atm_count': 32},
    'Marathahalli': {'center': (12.9560, 77.7019), 'population': 55000, 'atm_count': 20},
    'Electronic City': {'center': (12.8382, 77.6756), 'population': 60000, 'atm_count': 18},
    'Hebbal': {'center': (13.0358, 77.5970), 'population': 48000, 'atm_count': 26}
}

# 🧠 Calculate fraud rates dynamically
for data in localities.values():
    pop, atm = data['population'], data['atm_count']
    data['fraud_rate'] = min(0.15, 0.01 + (pop / 100000) * (1 - atm / 50))

# 🎲 Generate random location around a center
def random_location(center, radius_km=2):
    radius_deg = radius_km / 111
    lat_offset = np.random.uniform(-radius_deg, radius_deg)
    lon_offset = np.random.uniform(-radius_deg, radius_deg)
    return center[0] + lat_offset, center[1] + lon_offset

# 🧹 Generate a single transaction
def generate_transaction(user_id):
    locality = random.choice(list(localities.keys()))
    center, fraud_prob = localities[locality]['center'], localities[locality]['fraud_rate']
    is_fraud = np.random.rand() < fraud_prob
    amount = np.random.uniform(100001, 500000) if is_fraud else np.random.uniform(0, 100000)
    txn_time = datetime.now() - timedelta(minutes=np.random.randint(0, 60*24*30))
    lat, lon = random_location(center)
    return [user_id, round(amount, 2), lat, lon, txn_time.strftime('%Y-%m-%d %H:%M:%S'), locality, int(is_fraud)]

# 📈 Generate full dataset
def generate_dataset(total_transactions=1000):
    user_ids = [f'user_{i:04d}' for i in range(1, total_transactions // 10 + 2)]
    data = [generate_transaction(random.choice(user_ids)) for _ in range(total_transactions)]
    return pd.DataFrame(data, columns=['user_id', 'amount', 'latitude', 'longitude', 'transaction_time', 'locality', 'is_fraud'])

# 🚀 Create and save dataset
np.random.seed(42)
random.seed(42)
dataset = generate_dataset(1000)
dataset.to_csv('bengaluru_upi_transactions.csv', index=False)
print("✅ Dataset generated and saved as 'bengaluru_upi_transactions.csv'")
# 🖥️ Enable dataset download for your environment
from google.colab import files
files.download('bengaluru_upi_transactions.csv')

# 📅 Load dataset
df = pd.read_csv('bengaluru_upi_transactions.csv')

# 📏 Encode locality
df['locality'] = df['locality'].astype('category').cat.codes

# ⏰ Extract time features
df['transaction_time'] = pd.to_datetime(df['transaction_time'])
df['hour'] = df['transaction_time'].dt.hour
df['dayofweek'] = df['transaction_time'].dt.dayofweek

# 🔍 Features and labels
X = df[['amount', 'latitude', 'longitude', 'locality', 'hour', 'dayofweek']]
y = df['is_fraud']

# ✂️ Split for Random Forest
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.3, random_state=42)

# 🌲 Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_rf, y_train_rf)

# 🧪 Evaluate Random Forest
rf_preds = rf.predict(X_test_rf)
print("\n🌲 Random Forest Classification Report:")
print(classification_report(y_test_rf, rf_preds))

# ✂️ Split for HMM
X_seq = X.to_numpy()
y_seq = y.to_numpy()
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(X_seq, y_seq, test_size=0.3, random_state=42)

# 🤖 Train HMM
hmm_model = hmm.GaussianHMM(n_components=2, covariance_type='diag', n_iter=100)
hmm_model.fit(X_seq_train)

# 🧹 Sliding window for HMM detection
window_size = 10
scores, labels = [], []

for i in range(0, len(X_seq_test) - window_size + 1):
    window = X_seq_test[i:i+window_size]
    if len(window) == window_size:
        score = hmm_model.score(window)
        scores.append(score)
        fraud_ratio = y_seq_test[i:i+window_size].mean()
        labels.append(1 if fraud_ratio > 0.2 else 0)

# 🎚 Thresholding HMM scores
threshold = np.percentile(scores, 20)
hmm_preds = [1 if score < threshold else 0 for score in scores]

# 🧪 Evaluate HMM
print("\n🤖 HMM Fraud Detection Classification Report:")
print(classification_report(labels, hmm_preds))

# 🔄 Hybrid Model
rf_final_preds = rf.predict(X)
hmm_scores, final_labels = [], []

for i in range(0, len(X) - window_size + 1):
    window_rf_preds = rf_final_preds[i:i+window_size]
    if len(window_rf_preds) == window_size:
        score = hmm_model.score(X.iloc[i:i+window_size].to_numpy())
        hmm_scores.append(score)
        fraud_ratio = np.mean(window_rf_preds)
        final_labels.append(1 if fraud_ratio > 0.2 else 0)

# 🎚 Thresholding Hybrid scores
hybrid_threshold = np.percentile(hmm_scores, 20)
hybrid_preds = [1 if score < hybrid_threshold else 0 for score in hmm_scores]

# 🧪 Evaluate Hybrid Model
print("\n🔄 Hybrid Model Classification Report:")
print(classification_report(final_labels, hybrid_preds))


Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.9/165.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3
✅ Dataset generated and saved as 'bengaluru_upi_transactions.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🌲 Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       258
           1       1.00      1.00      1.00        42

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


🤖 HMM Fraud Detection Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       250
           1       0.47      0.66      0.55        41

    accuracy                           0.85       291
   macro avg       0.70      0.77      0.73       291
weighted avg       0.87      0.85      0.86       291


🔄 Hybrid Model Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89       804
           1       0.52      0.55      0.54       187

    accuracy                           0.82       991
   