In [17]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project layout.
# The root defaults to the original workstation path, but can be redirected by
# setting the FRAUD_PROJECT_ROOT environment variable so the notebook can run on
# another machine without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get("FRAUD_PROJECT_ROOT", "/home/python/Downloads/Fraud Detection Project")
)
DATA_DIR = PROJECT_ROOT / "data"      # input CSV lives here
MODELS_DIR = PROJECT_ROOT / "models"  # fitted artefacts are written here
MODELS_DIR.mkdir(exist_ok=True)

# Load the pre-processed transactions; the "Class" column is the label used
# by the cells below.
df = pd.read_csv(DATA_DIR / "fraud_processed.csv")
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [18]:
# Separate the label from the feature columns.
TARGET_COL = "Class"

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Mean of the label column = share of rows labelled 1.
print("Fraud ratio:", y.mean())


Fraud ratio: 0.001727485630620034


In [19]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

(X_train.shape, X_test.shape)


((227845, 30), (56962, 30))

In [20]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [21]:
# Baseline: logistic regression fitted on the standardised features, with
# class weighting enabled to counter the class imbalance.
log_reg = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight="balanced",
).fit(X_train_scaled, y_train)

# Hard labels plus the probability column for the second class (class 1 here).
y_proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]
y_pred_lr = log_reg.predict(X_test_scaled)

print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)

print("ROC-AUC:", roc_auc_score(y_test, y_proba_lr))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

ROC-AUC: 0.9720834996210077


In [22]:
# Class-weighted random forest. It is fitted on the unscaled features
# (X_train), not on X_train_scaled.
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=300,
    n_jobs=-1,  # use every available core
    random_state=42,
).fit(X_train, y_train)

# Hard labels plus the probability column for the second class (class 1 here).
y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

print(
    "Random Forest Classification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.77      0.85        98

    accuracy                           1.00     56962
   macro avg       0.98      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962

ROC-AUC: 0.9567233635857268


In [23]:
# Oversample the training partition with SMOTE. Only X_train / y_train are
# resampled; the test partition is left as it is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_smote.value_counts())

# Random forest on the resampled data; no class_weight is passed here.
rf_smote = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=42,
).fit(X_train_smote, y_train_smote)

# Evaluate on the untouched test partition.
y_proba_rf_smote = rf_smote.predict_proba(X_test)[:, 1]
y_pred_rf_smote = rf_smote.predict(X_test)

print(
    "Random Forest + SMOTE Classification Report:",
    classification_report(y_test, y_pred_rf_smote),
    sep="\n",
)

print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf_smote))


Before SMOTE: Class
0    227451
1       394
Name: count, dtype: int64
After SMOTE: Class
0    227451
1    227451
Name: count, dtype: int64
Random Forest + SMOTE Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.83      0.83        98

    accuracy                           1.00     56962
   macro avg       0.91      0.91      0.91     56962
weighted avg       1.00      1.00      1.00     56962

ROC-AUC: 0.9816676811411114


In [24]:
# Unsupervised anomaly detector fitted on the standardised training features;
# the labels are not used.
iso = IsolationForest(
    contamination=0.01,
    n_estimators=300,
    random_state=42,
).fit(X_train_scaled)

# predict() returns -1 for anomaly and 1 for normal.
iso_anomaly_flag = iso.predict(X_test_scaled)
iso_scores_test = iso.decision_function(X_test_scaled)

# Flip the sign of the decision function to get an anomaly-style score
# (higher = more anomalous).
iso_anomaly_score = -iso_scores_test

print("IsolationForest anomaly flags (value counts):")
print(pd.Series(iso_anomaly_flag).value_counts())


IsolationForest anomaly flags (value counts):
 1    56351
-1      611
Name: count, dtype: int64


In [25]:
# Rule-based score: every rule that fires adds one point, and the total is
# scaled to 0-1 by the number of rules.
#
# Two deliberate choices:
#   * The amount threshold comes from the training partition, so the rule does
#     not change with the batch being scored and still works for a single row.
#   * The total is divided by the rule count rather than by the observed
#     maximum, which would yield NaN whenever no rule fires.
HIGH_AMOUNT_THRESHOLD = X_train["Amount"].median()
EARLY_TIME_CUTOFF = 60 * 60  # NOTE(review): presumably "Time" is in seconds (one hour) - confirm

# Example dummy rules, one boolean Series per rule.
rule_flags = [
    X_test["Amount"] > HIGH_AMOUNT_THRESHOLD,
    X_test["Time"] < EARLY_TIME_CUTOFF,
]

rules_score = np.zeros(len(X_test))
for rule_flag in rule_flags:
    # Add positionally as a plain array so index alignment cannot interfere.
    rules_score += rule_flag.to_numpy(dtype=int)

# Normalize to 0-1.
rules_score = rules_score / len(rule_flags)


In [26]:
# Blend the model probability with the rule score using fixed weights.
ML_WEIGHT = 0.6
RULES_WEIGHT = 0.4

ml_prob = y_proba_rf_smote  # from RF + SMOTE
fraud_score = ML_WEIGHT * ml_prob + RULES_WEIGHT * rules_score

# One row per test transaction, keyed by the test partition's index. The
# isolation-forest score is stored for reference but is not part of
# fraud_score.
fraud_results = pd.DataFrame(
    data={
        "fraud_score": fraud_score,
        "ml_prob": ml_prob,
        "rule_score": rules_score,
        "iso_anomaly_score": iso_anomaly_score,
        "actual_class": y_test.values,
    },
    index=X_test.index,
)

fraud_results.head()


Unnamed: 0,fraud_score,ml_prob,rule_score,iso_anomaly_score,actual_class
263020,0.2,0.0,0.5,-0.175744,0
11378,0.0,0.0,0.0,-0.081432,0
147283,0.222,0.036667,0.5,-0.047577,0
219439,0.0,0.0,0.0,-0.215935,0
36939,0.006,0.01,0.0,-0.11895,0


In [27]:
# Rank transactions from highest to lowest blended score and keep the first 50.
fraud_results_sorted = fraud_results.sort_values(by="fraud_score", ascending=False)
top_suspicious = fraud_results_sorted.iloc[:50]
top_suspicious.iloc[:5]


Unnamed: 0,fraud_score,ml_prob,rule_score,iso_anomaly_score,actual_class
143336,0.8,1.0,0.5,0.087899,1
15476,0.8,1.0,0.5,0.130947,1
42549,0.8,1.0,0.5,0.113904,1
42769,0.8,1.0,0.5,0.099768,1
64411,0.8,1.0,0.5,0.075116,1


In [28]:
# Show the ten highest-scoring rows of the ranked table.
print("Top 10 suspicious transactions:")
top_suspicious.iloc[:10]


Top 10 suspicious transactions:


Unnamed: 0,fraud_score,ml_prob,rule_score,iso_anomaly_score,actual_class
143336,0.8,1.0,0.5,0.087899,1
15476,0.8,1.0,0.5,0.130947,1
42549,0.8,1.0,0.5,0.113904,1
42769,0.8,1.0,0.5,0.099768,1
64411,0.8,1.0,0.5,0.075116,1
15225,0.8,1.0,0.5,0.126458,1
15539,0.8,1.0,0.5,0.131903,1
141260,0.8,1.0,0.5,0.089877,1
15810,0.8,1.0,0.5,0.132575,1
218442,0.8,1.0,0.5,0.04812,1


In [29]:
# Persist the fitted artefacts under MODELS_DIR.
# NOTE(review): rf_smote was fitted on unscaled features, while `scaler` was
# only used for the logistic regression and the isolation forest. Anything
# loading both files should not scale inputs before calling this forest.
artefacts_to_save = (
    (rf_smote, "fraud_rf_model.pkl"),
    (scaler, "fraud_scaler.pkl"),
)
for artefact, filename in artefacts_to_save:
    joblib.dump(artefact, MODELS_DIR / filename)
print("Saved model and scaler.")


Saved model and scaler.


In [30]:
# Export the scored test transactions for the dashboard.
DASHBOARD_DIR = PROJECT_ROOT / "dashboard"
DASHBOARD_DIR.mkdir(exist_ok=True)
dashboard_path = DASHBOARD_DIR / "fraud_dashboard_data.csv"

# Same table as fraud_results, with the label repeated under "is_fraud".
dashboard_df = fraud_results.assign(is_fraud=fraud_results["actual_class"])

# The index is written out too; it serves as the transaction ID.
dashboard_df.to_csv(dashboard_path, index=True)
print("Saved dashboard data to:", dashboard_path)


Saved dashboard data to: /home/python/Downloads/Fraud Detection Project/dashboard/fraud_dashboard_data.csv
