In [1]:
import os
import shutil

import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
import joblib

In [2]:
# Read the preprocessed transactions CSV (the path is relative to the kernel's
# working directory) and print a plain-text preview of the first five rows.
processed_csv = '../data/processed/Fraudulent_processed.csv'
df = pd.read_csv(processed_csv)
print(df.head())

   time_step transaction_type    amount    sender_id  sender_old_balance  \
0          1          PAYMENT   9839.64  C1231006815            170136.0   
1          1          PAYMENT   1864.28  C1666544295             21249.0   
2          1         TRANSFER    181.00  C1305486145               181.0   
3          1         CASH_OUT    181.00   C840083671               181.0   
4          1          PAYMENT  11668.14  C2048537720             41554.0   

   sender_new_balance  receiver_id  receiver_old_balance  \
0           160296.36  M1979787155                   0.0   
1            19384.72  M2044282225                   0.0   
2                0.00   C553264065                   0.0   
3                0.00    C38997010               21182.0   
4            29885.86  M1230701703                   0.0   

   receiver_new_balance  is_fraud  is_flagged_fraud  is_high_value  \
0                   0.0         0                 0              0   
1                   0.0         0         

In [3]:
# Remove the account identifier columns from the working frame.
# errors="ignore" keeps this cell idempotent: `df` is rebound in place, so a
# second run would otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=["sender_id", "receiver_id"], errors="ignore")

In [4]:
# Show the first five rows of df as the cell's displayed value.
df.head(n=5)

Unnamed: 0,time_step,transaction_type,amount,sender_old_balance,sender_new_balance,receiver_old_balance,receiver_new_balance,is_fraud,is_flagged_fraud,is_high_value,sender_balance_diff,receiver_balance_diff,is_receiver_merchant
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,9839.64,0.0,1
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1864.28,0.0,1
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,0,181.0,0.0,0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,0,181.0,-21182.0,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,1,11668.14,0.0,1


In [5]:
# One-hot encode the categorical transaction type. drop_first=True omits the
# indicator for the first category, so that category is represented by all of
# the remaining indicator columns being False.
# NOTE(review): `df` is rebound here, so re-running this cell on an already
# encoded frame fails because "transaction_type" no longer exists.
categorical_cols = ["transaction_type"]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df.head(n=5)

Unnamed: 0,time_step,amount,sender_old_balance,sender_new_balance,receiver_old_balance,receiver_new_balance,is_fraud,is_flagged_fraud,is_high_value,sender_balance_diff,receiver_balance_diff,is_receiver_merchant,transaction_type_CASH_OUT,transaction_type_DEBIT,transaction_type_PAYMENT,transaction_type_TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,9839.64,0.0,1,False,False,True,False
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1864.28,0.0,1,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,181.0,0.0,0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,181.0,-21182.0,0,True,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,1,11668.14,0.0,1,False,False,True,False


In [6]:
# Separate the label column from the predictors: y is the "is_fraud" column,
# X is every other column of df.
target_col = "is_fraud"
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Hyperparameters for the random forest, collected in one place so they are
# easy to read and tweak before the estimator is constructed.
rf_params = dict(
    n_estimators=500,                     # number of trees in the forest
    max_depth=None,                       # no explicit depth limit
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight="balanced_subsample",    # class weights recomputed per bootstrap sample
    random_state=42,                      # fixed seed for repeatable training
    n_jobs=-1,                            # use all available cores
)
rf = RandomForestClassifier(**rf_params)


In [9]:
# Train the forest on the training split; fit() returns the estimator, which
# is also this cell's displayed value.
rf.fit(X=X_train, y=y_train)

In [10]:
# predict_proba returns one probability column per class; keep column index 1
# as the score used for thresholding.
# presumably column 1 corresponds to is_fraud == 1 (labels are 0/1) — verify via rf.classes_
test_probabilities = rf.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [11]:
# Convert scores into 0/1 predictions with a custom cut-off instead of the
# default 0.5, then print the per-class report for the test split.
# NOTE(review): this cut-off is applied to (and compared on) the test split;
# consider choosing it on a separate validation split.
fraud_threshold = 0.10
above_threshold = y_pred_proba > fraud_threshold
y_pred_tuned = above_threshold.astype(int)
print(classification_report(y_test, y_pred_tuned))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1116028
           1       0.57      0.97      0.72      1639

    accuracy                           1.00   1117667
   macro avg       0.79      0.99      0.86   1117667
weighted avg       1.00      1.00      1.00   1117667



In [12]:
# Sweep a range of probability cut-offs and print precision / recall / F1 for
# each one, to show how the trade-off moves as the threshold rises.
thresholds = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35]
for t in thresholds:
    y_pred_t = (y_pred_proba > t).astype(int)
    # zero_division=0 makes a metric with an empty denominator score 0
    # instead of emitting a warning.
    precision_t = precision_score(y_test, y_pred_t, zero_division=0)
    recall_t = recall_score(y_test, y_pred_t, zero_division=0)
    f1_t = f1_score(y_test, y_pred_t, zero_division=0)
    print(f"Threshold {t}: Precision={precision_t:.3f}, Recall={recall_t:.3f}, F1={f1_t:.3f}")


Threshold 0.05: Precision=0.348, Recall=0.986, F1=0.514
Threshold 0.1: Precision=0.574, Recall=0.973, F1=0.722
Threshold 0.15: Precision=0.673, Recall=0.962, F1=0.792
Threshold 0.2: Precision=0.746, Recall=0.943, F1=0.833
Threshold 0.25: Precision=0.807, Recall=0.933, F1=0.866
Threshold 0.3: Precision=0.853, Recall=0.919, F1=0.885
Threshold 0.35: Precision=0.888, Recall=0.896, F1=0.892


In [13]:
# Score the thresholded test-set predictions (y_pred_tuned) against y_test and
# print the headline metrics, the confusion matrix and the per-class report.
scoring_args = dict(y_true=y_test, y_pred=y_pred_tuned, zero_division=0)

acc = accuracy_score(y_test, y_pred_tuned)
prec = precision_score(**scoring_args)
rec = recall_score(**scoring_args)
f1 = f1_score(**scoring_args)
cm = confusion_matrix(y_test, y_pred_tuned)

print("\n==================== RANDOM FOREST ====================\n")
# Labels carry their own padding so the values line up in one column.
headline_metrics = [
    ("Accuracy:  ", acc),
    ("Precision: ", prec),
    ("Recall:    ", rec),
    ("F1 Score:  ", f1),
]
for label, value in headline_metrics:
    print(f"{label}{value:.4f}")

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(**scoring_args))




Accuracy:  0.9989
Precision: 0.5737
Recall:    0.9732
F1 Score:  0.7219

Confusion Matrix:
[[1114843    1185]
 [     44    1595]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1116028
           1       0.57      0.97      0.72      1639

    accuracy                           1.00   1117667
   macro avg       0.79      0.99      0.86   1117667
weighted avg       1.00      1.00      1.00   1117667



In [15]:
# One-off cleanup: remove a "src/models" folder that was created relative to the
# notebook's working directory (i.e. notebooks/src/models) by mistake.
# `os` and `shutil` come from the notebook's top import cell.
wrong_path = "src/models"   # resolved against the kernel's current working directory

# Safety guard: the relative path is only "wrong" when the kernel runs from the
# notebooks folder. Started from the project root, the very same relative path
# is the real model directory, and deleting it would destroy saved models.
running_from_notebooks = os.path.basename(os.getcwd()) == "notebooks"

if not running_from_notebooks:
    print("Skipped cleanup: working directory is not the notebooks folder:", os.getcwd())
elif os.path.exists(wrong_path):
    shutil.rmtree(wrong_path)
    print("Removed wrong folder:", wrong_path)
else:
    print("Wrong folder not found — already removed.")

Removed wrong folder: src/models


In [None]:
# Treat the parent of the current working directory as the project root
# (this assumes the kernel runs from the notebooks folder) and make sure the
# model output folder exists beneath it.
project_root = os.path.abspath(os.pardir)
correct_model_path = os.path.join(project_root, "src/models")

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(correct_model_path, exist_ok=True)
print("Correct model folder created at:", correct_model_path)


In [None]:
# Persist the trained forest together with the ordered list of training
# columns, so a consumer can rebuild inputs with the same column layout.
# NOTE(review): the 0.10 decision threshold used during evaluation is not
# saved here; whatever loads the model has to apply it separately.
model_file = os.path.join(correct_model_path, "fraud_model.pkl")
features_file = os.path.join(correct_model_path, "model_features.pkl")

joblib.dump(rf, model_file)
joblib.dump(X_train.columns.tolist(), features_file)

print("Model saved correctly in:", correct_model_path)
