In [1]:
import pandas as pd

# Read the log CSV and keep only its first 20,000 rows for the rest of the notebook
df = pd.read_csv('dataset/banking_finance_logs.csv').head(20000)

In [2]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Sentence-embedding model used to turn each log message into a vector
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every message; normalize_embeddings=True requests normalised vectors
messages = df['message'].to_list()
embeddings = model.encode(messages, normalize_embeddings=True)

# Cluster the embeddings with DBSCAN using the cosine metric
dbscan = DBSCAN(metric='cosine', eps=0.3, min_samples=5)
cluster_labels = dbscan.fit_predict(embeddings)
df['cluster'] = cluster_labels

# Rows labelled -1 were not assigned to any cluster; flag them as outliers
df['is_outlier'] = df['cluster'].eq(-1)

# Display the per-message clustering result
df[['message', 'cluster', 'is_outlier']]

Unnamed: 0,message,cluster,is_outlier
0,Transaction T7416135 completed and recorded.,0,False
1,User User627035 logged in successfully.,1,False
2,Redirection successful for secure login.,2,False
3,Security token missing in authentication process.,3,False
4,Payment Session P4833715 initiated for authent...,4,False
...,...,...,...
19995,User User627035 logged in successfully.,1,False
19996,Unauthorized access attempt detected from unkn...,13,False
19997,Critical error in credit score verification mo...,49,False
19998,Invalid request structure detected.,25,False


In [3]:
import pickle

from pathlib import Path

# All three artifacts below are written into ../models; create the directory
# first so the cell does not fail with FileNotFoundError when it is missing.
Path("../models").mkdir(parents=True, exist_ok=True)

# Save DBSCAN model
with open("../models/dbscan_model.pkl", "wb") as f:
    pickle.dump(dbscan, f)

# Save clustered log messages (index=False: the row index is not written out)
df.to_csv("../models/log_clusters.csv", index=False)

# Save embeddings
np.save("../models/log_embeddings.npy", embeddings)


In [5]:
import re


# (compiled pattern, severity) rules, built once instead of on every call.
# The order matches the original table; the first rule that matches wins.
_REGEX_SEVERITY_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), severity)
    for pattern, severity in (
        (r"User User\d+ logged (in|out) successfully\.", "Low"),
        (r"Payment Session P\d{7,} Closed Successfully\.", "Low"),
        (r"Transaction T\d{7,} completed without errors\.", "Low"),
        (r"Payment Session P\d{7,} initiated for authenticated user\.", "Low"),
        (r"New account created successfully\.", "Low"),
        (r"Transaction T\d{7,} confirmation received\.", "Low"),
        (r"Transaction T\d{7,} completed and recorded\.", "Low"),
        (r"User User\d+ registration successful\.", "Low"),
        (r"Profile User\d+ updated successfully\.", "Low"),
        (r"Resource moved permanently to a new URL\.", "Low"),
        (r"Redirection successful for secure login\.", "Low"),
        (r"Page redirection to updated banking portal\.", "Low"),
        (r"Redirection to new payment endpoint successful\.", "Low"),
    )
)


def classify_cluster_message(message):
    """Return the regex-derived severity label for a log message.

    The message is searched (case-insensitively, anywhere in the text) against
    a fixed list of known message templates.

    Args:
        message: The log message text. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for missing cells) are treated as unmatched.

    Returns:
        The severity string of the first matching template (currently always
        ``"Low"``), or ``None`` when nothing matches.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # DataFrame column; re.search would raise TypeError on them.
    if not isinstance(message, str):
        return None

    for rule, severity in _REGEX_SEVERITY_RULES:
        if rule.search(message):
            return severity
    return None

In [6]:
# Run the regex classifier over every message; unmatched messages get None
regex_labels = df["message"].apply(classify_cluster_message)
df["regex_label"] = regex_labels
df.head()

Unnamed: 0,timestamp,error_code,message,log_type,severity,source_ip,destination_ip,process,encoded_message,cluster,regex_label
0,2025-01-01 00:01:50,201,Transaction T7416135 completed and recorded.,Http Success,Low,192.168.1.10,192.168.1.30,Database Server,Transaction T7416135 completed and recorded. D...,0,Low
1,2025-01-01 00:02:21,200,User User627035 logged in successfully.,Http Success,Low,53.26.85.240,192.168.1.10,Web Server,User User627035 logged in successfully. Web Se...,1,Low
2,2025-01-01 00:05:31,301,Redirection successful for secure login.,HTTP Error,Low,27.78.247.46,192.168.1.10,Web Server,Redirection successful for secure login. Web S...,2,Low
3,2025-01-01 00:06:15,401,Security token missing in authentication process.,HTTP Error,Medium,151.130.32.102,192.168.1.10,Web Server,Security token missing in authentication proce...,3,
4,2025-01-01 00:07:05,200,Payment Session P4833715 initiated for authent...,Http Success,Low,192.168.1.40,192.168.1.20,Payment Gateway,Payment Session P4833715 initiated for authent...,4,Low


In [7]:
# Keep only the rows that no regex rule labelled
unlabelled_mask = df["regex_label"].isna()
df_non_regex = df[unlabelled_mask]
df_non_regex.shape

(12908, 11)

In [8]:
# Count the regex-unlabelled rows per log_type.
# NOTE(review): the variable name and printed label say "process", but the
# column being counted is log_type — confirm which one was intended.
process_counts = df_non_regex["log_type"].value_counts()

print("Process Counts:\n", process_counts, "\n")

Process Counts:
 log_type
HTTP Error                  8398
System Failure              2761
Security Breach             1120
Maintenance                  162
Access Control Violation      13
Email Failure                 12
Name: count, dtype: int64 



In [9]:
# Log types left out of the embedding-based severity classifier
# (presumably because they are the rarest categories — TODO confirm).
excluded_log_types = ['Maintenance', 'Access Control Violation', 'Email Failure']
keep_mask = ~df_non_regex['log_type'].isin(excluded_log_types)
df_bert_classification = df_non_regex[keep_mask]

In [13]:
# Embed the messages selected for the learned severity classifier
bert_messages = df_bert_classification['message'].to_list()
embeddings_bert = model.encode(bert_messages)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the sentence embeddings; the target is the severity column
X = embeddings_bert
Y = df_bert_classification['severity']

# Hold out 40% of the rows for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

# Fit a class-weight-balanced logistic regression on the training portion
severity_logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)
severity_logistic_model.fit(X_train, y_train)

# Score the held-out rows and print the per-class metrics
y_pred = severity_logistic_model.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

    Critical       0.89      0.95      0.92      1701
        High       0.92      0.85      0.88      1805
      Medium       0.96      0.97      0.96      1583

    accuracy                           0.92      5089
   macro avg       0.92      0.92      0.92      5089
weighted avg       0.92      0.92      0.92      5089



In [18]:
import joblib

# Persist the trained severity classifier to disk
severity_model_path = '../models/severity_logistic_model.joblib'
joblib.dump(severity_logistic_model, severity_model_path)

['../models/severity_logistic_model.joblib']