In [None]:
import pandas as pd

# Load the log dataset from CSV; the path is relative to the kernel's working directory
df = pd.read_csv('dataset/synthetic_logs.csv')
# Show the loaded frame as the cell output
df

In [None]:
# List the distinct values found in the `source` column
df.source.unique()

In [None]:
# List the distinct values found in the `target_label` column
df.target_label.unique()

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

# Sentence-embedding model used to turn log text into vectors
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every entry of the `log_message` column, in row order
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Peek at the first two embedding vectors
embeddings[ :2]

array([[-1.02939695e-01,  3.35459337e-02, -2.20260713e-02,
         1.55107002e-03, -9.86915641e-03, -1.78956270e-01,
        -6.34410605e-02, -6.01761900e-02,  2.81108692e-02,
         5.99619113e-02, -1.72618758e-02,  1.43372442e-03,
        -1.49560004e-01,  3.15285148e-03, -5.66031002e-02,
         2.71685962e-02, -1.49890035e-02, -3.54038216e-02,
        -3.62936519e-02, -1.45410355e-02, -5.61494799e-03,
         8.75538141e-02,  4.55121025e-02,  2.50964351e-02,
         1.00187343e-02,  1.24266837e-02, -1.39923573e-01,
         7.68695995e-02,  3.14095579e-02, -4.15252894e-03,
         4.36902903e-02,  1.71250086e-02, -8.00951421e-02,
         5.74006177e-02,  1.89091787e-02,  8.55261683e-02,
         3.96399014e-02, -1.34371802e-01, -1.44370436e-03,
         3.06704687e-03,  1.76854089e-01,  4.44881897e-03,
        -1.69274863e-02,  2.24266667e-02, -4.35049683e-02,
         6.09029876e-03, -9.98167135e-03, -6.23972639e-02,
         1.07371928e-02, -6.04898669e-03, -7.14660510e-0

In [29]:
# Cluster the message embeddings with DBSCAN, using cosine distance and a
# neighbourhood radius (eps) of 0.2.
# NOTE(review): with min_samples=1 every point should qualify as a core point
# under scikit-learn's DBSCAN, so nothing is labelled as noise (-1) and
# one-off messages end up in singleton clusters — confirm this is intended.
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Store each row's cluster id in a new `cluster` column (this mutates `df`)
df['cluster'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [30]:
# Inspect the rows that were assigned to cluster 1
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1
361,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,bert,1
450,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,bert,1
477,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,bert,1
570,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,bert,1
678,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,bert,1


Let's analyze clusters by size and display messages from large clusters (>10 records)

In [31]:
# Count rows per cluster id, ordered from the largest cluster to the smallest
cluster_sizes = df['cluster'].value_counts().sort_values(ascending=False)
print("Cluster sizes:", cluster_sizes, sep="\n")

Cluster sizes:
cluster
0      1017
5       147
11      100
13       86
7        60
       ... 
132       1
133       1
134       1
111       1
135       1
Name: count, Length: 136, dtype: int64


In [32]:
# Dump the full message list of every cluster holding more than 10 records
large_cluster_ids = cluster_sizes[cluster_sizes > 10].index
for cluster_id in large_cluster_ids:
    member_messages = df.loc[df['cluster'] == cluster_id, 'log_message']
    print(f"\nCluster {cluster_id} (size: {cluster_sizes[cluster_id]}):")
    print(member_messages.to_string())
    print("-" * 80)



Cluster 0 (size: 1017):
0       nova.osapi_compute.wsgi.server [req-b9718cd8-f...
3       nova.osapi_compute.wsgi.server [req-4895c258-b...
4       nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...
5       nova.osapi_compute.wsgi.server [req-f0bffbc3-5...
9       nova.osapi_compute.wsgi.server [req-2bf7cfee-a...
12      nova.osapi_compute.wsgi.server [req-d4f8d0c2-4...
16      nova.osapi_compute.wsgi.server [req-6fe0e366-f...
17      nova.osapi_compute.wsgi.server [req-5f1c2027-e...
19      nova.osapi_compute.wsgi.server [req-945d1f31-a...
20      nova.osapi_compute.wsgi.server [req-033d97b9-6...
21      nova.osapi_compute.wsgi.server [req-75bc6269-8...
23      nova.osapi_compute.wsgi.server [req-077c3c87-b...
24      nova.osapi_compute.wsgi.server [req-4e83daf7-a...
25      nova.osapi_compute.wsgi.server [req-bfce366e-9...
28      nova.osapi_compute.wsgi.server [req-5e6e042b-f...
31      nova.metadata.wsgi.server [-] 10.11.21.138,10....
33      nova.osapi_compute.wsgi.server [req-fe9

In [40]:
import re
# Ordered (compiled pattern, label) rules; the first rule that matches wins.
# Compiled once here instead of rebuilding the table on every call.
# NOTE(review): the trailing "." in several patterns is unescaped, so it matches
# any single character rather than only a literal full stop. Left unchanged so
# that existing labels stay stable — confirm whether a literal "\." was meant.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using a fixed list of regex rules.

    Args:
        log_message: Raw log text. Non-string values (for example ``None`` or
            the float NaN pandas uses for missing cells) are tolerated.

    Returns:
        The label of the first rule whose pattern is found anywhere in the
        message (case-insensitive search), or ``"Other"`` when no rule matches
        or the input is not a string.
    """
    # Missing/non-text cells would make the regex search raise TypeError and
    # abort a whole `.apply(...)` pass; treat them as unclassified instead.
    if not isinstance(log_message, str):
        return "Other"
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return "Other"

In [41]:
# Spot-check: upper-case "OUT" still matches because the search ignores case
classify_with_regex("User User494 logged OUT.")

'User Action'

In [42]:
# Spot-check the account-creation rule
classify_with_regex("Account with ID A0898234 created by Dhaval")

'User Action'

In [43]:
# Run the regex classifier on every log message and keep the result in a new
# `regex_label` column (this mutates `df`)
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,Other
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,Other
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,Other
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,Other
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,Other
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,Other
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,Other
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,Other
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,Other


In [48]:
# (rows, columns) of the full frame
df.shape

(2410, 7)

In [46]:
# Rows that no regex rule matched (labelled "Other")
df[df.regex_label=="Other"]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,Other
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,Other
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,Other
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,Other
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,Other
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,Other
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,Other
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,Other
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,Other


In [51]:
# Keep only the rows the regex rules left as "Other"; `.copy()` makes this an
# independent frame rather than a view onto `df`
df_non_regex = df[df['regex_label'] == "Other"].copy()
df_non_regex.shape

(1910, 7)

In [52]:
# Count the remaining rows per target label, then show the labels that have
# at most 5 examples
target_counts = df_non_regex['target_label'].value_counts()
target_counts[target_counts <= 5]


target_label
Workflow Error         4
Name: count, dtype: int64

In [55]:
# Drop every row whose source is 'LegacyCRM', then list the sources that remain.
# NOTE(review): presumably LegacyCRM logs are handled by a separate path — confirm.
df_non_legacy = df_non_regex[df_non_regex.source!='LegacyCRM']
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [56]:
# Re-encode only the messages that survived the regex and LegacyCRM filters,
# then peek at the first two vectors
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
filtered_embeddings[:2]

array([[-1.02939695e-01,  3.35459337e-02, -2.20260713e-02,
         1.55107002e-03, -9.86915641e-03, -1.78956270e-01,
        -6.34410605e-02, -6.01761900e-02,  2.81108692e-02,
         5.99619113e-02, -1.72618758e-02,  1.43372442e-03,
        -1.49560004e-01,  3.15285148e-03, -5.66031002e-02,
         2.71685962e-02, -1.49890035e-02, -3.54038216e-02,
        -3.62936519e-02, -1.45410355e-02, -5.61494799e-03,
         8.75538141e-02,  4.55121025e-02,  2.50964351e-02,
         1.00187343e-02,  1.24266837e-02, -1.39923573e-01,
         7.68695995e-02,  3.14095579e-02, -4.15252894e-03,
         4.36902903e-02,  1.71250086e-02, -8.00951421e-02,
         5.74006177e-02,  1.89091787e-02,  8.55261683e-02,
         3.96399014e-02, -1.34371802e-01, -1.44370436e-03,
         3.06704687e-03,  1.76854089e-01,  4.44881897e-03,
        -1.69274863e-02,  2.24266667e-02, -4.35049683e-02,
         6.09029876e-03, -9.98167135e-03, -6.23972639e-02,
         1.07371928e-02, -6.04898669e-03, -7.14660510e-0

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split embeddings (features) and `target_label` (labels) into train and test
# sets: 20% of the rows are held out for testing, and random_state is fixed so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_embeddings,
    df_non_legacy['target_label'],
    test_size=0.2,
    random_state=42
)

In [63]:
# Train a logistic regression classifier on the training embeddings.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5 (it emits a FutureWarning and is scheduled for removal), and
# the default lbfgs solver already uses the multinomial loss for multiclass
# targets, so the fitted model is unchanged.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)



In [64]:
# Predict labels for the held-out test set and print per-class precision,
# recall and F1 alongside the overall averages
y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred))


                precision    recall  f1-score   support

Critical Error       0.92      1.00      0.96        35
         Error       0.96      0.89      0.92        27
   HTTP Status       1.00      1.00      1.00       197
Resource Usage       1.00      1.00      1.00        35
Security Alert       1.00      0.99      0.99        87

      accuracy                           0.99       381
     macro avg       0.98      0.98      0.98       381
  weighted avg       0.99      0.99      0.99       381



In [65]:
import joblib
from pathlib import Path

# Persist the trained classifier to disk.
# NOTE(review): the path is relative to the kernel's working directory.
MODEL_PATH = '../models/log_classifier.joblib'

# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout can run this cell without a FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lr_model, MODEL_PATH)

['models/log_classifier.joblib']