In [1]:
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib

### BERT Training

In [2]:
# Read the cleaned log dataset; the rest of the notebook uses its
# 'log_message' and 'target_label' columns.
df = pd.read_csv(filepath_or_buffer="../datasets/cleaned_data_for_bert.csv")
# Bare trailing expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,source,log_message,target_label
0,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
2,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
3,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status
4,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status
...,...,...,...
2384,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status
2385,ModernHR,User 3844 account experienced multiple failed ...,Security Alert
2386,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status
2387,BillingSystem,Email service affected by failed transmission,Critical Error


In [3]:
# Lightweight pretrained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message as a plain Python list of strings; the result
# holds one embedding row per message.
embeddings_filtered = model.encode(
    sentences=df['log_message'].tolist(),
)

# Trailing expression: display (rows, embedding_dim) as the cell output.
embeddings_filtered.shape

(2389, 384)

In [4]:
# Features are the sentence embeddings; targets are the labelled log categories.
X, y = embeddings_filtered, df['target_label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified, so per-class proportions in the
# test set are left to chance -- consider stratify=y if class balance matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=67,
)

# Fit a logistic-regression classifier on the training embeddings.
# fit() returns the estimator itself, so construction and fitting chain cleanly.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and print the per-class precision / recall / F1
# table (print is used because the report is a multi-line string).
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

     Critical Error       0.98      0.98      0.98        56
              Error       0.98      0.98      0.98        57
        HTTP Status       1.00      1.00      1.00       289
     Resource Usage       1.00      1.00      1.00        52
     Security Alert       1.00      1.00      1.00       120
System Notification       1.00      1.00      1.00        97
        User Action       1.00      1.00      1.00        46

           accuracy                           1.00       717
          macro avg       0.99      0.99      0.99       717
       weighted avg       1.00      1.00      1.00       717



The accuracy is excellent: the only misclassifications are between the Critical Error and Error classes.

In [5]:
# Persist the fitted classifier to disk. Only the logistic-regression model is
# serialized here; the sentence-embedding model is not part of this file, so
# callers must encode log messages themselves before calling predict().
joblib.dump(value=classifier, filename='../models/bert_model.joblib')

['../models/bert_model.joblib']