In [3]:
import pandas as pd
# Read the synthetic log dataset and discard its 'complexity' column
df = pd.read_csv("synthetic_logs.csv").drop(columns='complexity')

In [4]:
# Only the last expression of a cell is rendered automatically, so the list of
# distinct log sources is shown explicitly before previewing the first rows.
display(df.source.unique())
df.head()


Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [5]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into an embedding vector, showing a progress bar
print("Generating embeddings...")
embeddings = model.encode(df.log_message.to_list(), show_progress_bar=True)


  from .autonotebook import tqdm as notebook_tqdm


Generating embeddings...


Batches: 100%|██████████| 76/76 [00:46<00:00,  1.62it/s]


In [6]:
# Peek at the first five rows of the embeddings array
embeddings[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], shape=(5, 384), dtype=float32)

In [7]:

from sklearn.cluster import DBSCAN

# Cluster the message embeddings with DBSCAN using cosine distance.
# fit_predict() fits the estimator and returns the cluster labels in one step,
# so the clustering runs exactly once; calling .fit() first and then
# .fit_predict() would compute the same clustering twice.
print("Applying DBSCAN clustering...")
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels to the dataframe (one label per row, same order as the embeddings)
df['cluster'] = clusters

Applying DBSCAN clustering...


In [8]:
# Print each message next to the cluster id it was assigned
print(df.loc[:, ['log_message', 'cluster']])
# Display every row that was placed in cluster 12
df.loc[df['cluster'] == 12]


                                            log_message  cluster
0     nova.osapi_compute.wsgi.server [req-b9718cd8-f...        0
1        Email service experiencing issues with sending        1
2             Unauthorized access to data was attempted        2
3     nova.osapi_compute.wsgi.server [req-4895c258-b...        0
4     nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...        0
...                                                 ...      ...
2405  nova.osapi_compute.wsgi.server [req-96c3ec98-2...        0
2406  User 3844 account experienced multiple failed ...        7
2407  nova.metadata.wsgi.server [req-b6d4a270-accb-4...        0
2408      Email service affected by failed transmission        1
2409  Repeated failed login attempts occurred for us...        7

[2410 rows x 2 columns]


Unnamed: 0,timestamp,source,log_message,target_label,cluster
29,3/10/2025 23:11,AnalyticsEngine,Alert: brute force login attempt from 192.168....,Security Alert,12
304,1/22/2025 16:51,ModernHR,Alert: brute force login attempt from 192.168....,Security Alert,12
939,6/9/2025 6:37,ModernCRM,Brute force login detected from IP 192.168.237...,Security Alert,12
1556,2/9/2025 7:08,BillingSystem,Brute force login detected from IP 192.168.246...,Security Alert,12
2142,10/24/2025 5:18,ModernCRM,Brute force login detected from IP 192.168.174...,Security Alert,12
2282,6/30/2025 22:15,ModernCRM,Alert: brute force login attempt from 192.168....,Security Alert,12


In [9]:
# Tally how many log messages were assigned to each cluster
cluster_counts = df['cluster'].value_counts()

# Show sample messages for every cluster holding more than 10 records
print("\nClusters with more than 10 records:")
for cluster_id, count in cluster_counts.items():
    # Skip small clusters and the noise label (-1)
    if count <= 10 or cluster_id == -1:
        continue
    print(f"\nCluster {cluster_id} (containing {count} records):")

    # First five messages belonging to this cluster
    examples = df.loc[df['cluster'] == cluster_id, 'log_message'].head(5).tolist()
    for position, example in enumerate(examples, start=1):
        print(f"  {position}. {example}")


Clusters with more than 10 records:

Cluster 0 (containing 1017 records):
  1. nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  2. nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  3. nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  4. nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54f

In [10]:
import re
# Ordered (compiled pattern, label) pairs, compiled once at definition time
# instead of rebuilding the mapping on every call. Order matters: the first
# matching pattern wins.
# NOTE(review): the trailing "." in several patterns is unescaped, so it matches
# any single character rather than only a literal period. The patterns are kept
# unchanged to preserve existing classifications; confirm whether "\." was meant.
_REGEX_PATTERNS = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Return the label of the first regex pattern found in ``log_message``.

    Parameters
    ----------
    log_message : str
        Raw log line to classify. Patterns are searched anywhere in the text
        (``search``, not a full match) and are case-sensitive.

    Returns
    -------
    str or None
        ``"User Action"`` or ``"System Notification"`` when a pattern matches;
        ``None`` when nothing matches or when ``log_message`` is not a string
        (for example a missing value coming from a DataFrame column).
    """
    # Missing values (None / NaN) would make the regex engine raise TypeError;
    # treat anything that is not text as "no match".
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_PATTERNS:
        if pattern.search(log_message):
            return label
    return None

In [11]:
# Smoke check: a message shaped like the "User User<digits> logged in." pattern
classify_with_regex("User User123 logged in.")


'User Action'

In [12]:
# Smoke check: a message shaped like the "System reboot initiated by user ..." pattern
classify_with_regex("System reboot initiated by user User179.")


'System Notification'

In [13]:
# Smoke check: a message that matches no pattern; the call returns None, so nothing is displayed
classify_with_regex("Hey you, chill bro")

In [14]:
# Label each message with the regex classifier (None where no pattern matches)
df['regex_label'] = df['log_message'].apply(classify_with_regex)
# Show only the rows that received a regex label
df.loc[df['regex_label'].notna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action
...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,13,System Notification


In [15]:
# Independent copy of the rows the regex stage left unlabelled
df_non_regex = df.loc[df['regex_label'].isna()].copy()
df_non_regex.shape

(1910, 6)

In [16]:
# Frequency of each target label among the rows without a regex label
label_counts = df_non_regex['target_label'].value_counts()

# Collect the labels that occur fewer than 5 times
rare_labels = [label for label, occurrences in label_counts.items() if occurrences < 5]

print(f"Target labels with fewer than 5 occurrences: {rare_labels}")



In [17]:
# Keep only the rows whose target label is not one of the rare labels
df_cleaned = df_non_regex.loc[~df_non_regex['target_label'].isin(rare_labels)]

print(f"After removing rare labels, dataframe shape: {df_cleaned.shape}")

After removing rare labels, dataframe shape: (1903, 6)


In [18]:
# Reuse the SentenceTransformer already held in `model`; constructing the same
# 'all-MiniLM-L6-v2' model a second time here was redundant work.

# Generate embeddings for the log messages that remain after the regex stage
# and the removal of rare target labels
print("Generating embeddings...")
embeddings2 = model.encode(df_cleaned['log_message'].tolist(), show_progress_bar=True)


Generating embeddings...


Batches: 100%|██████████| 60/60 [00:50<00:00,  1.18it/s]


In [19]:
# Peek at the first five rows of the second embeddings array
embeddings2[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], shape=(5, 384), dtype=float32)

In [20]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Features are the message embeddings; targets are the raw string labels,
# passed to the classifier directly (no LabelEncoder step).
X = embeddings2
y = df_cleaned['target_label'].values

# Hold out 30% of the samples for evaluation, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print("Training logistic regression model has changes...")
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out set and print the per-class classification report
y_pred = lr_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

Training logistic regression model has changes...
                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [None]:
# Distinct labels present in the target column of the cleaned data
print("Unique values in target_label:", df_cleaned.target_label.unique())

# Class labels stored on the fitted classifier
print("Classes recognized by the model:", lr_model.classes_)



Unique values in target_label: ['HTTP Status' 'Critical Error' 'Security Alert' 'Error' 'Resource Usage']
Classes recognized by the model: ['Critical Error' 'Error' 'HTTP Status' 'Resource Usage' 'Security Alert']


In [None]:
'''Based on the output above, we can conclude the following.
This explains why your model works: the LogisticRegression class in scikit-learn can handle string class labels directly.
What's happening:

When you use .values, you get a NumPy array of strings: ['HTTP Status', 'Critical Error', etc.]
Scikit-learn's LogisticRegression automatically handles these string labels internally by:

- Creating a mapping from the unique string values to numeric indices
- Using these numeric indices during training and prediction
- Preserving the original string labels in the classes_ attribute


When you call predict(), the model returns the original string labels rather than numeric indices, which makes it seem like no encoding was necessary.

This is a convenient feature, but it happens "under the hood": the model still converts the strings to numbers internally; you just don't see that step explicitly in your code.
While this works fine, explicitly using LabelEncoder would still be considered a best practice because:

- It makes the encoding step visible in your code
- It gives you more control over the encoding process
- It's more consistent with how preprocessing is typically handled in ML pipelines

But your current approach is valid and working correctly, as evidenced by the model recognizing all your class labels properly!'''

In [22]:
import joblib
# Persist the trained classifier to disk at the given relative path
joblib.dump(lr_model, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']