In [1]:
import pandas as pd

from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

### Data Exploration

In [2]:
# NOTE: IPython reads this setting before a cell starts executing, so assigning
# it here only affects cells that run *after* this one. The inspections below
# therefore use explicit display() calls so that every one of them is rendered
# on a fresh "Restart & Run All", not just the last expression.
InteractiveShell.ast_node_interactivity = "all"

# Load the labelled logs and keep only the three columns used in this notebook.
df = pd.read_csv("../datasets/labeled_logs.csv")[['source', 'log_message', 'target_label']]

display(df.head())                           # first rows
display(df.shape)                            # (n_rows, n_columns)
display(df['source'].value_counts())         # number of logs per source system
display(df['target_label'].value_counts())   # number of logs per label
display(df[df['source']=='LegacyCRM'])       # every log coming from LegacyCRM

Unnamed: 0,source,log_message,target_label
60,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error
255,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning
377,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error
1325,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error
1734,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning
1826,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning
2217,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error


In [3]:
InteractiveShell.ast_node_interactivity = "last"

# Sentence-embedding model used to turn each log message into a vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# 'Workflow Error' and 'Deprecation Warning' have too few examples to train on
# (an LLM will be used to classify the LegacyCRM logs), so they are excluded
# before the embeddings are computed.
df2 = df.loc[~df['target_label'].isin(['Workflow Error', 'Deprecation Warning'])].copy()

# One embedding per remaining log message, in the same row order as df2.
embeddings = model.encode(df2['log_message'].to_list())

In [4]:
# Cluster the message embeddings with DBSCAN using cosine distance
# (eps=0.1, min_samples=1), then record each row's cluster id on df2.
clustering = DBSCAN(metric='cosine', eps=0.1, min_samples=1)
clustering.fit(embeddings)
df2['cluster'] = clustering.labels_

In [5]:
# Kept for the cells that run after this one; IPython applies the setting from
# the next executed cell onwards, and it never auto-displays expressions nested
# inside a loop body. Everything this cell wants to show is display()-ed
# explicitly instead.
InteractiveShell.ast_node_interactivity = "all"

display(df2.head())

vc = df2['cluster'].value_counts()  # cluster id -> number of rows, largest first
# Number of distinct labels per cluster, computed once rather than by
# re-filtering df2 on every loop iteration.
labels_per_cluster = df2.groupby('cluster')['target_label'].nunique()
failed_clusters = [] # clusters with mixed labels

for cluster_id in vc.index.tolist():
    n_labels = labels_per_cluster[cluster_id]
    if n_labels == 1 and vc[cluster_id] > 5:
        # Consistently-labelled cluster with more than 5 rows: show a sample.
        print(f"- cluster {cluster_id}:")
        display(df2.loc[df2['cluster'] == cluster_id, ['log_message', 'target_label']].head(3))
    elif n_labels > 1:
        # Rows in the same cluster disagree on the label.
        failed_clusters.append(cluster_id)
print("Failed clusters:", failed_clusters)

- cluster 0:
- cluster 16:
- cluster 15:
- cluster 5:
- cluster 9:
- cluster 27:
- cluster 13:
- cluster 4:
- cluster 44:
- cluster 20:
- cluster 11:
- cluster 31:
- cluster 3:
- cluster 21:
- cluster 32:
- cluster 51:
- cluster 23:
- cluster 90:
- cluster 12:
- cluster 56:
- cluster 58:
- cluster 105:
- cluster 36:
- cluster 82:
- cluster 35:
- cluster 71:
- cluster 7:
- cluster 179:
Failed clusters: [188, 1, 73]


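The "failed" clusters are those whose log messages carry more than one label. I will remove them so that the BERT model is trained only on consistently labelled data.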

In [6]:
# Training data for the BERT classifier: every row of df2 that is not in a
# mixed-label ("failed") cluster, without the helper 'cluster' column.
newdf = df2.loc[
    ~df2['cluster'].isin(failed_clusters),
    ['source', 'log_message', 'target_label'],
].copy()

# Shown explicitly so the output does not depend on the notebook-wide
# ast_node_interactivity setting left behind by an earlier cell.
display(newdf.shape)

newdf.to_csv("../datasets/cleaned_data_for_bert.csv", index=False)

# Data for the LLM path: all LegacyCRM logs from the original frame.
# NOTE(review): the BERT set above is filtered by target_label while this set
# is filtered by source; confirm no other source carries the excluded labels,
# otherwise those rows end up in neither file.
df.loc[df['source'] == 'LegacyCRM'].to_csv("../datasets/cleaned_data_for_llm.csv", index=False)

(2389, 3)