In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer


I will use a simple sentence transformer to create the embeddings, and then feed the embeddings into a simple multi-output classifier.

In [371]:
df = pd.read_json("data/dataset.json")

# Shuffle the rows before the positional 50/50 train-test split further down.
# Without shuffling, the test half contained labels that never appear in the
# train half and the accuracy was 0. A fixed random_state keeps the shuffle
# (and therefore every downstream number) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Last expression of the cell, so the shuffled frame is actually displayed.
df.head()

I decided to use a LabelEncoder while at it; for this simple case we could also just enumerate list(set(labels)).

In [372]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# One encoder per target column; each fitted encoder is kept under its own
# name so the integer codes can be related back to that column's labels.
type_encoder, priority_encoder, project_encoder = (LabelEncoder() for _ in range(3))

# (encoder, raw label column, integer-coded column written back to df)
for encoder, source_col, encoded_col in (
    (type_encoder, "Type", "type_enc"),
    (priority_encoder, "Priority", "priority_enc"),
    (project_encoder, "project_name", "project_enc"),
):
    df[encoded_col] = encoder.fit_transform(df[source_col])

df.head()

Unnamed: 0,summary,description,reporter_name,project_name,Assignee,Priority,Type,id,type_enc,priority_enc,project_enc
0,User noticed: Rename AppCode module configurat...,Description:\n[!](Screenshot%202025-06-06%20at...,user_097,Fast Roe,user_066,Normal,Task,15,5,3,5
1,Issue reported: Bad visual feedback on selecte...,Observed behaviour:\nWhen using project depend...,user_060,Fast Wolf,,Normal,Bug,13,0,3,7
2,Improvement needed: Xcode file generation trig...,Observed behaviour:\nThe plugin currently trig...,user_097,Fast Roe,user_053,Normal,Task,16,5,3,5
3,User noticed: Preserve table filtering after n...,"Description:\n**Problem:**\nCurrently, when a ...",user_134,Fast Buffalo,,Normal,Bug,3,0,3,1
4,User noticed: Regression: Unfriendly error whe...,Details provided:\nWhen attempting to `Create ...,user_050,,user_001,,,54,8,5,16


In [373]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Build one text per issue as "<summary> <description>". Missing fields become
# empty strings so the literal text "nan"/"None" is never embedded.
issue_texts = (
    df["summary"].fillna("").astype(str)
    + " "
    + df["description"].fillna("").astype(str)
)

# Encode all texts in a single batched call instead of one model call per row;
# encode() returns a 2-D array with one embedding per input text.
embedding_matrix = embedding_model.encode(issue_texts.tolist())

# Store one 1-D vector per row so later cells can np.vstack the column.
df["embeddings"] = list(embedding_matrix)

In [374]:
# Positional split: the first `split_at` rows train the model, the rest test it.
split_at = 50

embedding_rows = df["embeddings"].values
label_matrix = df[["type_enc", "priority_enc", "project_enc"]].values

# Each stored embedding is a separate vector; stack them into 2-D matrices.
x_train = np.vstack(embedding_rows[:split_at])
y_train = label_matrix[:split_at]
x_test = np.vstack(embedding_rows[split_at:])
y_test = label_matrix[split_at:]

# Display the four shapes as a sanity check on the split.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((50, 384), (50, 3), (50, 384), (50, 3))

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC


# MultiOutputClassifier fits one independent LinearSVC per target column
# (type, priority, project). A fixed random_state makes the fit reproducible
# from run to run.
classifier = MultiOutputClassifier(LinearSVC(random_state=42))
classifier.fit(x_train, y_train)

predictions = classifier.predict(x_test)

# NOTE: for multi-output targets this score is exact-match accuracy, i.e. a row
# only counts as correct when all three targets are predicted correctly.
classifier.score(x_test, y_test)

np.float64(0.44)

In [376]:
# Test-half rows (same positional split as above) with their true encoded labels.
held_out = df.iloc[50:][["project_name", "type_enc", "priority_enc", "project_enc"]]

# Predictions share the test rows' index so the join lines them up row by row.
pred_df = pd.DataFrame(
    predictions,
    index=held_out.index,
    columns=["type_pred", "priority_pred", "project_pred"],
)
table = held_out.join(pred_df)

# Per-target hit flags: True where the predicted code equals the true code.
for target in ("type", "priority", "project"):
    table[f"{target}_correct"] = table[f"{target}_enc"] == table[f"{target}_pred"]

# We have a lot of issues without a project name or type, and because the
# dataset is really small, our accuracy is low.
# dropna=False keeps the issues without a project name as their own group.
correct_cols = ["type_correct", "priority_correct", "project_correct"]
accuracy_report = table.groupby("project_name", dropna=False)[correct_cols].mean() * 100
accuracy_report

Unnamed: 0_level_0,type_correct,priority_correct,project_correct
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fast Badger,25.0,100.0,0.0
Fast Buffalo,0.0,0.0,0.0
Fast Falcon,0.0,0.0,0.0
Fast Hedgehog,100.0,0.0,0.0
Fast Panda,0.0,0.0,0.0
Fast Roe,0.0,0.0,0.0
Fast Wolf,0.0,100.0,0.0
Lazy Beaver,0.0,0.0,0.0
Lazy Panda,0.0,0.0,0.0
Lazy Raccoon,0.0,0.0,0.0
