In [2]:
!pip install -U sentence-transformers faiss-cpu
!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample
import faiss
import pandas as pd
import numpy as np



In [None]:
# Input tables. Later cells read the "id" and "text" columns of the feedback table
# and the "id" and "full_text" columns of the backlog table.
feedback_csv = "/dbfs/path/to/feedbacks.csv"
backlog_csv = "/dbfs/path/to/backlog_items.csv"

feedback_df = pd.read_csv(feedback_csv)
backlog_df = pd.read_csv(backlog_csv)

In [None]:
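# The base checkpoint is actually loaded in the "Step 3" cell below; to try a stronger
# pre-trained model, change the checkpoint name there.
# model = SentenceTransformer('all-MiniLM-L6-v2')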

In [None]:
# Labeled feedback/backlog pairs used for fine-tuning (CSV stored in DBFS).
data_path = "/dbfs/path/to/your_feedback_backlog_pairs.csv"

# Expected columns: feedback_text, backlog_text, label (1 = match, 0 = not match).
df = pd.read_csv(data_path)

# One InputExample per labeled row: the two texts form the pair and the label is cast
# to float. Columns are walked in lockstep instead of materialising a Series per row.
train_samples = [
    InputExample(texts=[feedback, backlog], label=float(pair_label))
    for feedback, backlog, pair_label in zip(
        df['feedback_text'], df['backlog_text'], df['label']
    )
]

In [None]:
# Step 3: Prepare Model & Dataloader
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Start from the pre-trained all-MiniLM-L6-v2 Sentence-BERT checkpoint.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Serve the labeled pairs in shuffled mini-batches of 16.
train_dataloader = DataLoader(dataset=train_samples, batch_size=16, shuffle=True)

# Loss: CosineSimilarityLoss fits the cosine similarity of each pair's embeddings to its
# float label (a regression-style objective; losses.ContrastiveLoss is the margin-based
# contrastive alternative).
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Step 4: Fine-Tune the Model
model_save_path = "/dbfs/path/to/fine_tuned_model/"
num_epochs = 2

# A single training objective: the labeled-pair dataloader together with its loss.
training_objective = (train_dataloader, train_loss)

# Train for num_epochs with 10 warm-up steps; output_path points the run at model_save_path,
# which the next cell loads the model from.
model.fit(
    train_objectives=[training_objective],
    epochs=num_epochs,
    warmup_steps=10,
    output_path=model_save_path,
)

In [None]:
# Step 5: Use the Fine-Tuned Model
# Reload the model from model_save_path (the output_path given to model.fit) so the
# remaining cells run on the persisted artefact. SentenceTransformer is already imported
# in the notebook's first cell, so it is not re-imported here.
fine_tuned_model = SentenceTransformer(model_save_path)

In [None]:
# Embed the "full_text" of every backlog item with the fine-tuned model; the result is
# requested as a NumPy array so it can be handed to FAISS in the next cell.
backlog_texts = backlog_df["full_text"].tolist()
backlog_embeddings = fine_tuned_model.encode(
    backlog_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# Build a flat L2 (Euclidean-distance) FAISS index whose width is the embedding size,
# then load all backlog vectors into it. Row positions in the index therefore line up
# with row positions in backlog_df.
dimension = backlog_embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(backlog_embeddings)

In [None]:
# Embed the feedback texts with the same fine-tuned model used for the backlog items,
# so both sets of vectors come from one encoder.
feedback_texts = feedback_df["text"].tolist()
feedback_embeddings = fine_tuned_model.encode(
    feedback_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

In [None]:
# Search top-K matches
K = 5

# FAISS pads the result with index -1 when fewer than K neighbours exist. A -1 fed to
# positional lookup would silently select the LAST backlog row, so never ask for more
# neighbours than the index holds, and skip any negative index defensively.
top_k = min(K, index.ntotal)
D, I = index.search(feedback_embeddings, top_k)  # D = distances, I = indices

# Pull the lookup columns once. Reading a whole row via iloc[i] builds a Series per
# access and may upcast the id to the row's common dtype; column lists keep the values as-is.
feedback_ids = feedback_df["id"].tolist()
backlog_ids = backlog_df["id"].tolist()
backlog_full_texts = backlog_df["full_text"].tolist()

# Build match result dataframe: one record per (feedback, candidate backlog item),
# with rank 1 being the nearest neighbour returned by the index.
match_results = []
for i, feedback in enumerate(feedback_texts):
    for rank, idx in enumerate(I[i]):
        if idx < 0:  # padding entry: no real neighbour at this rank
            continue
        match_results.append({
            "feedback_id": feedback_ids[i],
            "feedback_text": feedback,
            "matched_backlog_id": backlog_ids[idx],
            "backlog_text": backlog_full_texts[idx],
            "distance": D[i][rank],
            "rank": rank + 1
        })

# Explicit columns keep the schema stable even when there are no matches at all.
matches_df = pd.DataFrame(
    match_results,
    columns=[
        "feedback_id",
        "feedback_text",
        "matched_backlog_id",
        "backlog_text",
        "distance",
        "rank",
    ],
)

In [None]:
# For better precision, re-rank top-K using a cross-encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Each candidate becomes a (feedback_text, backlog_text) tuple; the tuples are stored
# in a "pair" column and scored in one predict() call.
text_pairs = list(zip(matches_df["feedback_text"], matches_df["backlog_text"]))
matches_df["pair"] = text_pairs

matches_df["score"] = cross_encoder.predict(matches_df["pair"].tolist())

In [None]:
# Keep, for every feedback item, the single candidate row with the highest cross-encoder
# score. Whole rows are selected with drop_duplicates: GroupBy.first() takes the first
# non-null value of each column independently, so a NaN in the top row would be
# back-filled from a lower-ranked candidate and the "best match" would mix two rows.
best_matches = (
    matches_df
    .dropna(subset=["feedback_id"])            # groupby dropped NaN keys as well
    .sort_values("score", ascending=False)
    .drop_duplicates(subset="feedback_id", keep="first")
    .sort_values("feedback_id")                # same row order as the grouped result
    .reset_index(drop=True)
)

# Persist the per-feedback best matches (no index column in the file).
best_matches.to_csv("/dbfs/path/to/similarity_matches.csv", index=False)