In [None]:
!pip install -U sentence-transformers faiss-cpu
!pip install -U sentence-transformers

import os

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample

In [2]:
# Directory holding the input CSVs. Set the FEEDBACK_DATA_DIR environment variable
# to run the notebook on another machine; the fallback is the original local path.
DATA_DIR = os.environ.get(
    "FEEDBACK_DATA_DIR",
    "/Users/faisal/PycharmProjects/PythonProject/JupyterProject/Data",
)

# Customer feedback; later cells read its "feedbacks" column.
feedback_df = pd.read_csv(os.path.join(DATA_DIR, "feedbacks.csv"), encoding="utf-8")
# Backlog items; later cells read its "backlog_text" column.
backlog_df = pd.read_csv(os.path.join(DATA_DIR, "backlog_items.csv"), encoding="utf-8")

In [3]:
# Sentence-embedding checkpoint used by the encoding cells below.
# A stronger checkpoint can be substituted by changing this name.
BASE_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(BASE_MODEL_NAME)

In [None]:
# Path to your labeled pairs CSV in DBFS
data_path = "/dbfs/path/to/your_feedback_backlog_pairs.csv"

# Load CSV: columns = feedback_text, backlog_text, label (1 = match, 0 = not match)
df = pd.read_csv(data_path)

# A row with a missing text or label cannot form a training pair (a NaN text would be
# passed on as a float instead of a string), so such rows are dropped up front.
# `dropna(subset=...)` also raises KeyError if an expected column is absent.
labeled_pairs = df.dropna(subset=["feedback_text", "backlog_text", "label"])

# Build one InputExample per labeled pair. Iterating the columns directly avoids the
# per-row Series construction that `iterrows()` performs.
train_samples = [
    InputExample(texts=[str(feedback), str(backlog)], label=float(label))
    for feedback, backlog, label in zip(
        labeled_pairs["feedback_text"],
        labeled_pairs["backlog_text"],
        labeled_pairs["label"],
    )
]

In [None]:
# Step 3: Prepare Model & Dataloader
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

# Start from the pre-trained Sentence-BERT checkpoint (this rebinds `model`).
model = SentenceTransformer("all-MiniLM-L6-v2")

# Serve the labeled pairs in shuffled mini-batches of 16.
train_dataloader = DataLoader(dataset=train_samples, batch_size=16, shuffle=True)

# Loss driven by the cosine similarity of each pair's two embeddings and the
# pair's float label from the training CSV.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Step 4: Fine-Tune the Model
model_save_path = "/dbfs/path/to/fine_tuned_model/"
num_epochs = 2

# A single (dataloader, loss) objective; the trained model is written to model_save_path.
training_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=training_objectives,
    output_path=model_save_path,
    epochs=num_epochs,
    warmup_steps=10,
)

In [None]:
# Step 5: Use the Fine-Tuned Model
# Reload the model from the directory the fine-tuning step saved to.
# NOTE(review): the embedding cells below call `model.encode`, not
# `fine_tuned_model.encode` — confirm which model is meant to be used.
fine_tuned_model = SentenceTransformer(model_name_or_path=model_save_path)

In [4]:
# Encode every backlog item, in DataFrame row order, into a NumPy embedding matrix.
backlog_sentences = backlog_df["backlog_text"].tolist()
backlog_embeddings = model.encode(
    backlog_sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Build a FAISS flat L2 index over the backlog embeddings.
# Embeddings are added in backlog_df row order, so an id returned by a search is
# the positional row number of the matching backlog item.
dimension = backlog_embeddings.shape[1]  # width of one embedding vector
index = faiss.IndexFlatL2(dimension)
index.add(backlog_embeddings)

In [27]:
# Keep only genuine string feedback. Missing values are not `str`, so this single
# filter also removes them.
feedback_texts = [text for text in feedback_df["feedbacks"] if isinstance(text, str)]

# Encode the surviving feedback into a NumPy embedding matrix.
feedback_embeddings = model.encode(
    feedback_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Normalise both embedding matrices to C-contiguous float32 arrays.
# Note: the FAISS index was already filled from `backlog_embeddings` in an earlier
# cell, so converting it here only affects uses after this point.
feedback_embeddings = np.ascontiguousarray(feedback_embeddings, dtype=np.float32)
backlog_embeddings = np.ascontiguousarray(backlog_embeddings, dtype=np.float32)

In [None]:
# Run batch search
K = 5  # number of nearest neighbors requested per feedback

if index.ntotal == 0:
    raise ValueError("FAISS index is empty; add backlog embeddings before searching.")

# Never request more neighbours than the index holds: FAISS fills the missing slots
# with id -1, which a positional lookup downstream would misread as "last row".
top_k = min(K, index.ntotal)

# A single call searches every feedback embedding at once. D holds the distances
# and I the backlog row ids, both shaped (n_feedback, top_k).
D, I = index.search(feedback_embeddings, top_k)
print("All searches complete:", D.shape, I.shape)

In [32]:
# Flatten the search output into one row per (feedback, candidate backlog item).

# Fetch the backlog texts once instead of doing a DataFrame row lookup per neighbour.
backlog_lookup = backlog_df["backlog_text"].tolist()

match_results = []
for i, feedback in enumerate(feedback_texts):
    # I[i] / D[i] are the neighbour ids and distances for this feedback, best first.
    for rank, (idx, distance) in enumerate(zip(I[i], D[i]), start=1):
        if idx < 0:
            # FAISS reports "no neighbour" as id -1; indexing with it would silently
            # return the last backlog item, so skip the slot instead.
            continue
        match_results.append({
            "feedback_text": feedback,
            "backlog_text": backlog_lookup[idx],
            "distance": distance,
            "rank": rank,
        })

# Explicit columns keep the frame's schema stable even when there are no matches.
matches_df = pd.DataFrame(
    match_results,
    columns=["feedback_text", "backlog_text", "distance", "rank"],
)
# Optional: display first few matches
print(matches_df.head())

                                                feedback_text  \
0  The app keeps freezing randomly, not sure what's going on.   
1  The app keeps freezing randomly, not sure what's going on.   
2  The app keeps freezing randomly, not sure what's going on.   
3  The app keeps freezing randomly, not sure what's going on.   
4  The app keeps freezing randomly, not sure what's going on.   

                                                 backlog_text  distance  rank  
0    Fix intermittent crash occurring on content detail view.  1.254880     1  
1  Improve application performance during screen transitions.  1.330254     2  
2            Implement dark mode UI fixes across all devices.  1.639975     3  
3         Enable persistent login and device handoff support.  1.665462     4  
4  Introduce a simplified layout for better mobile usability.  1.791740     5  


In [33]:
# For better precision, re-rank top-K using a cross-encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Pair each feedback with its candidate backlog text. Zipping the two columns avoids
# the per-row Python call that `DataFrame.apply(axis=1)` makes.
matches_df["pair"] = list(zip(matches_df["feedback_text"], matches_df["backlog_text"]))

# One cross-encoder score per (feedback, backlog) pair; the next cell sorts on it,
# descending, to pick the best match.
matches_df["score"] = cross_encoder.predict(matches_df["pair"].tolist())

In [34]:
# Order candidates by feedback text, best cross-encoder score first within each feedback.
ranked_matches = matches_df.sort_values(
    by=["feedback_text", "score"],
    ascending=[True, False],
)

# The first row for each feedback text is therefore its top-1 match.
top_matches = ranked_matches.drop_duplicates(subset="feedback_text", keep="first")
top_matches = top_matches.reset_index(drop=True)

# Show the full text of every column rather than a truncated preview.
pd.set_option("display.max_colwidth", None)
print(top_matches[["feedback_text", "backlog_text", "score"]])

                                                 feedback_text  \
0               App behaves differently on my tablet vs phone.   
1           App crashes when I try to open a specific section.   
2     Can you please make it easier to find stuff in the menu?   
3                    Dark mode isn't working properly anymore.   
4                  I can't seem to change my settings anymore.   
5                 I keep getting the same error over and over.   
6               I lose track of where I am in the flow easily.   
7            I miss how things used to work before the update.   
8               I really wish the interface was a bit cleaner.   
9                 It's hard to tell what some icons even mean.   
10           Logging in is a pain every time I switch devices.   
11            Need more control over notifications and alerts.   
12                 Not sure how to go back without restarting.   
13                     Not sure if my changes are being saved.   
14        