# Rerank relevant results

In [19]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import glob
import html
import os
from tqdm.auto import tqdm
from FlagEmbedding import FlagReranker, FlagModel

from typing import Union
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load comments

In [14]:
# Load the most recent CSV file
list_of_files = glob.glob('out/full_labeled_comments_*.parquet')
latest_file = max(list_of_files, key=os.path.getctime)
comments = pd.read_parquet(latest_file)

In [17]:
comments['text'] = comments['text'].apply(html.unescape)
pairs = comments.text.apply(lambda comment: ['product market fit', comment]).tolist()

# Rerank model

In [None]:
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-base-en-v1.5 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [10]:
scores = reranker.compute_score(pairs[:30])
rankings = pd.DataFrame([scores, comments.text]).T
rankings

Unnamed: 0,0,1
0,-10.911517,Laziness is the enemy. I spend a lot of time ...
1,-11.037979,"I just stuck it on a public server, behind a B..."
2,-10.933453,Thanks for answering. I'm not NetBeans fan mys...
3,-10.574196,> it misses the main point of the new table fo...
4,-6.169174,> If I'm already going through the trouble of ...
...,...,...
3241,,As a long-time reader of Daring Fireball I lea...
3242,,Funny..but I was expecting them to have it all...
3243,,"Agreed, you should make a decision on who is t..."
3244,,"Very nice, thank you!<p>Unhelpfully my only pa..."


# Embedding similarity

In [20]:
model = AutoModel.from_pretrained("avsolatorio/NoInstruct-small-Embedding-v0")
tokenizer = AutoTokenizer.from_pretrained("avsolatorio/NoInstruct-small-Embedding-v0")

In [None]:
def get_embedding(text: Union[str, list[str]], mode: str = "sentence"):
    model.eval()

    assert mode in ("query", "sentence"), f"mode={mode} was passed but only `query` and `sentence` are the supported modes."

    if isinstance(text, str):
        text = [text]

    inp = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        output = model(**inp)

    # The model is optimized to use the mean pooling for queries,
    # while the sentence / document embedding uses the [CLS] representation.

    if mode == "query":
        vectors = output.last_hidden_state * inp["attention_mask"].unsqueeze(2)
        vectors = vectors.sum(dim=1) / inp["attention_mask"].sum(dim=-1).view(-1, 1)
    else:
        vectors = output.last_hidden_state[:, 0, :]

    return vectors

In [None]:
embeddings = get_embedding(comments.text.tolist(), mode="sentence")
timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
torch.save(embeddings, f"out/comments_embeddings-noInstructSmall_{timestamp}.pt")

In [22]:

# Test the retrieval performance.
query = get_embedding("How can I find product market fit?", mode="query")

scores = F.cosine_similarity(query, embeddings, dim=-1)
# print(scores.cpu().numpy())


In [26]:
rankings = pd.DataFrame([scores.cpu().numpy(), comments.text]).T
rankings.columns = ['score', 'text']
rankings.sort_values('score', ascending=False).head(10)

Unnamed: 0,score,text
1481,0.916152,Product Market Fit
1479,0.916152,Product Market Fit
1480,0.882274,"Product Market Fit. Basically, can you find th..."
3199,0.860962,Thank you for response. Recently I found blog ...
1033,0.85509,It’s always product market fit. If the product...
2972,0.846613,Is that marketing (advertising) or product mar...
2050,0.845904,Thank you so much for taking the time to answe...
1362,0.844424,"Great point! And if you see, that there are al..."
460,0.840863,"Yeah, possibly. I have only visibility to the ..."
2018,0.827746,I appreciate this viewpoint.<p>There’s also a ...


In [None]:
models = ['BAAI/bge-base-en-v1.5', 'BAAI/bge-small-en-v1.5']
embedder = FlagModel(models[1], use_fp16=import pandas as pddasdfdasdfasdfddimport pandas saTrue)