In [None]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
model_name = "Snowflake/snowflake-arctic-embed-l-v2.0"
# Bi-encoder used to embed queries (passed to search() as `bi_encoder`).
# `device` is left unset so SentenceTransformer chooses the device itself
# (CUDA when available, otherwise a fallback) instead of hard-failing in
# `.cuda()` on machines without a GPU.
# NOTE(review): trust_remote_code=True allows Python code from the model
# repository to run locally - confirm it is required and consider pinning a revision.
model = SentenceTransformer(model_name, trust_remote_code=True)

In [None]:
import numpy as np
# Precomputed embeddings, later handed to search() as `embeddings`.
# np.load accepts the path directly and manages the file handle itself.
embeddings = np.load("llm-abstract-sentences-saev2.npy")

In [None]:
import json
import lzma
# Load the xz-compressed JSON records; the next cell reads "title" and "text"
# from each entry. Decode explicitly as UTF-8: text-mode lzma.open otherwise
# falls back to the platform's default encoding, which can mis-decode or fail
# on non-ASCII content.
with lzma.open("llm-abstract-sentences.json.xz", "rt", encoding="utf-8") as f:
    es = json.load(f)

In [None]:
# Searchable strings, one per record in `es`, formatted as "<title>: <text>".
sentences = [": ".join((entry["title"], entry["text"])) for entry in es]

In [None]:
from FlagEmbedding import FlagReranker
# Re-ranker handed to search() as `cross_encoder` by the first group of queries below.
# NOTE(review): use_fp16=True presumably requests half-precision inference for speed -
# confirm it is appropriate for the hardware this notebook runs on.
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

# Retrieval

In [None]:
import numpy as np
import pandas as pd
def search(query, text, embeddings, bi_encoder, cross_encoder, top=100, n_results=20):
    """Two-stage retrieval: bi-encoder recall followed by cross-encoder re-ranking.

    Parameters
    ----------
    query : str
        The question to search for.
    text : sequence of str
        Candidate passages; ``text[i]`` is looked up for row ``i`` of ``embeddings``.
    embeddings : array-like
        Precomputed passage embeddings, one per entry of ``text``
        (presumably produced by the same bi-encoder - verify against the caller).
    bi_encoder
        Object exposing ``encode(...)`` and ``similarity(...)`` as called below.
    cross_encoder
        Re-ranker exposing ``compute_score(pairs)`` where ``pairs`` is a list of
        ``[query, passage]`` lists.
    top : int, default 100
        Number of best bi-encoder hits that are passed on to the re-ranker.
    n_results : int, default 20
        Number of re-ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``score`` (bi-encoder similarity) and ``cross-score``,
        sorted by ``cross-score`` in descending order.
    """
    question_embedding = bi_encoder.encode(query, normalize_embeddings=True, prompt_name="query")

    # Use the encoder that was passed in rather than the notebook-global `model`,
    # so the function also works with a different bi-encoder (or a fresh kernel
    # where `model` is not defined).
    sim = bi_encoder.similarity(question_embedding, embeddings).flatten().numpy()

    # Indices of the `top` highest similarities, best first.
    hits = [{"text": text[i], "score": sim[i]} for i in sim.argsort()[::-1][0:top]]

    # Only the shortlisted hits are re-ranked (this is the expensive step).
    cross_input = [[query, hit["text"]] for hit in hits]
    cross_scores = cross_encoder.compute_score(cross_input)
    # NOTE(review): some re-ranker versions reportedly return a bare float when
    # given a single pair; normalise that case so the loop below still works.
    if np.ndim(cross_scores) == 0:
        cross_scores = [cross_scores]

    for hit, cross_score in zip(hits, cross_scores):
        hit["cross-score"] = cross_score

    # Re-sort by cross-score, descending.
    hits = sorted(hits, key=lambda x: x["cross-score"], reverse=True)

    return pd.DataFrame(hits[0:n_results])

In [None]:
# Widen text columns in displayed result frames.
# NOTE(review): pandas documents None as "no limit"; confirm 0 renders as intended.
pd.options.display.max_colwidth = 0

In [None]:
# General question, re-ranked with the FlagReranker instance.
search(
    "How long do I train an LLM?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=reranker,
)

In [None]:
# Question about a specific model, re-ranked with the FlagReranker instance.
search(
    "How long was Llama 3.2 trained?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=reranker,
)

In [None]:
# Conceptual question, re-ranked with the FlagReranker instance.
search(
    "How does SGD work?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=reranker,
)

In [None]:
# Cross-encoder loaded through the sentence-transformers CrossEncoder class.
# NOTE(review): the next cell rebinds `cross_encoder` to a different model, so on a
# top-to-bottom run this model is loaded and then replaced before any search() call uses it.
# NOTE(review): `util` is imported here but not used in the visible cells.
from sentence_transformers import CrossEncoder, util
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
from sentence_transformers import CrossEncoder, util
# Multilingual re-ranker; this binding of `cross_encoder` is the one the search()
# calls further down receive.
# NOTE(review): automodel_args is presumably forwarded to the underlying model loader;
# newer sentence-transformers releases may expect a different keyword - confirm
# against the installed version.
# NOTE(review): trust_remote_code=True allows code from the model repository to run
# locally - confirm it is required and consider pinning a revision.
cross_encoder = CrossEncoder("jinaai/jina-reranker-v2-base-multilingual",
    automodel_args={"torch_dtype": "auto"},
    trust_remote_code=True)

In [None]:
import numpy as np
import pandas as pd
def search(query, text, embeddings, bi_encoder, cross_encoder, top=100, n_results=20):
    """Two-stage retrieval: bi-encoder recall followed by cross-encoder re-ranking.

    This definition replaces the earlier ``search`` in the notebook; it differs
    in scoring candidates with ``cross_encoder.predict(...)``.

    Parameters
    ----------
    query : str
        The question to search for.
    text : sequence of str
        Candidate passages; ``text[i]`` is looked up for row ``i`` of ``embeddings``.
    embeddings : array-like
        Precomputed passage embeddings, one per entry of ``text``
        (presumably produced by the same bi-encoder - verify against the caller).
    bi_encoder
        Object exposing ``encode(...)`` and ``similarity(...)`` as called below.
    cross_encoder
        Re-ranker exposing ``predict(pairs)`` where ``pairs`` is a list of
        ``[query, passage]`` lists.
    top : int, default 100
        Number of best bi-encoder hits that are passed on to the re-ranker.
    n_results : int, default 20
        Number of re-ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``score`` (bi-encoder similarity) and ``cross-score``,
        sorted by ``cross-score`` in descending order.
    """
    question_embedding = bi_encoder.encode(query, normalize_embeddings=True, prompt_name="query")

    # Use the encoder that was passed in rather than the notebook-global `model`,
    # so the function also works with a different bi-encoder (or a fresh kernel
    # where `model` is not defined).
    sim = bi_encoder.similarity(question_embedding, embeddings).flatten().numpy()

    # Indices of the `top` highest similarities, best first.
    hits = [{"text": text[i], "score": sim[i]} for i in sim.argsort()[::-1][0:top]]

    # Only the shortlisted hits are re-ranked (this is the expensive step).
    cross_input = [[query, hit["text"]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_input)

    for hit, cross_score in zip(hits, cross_scores):
        hit["cross-score"] = cross_score

    # Re-sort by cross-score, descending.
    hits = sorted(hits, key=lambda x: x["cross-score"], reverse=True)

    return pd.DataFrame(hits[0:n_results])

In [None]:
# General question, re-ranked with the CrossEncoder bound to `cross_encoder`.
search(
    "How long do I train an LLM?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
)

In [None]:
# Question about a specific model, re-ranked with the CrossEncoder bound to `cross_encoder`.
search(
    "How long was Llama 3.2 trained?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
)

In [None]:
# Conceptual question, re-ranked with the CrossEncoder bound to `cross_encoder`.
search(
    "How does SGD work?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
)