# Load data (from previous notebook)

In [None]:
# Read the whole corpus file and split it into sentences on the "@@@" delimiter.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving an open handle around for the lifetime of the kernel.
with open("sentences.txt") as f:
    sentences = f.read().split("@@@")

In [None]:
# Sanity check: show how many sentences the "@@@" split produced.
len(sentences)

In [None]:
import numpy as np
# Load the pre-computed sentence embeddings straight from the .npy file;
# np.load opens and closes the file itself when given a path.
sembeddings = np.load("sentences-mqa.npy")

# Retrieval

In [None]:
import numpy as np
import pandas as pd
def search(query, text, corpus_embeddings, bi_encoder, cross_encoder,
           query_prompt_name=None, top_k=100, top_n=20):
    """Retrieve candidates with a bi-encoder, then re-rank them with a cross-encoder.

    Args:
        query: The query string.
        text: Indexable collection of corpus texts; ``text[i]`` is paired with
            the ``i``-th similarity score, so it must be aligned with
            ``corpus_embeddings``.
        corpus_embeddings: Pre-computed corpus embeddings passed to
            ``bi_encoder.similarity`` (presumably produced by the same model
            as ``bi_encoder`` -- verify against the caller).
        bi_encoder: Model providing ``encode`` and ``similarity``
            (e.g. a ``SentenceTransformer``).
        cross_encoder: Model providing ``predict`` for a list of
            ``[query, text]`` pairs (e.g. a ``CrossEncoder``).
        query_prompt_name: Optional prompt name forwarded to
            ``bi_encoder.encode``.
        top_k: Number of bi-encoder candidates handed to the cross-encoder.
        top_n: Number of re-ranked rows to return (default 20).

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``text``, ``score``
        (bi-encoder similarity) and ``cross-score``, sorted by ``cross-score``
        in descending order. Empty (but with those columns) when there are no
        candidates.
    """
    # Encode the query; the cheap bi-encoder pass restricts the search space.
    question_embedding = bi_encoder.encode(
        query, normalize_embeddings=True, prompt_name=query_prompt_name
    )

    # Similarity of the query against every corpus entry. Use the bi-encoder
    # that was passed in, not a notebook-level global, so the similarity
    # function always belongs to the model that produced the query embedding.
    sim = bi_encoder.similarity(question_embedding, corpus_embeddings)[0].numpy()

    # Indices of the top_k most similar entries, best first.
    hits = [
        {"id": i, "text": text[i], "score": sim[i]}
        for i in sim.argsort()[::-1][:top_k]
    ]

    # Nothing to re-rank: skip the cross-encoder entirely.
    if not hits:
        return pd.DataFrame(columns=["id", "text", "score", "cross-score"])

    # Cross-encode only the shortlisted candidates (this takes most of the time).
    cross_scores = cross_encoder.predict([[query, hit["text"]] for hit in hits])

    # Attach each cross-encoder score to its candidate.
    for hit, cross_score in zip(hits, cross_scores):
        hit["cross-score"] = cross_score

    # Re-sort by cross-score, descending (stable, so ties keep bi-encoder order).
    hits.sort(key=lambda hit: hit["cross-score"], reverse=True)

    # Return the best top_n re-ranked results as a DataFrame.
    return pd.DataFrame(hits[:top_n])

In [None]:
# Bi-encoder: passed to search() to embed the query before it is compared
# against the pre-computed corpus embeddings.
# NOTE(review): presumably the model that produced sentences-mqa.npy -- confirm.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
# Cross-encoder: passed to search() to re-score the [query, text] pairs of the
# candidates shortlisted by the bi-encoder.
from sentence_transformers import CrossEncoder, util
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# Change the pandas column display width, presumably so the long sentence
# texts in the result DataFrames are not truncated.
# NOTE(review): pandas documents None as the "unlimited" value -- confirm 0
# behaves as intended with the installed pandas version.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Baseline run: multi-qa MiniLM bi-encoder with its embeddings, re-ranked by
# the ms-marco cross-encoder; no query prompt.
search("Is the climate crisis worse in poorer countries?", sentences, sembeddings, model, cross_encoder)

In [None]:
# Second bi-encoder (mixedbread) and the corpus embeddings stored for it.
# np.load opens and closes the file itself when given a path.
model2 = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
sembeddings2 = np.load("sentences-mbread.npy")

In [None]:
# Same query with the mixedbread bi-encoder and its embeddings; the query is
# encoded with the "query" prompt name.
search("Is the climate crisis worse in poorer countries?", 
       sentences, sembeddings2, model2, cross_encoder, query_prompt_name="query")

In [None]:
# Third bi-encoder (stella; loading it requires trust_remote_code) and the
# corpus embeddings stored for it.
# np.load opens and closes the file itself when given a path.
model3 = SentenceTransformer("NovaSearch/stella_en_1.5B_v5", trust_remote_code=True)
sembeddings3 = np.load("sentences-stella.npy")

In [None]:
# Same query with the stella bi-encoder and its embeddings; the query is
# encoded with the "s2p_query" prompt name.
search("Is the climate crisis worse in poorer countries?", 
       sentences, sembeddings3, model3, cross_encoder, query_prompt_name="s2p_query")

## Alternative cross-encoder

In [None]:
# Alternative re-ranker (mixedbread) to compare against the ms-marco cross-encoder.
cross_encoder2 = CrossEncoder('mixedbread-ai/mxbai-rerank-large-v1')

In [None]:
# Multi-qa MiniLM bi-encoder as before, but re-ranked with the alternative
# cross-encoder.
search("Is the climate crisis worse in poorer countries?", sentences, sembeddings, model, cross_encoder2)

In [None]:
# Stella bi-encoder (with the "s2p_query" prompt name) re-ranked with the
# alternative cross-encoder.
search("Is the climate crisis worse in poorer countries?", sentences, sembeddings3, model3, 
       cross_encoder2, query_prompt_name="s2p_query")