# Load data (from previous notebook)

In [None]:
# Read the corpus file and split it into text chunks on the "@@@" separator.
# A context manager is used so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("sentences.txt") as f:
    sentences = f.read().split("@@@")

In [None]:
# Sanity check: number of chunks produced by the "@@@" split.
len(sentences)

# Retrieval

In [None]:
import numpy as np
# Load the precomputed embedding matrix from disk.
# NOTE(review): presumably one row per entry of `sentences`, in the same order,
# and produced by the multi-qa MiniLM model loaded below — confirm against the
# notebook that wrote this file.
with open("sentences-mqa.npy", "rb") as f:
    sembeddings = np.load(f)

In [None]:
from sentence_transformers import SentenceTransformer
# Query encoder that is passed to search_semantic together with `sembeddings`.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
import pandas as pd
def search_semantic(query, text, corpus_embeddings, model, query_prompt_name=None, top=20):
    """Rank corpus entries by embedding similarity to a query.

    Parameters
    ----------
    query : str
        Query to encode with ``model.encode`` (normalized embeddings requested).
    text : sequence
        Corpus entries; ``text[i]`` is reported for the hit with id ``i``.
    corpus_embeddings
        Embeddings handed to ``model.similarity`` as the second argument.
        Assumed to be row-aligned with ``text`` — TODO confirm at the call site.
    model
        Object providing ``encode`` and ``similarity``.
    query_prompt_name : str or None
        Forwarded to ``model.encode`` as ``prompt_name``.
    top : int
        Maximum number of hits returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text`` and ``score``, best match first.
    """
    # Embed the query; normalization is requested from the encoder.
    query_vector = model.encode(
        query,
        normalize_embeddings=True,
        prompt_name=query_prompt_name,
    )

    # First row of the similarity result holds the scores of the single query.
    scores = model.similarity(query_vector, corpus_embeddings)[0].numpy()

    # Ascending argsort reversed -> indices of the highest scores first.
    best_first = scores.argsort()[::-1][:top]

    rows = []
    for idx in best_first:
        rows.append({"id": idx, "text": text[idx], "score": scores[idx]})

    return pd.DataFrame(rows)

In [None]:
import tantivy
# Declare the schema of the lexical index: an integer `id` and a `text` field,
# both stored so they can be read back from search hits.
schema_builder = tantivy.SchemaBuilder()
schema_builder.add_integer_field("id", stored=True)
schema_builder.add_text_field("text", stored=True)
schema = schema_builder.build()
# Bind the schema to the on-disk location "tantivy-index".
# NOTE(review): no documents are added in this notebook, so this presumably
# reuses an index populated by the previous notebook — confirm.
index = tantivy.Index(schema, "tantivy-index")

In [None]:
def search_lexical(query, index, top=20):
    """Run a lexical (full-text) query against the index.

    Parameters
    ----------
    query : str
        Query string; parsed with ``index.parse_query`` against the ``text`` field.
    index
        Index object providing ``searcher()`` and ``parse_query()``; its
        documents must expose ``id`` and ``text`` fields.
    top : int, default 20
        Maximum number of hits to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text`` and ``score``, one row per hit in the order
        the searcher returned them. When nothing matches, the frame is empty
        but still has these columns, so ``.set_index("id")`` keeps working.
    """
    searcher = index.searcher()
    # Separate name so the caller's query string is not shadowed.
    parsed_query = index.parse_query(query, ["text"])
    # Honour `top`: the limit was previously hard-coded to 20, which silently
    # capped every call regardless of the argument.
    hits = searcher.search(parsed_query, limit=top).hits

    rows = []
    for score, doc_address in hits:
        doc = searcher.doc(doc_address)
        # Field values are read with [0], i.e. the first value of each field.
        rows.append({"id": doc["id"][0], "text": doc["text"][0], "score": score})

    return pd.DataFrame(rows, columns=["id", "text", "score"])

In [None]:
# Example question used for the first semantic vs. lexical comparison below.
sentence = "Is the climate crisis worse in poorer countries?"

In [None]:
# Change the column-width display option so result texts are easier to read.
# NOTE(review): pandas documents None as the "no truncation" value; confirm
# that 0 produces the intended untruncated display on the pandas version in use.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Semantic hits for the example question, indexed by document id
# (rrf reads ids and rank positions from the frame index).
sdf = search_semantic(sentence, sentences, sembeddings, model).set_index("id")
sdf

In [None]:
# Lexical hits for the same question, indexed by document id like `sdf`.
ldf = search_lexical(sentence, index).set_index("id")
ldf

In [None]:
import numpy as np
def rrf(dataframes, k=60, text=None):
    """Fuse several ranked result lists with Reciprocal Rank Fusion.

    Every document receives ``sum(1 / (k + rank))`` over the result lists it
    appears in, where ``rank`` is its 1-based position in that list.

    Parameters
    ----------
    dataframes : sequence of pandas.DataFrame
        Result lists, best hit first, whose *index* holds the document ids.
    k : number, default 60
        Smoothing constant added to the rank (previously hard-coded to 60).
    text : sequence or mapping, optional
        Lookup from document id to text. Defaults to the notebook-level
        ``sentences`` list, which is what the function always used before.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``text``, ``score`` and ``rank`` (one entry per input
        list; ``None`` where the document is absent), sorted by descending
        score. Empty input yields an empty frame with these columns.
    """
    if text is None:
        text = sentences

    # Build one id -> rank lookup per result list up front, instead of
    # converting the index to a list and scanning it for every document.
    # setdefault keeps the first (best) position if an id occurs twice,
    # which matches what list.index() returned.
    rank_lookups = []
    for df in dataframes:
        ranks = {}
        for position, doc_id in enumerate(df.index, start=1):
            ranks.setdefault(doc_id, position)
        rank_lookups.append(ranks)

    all_ids = [doc_id for ranks in rank_lookups for doc_id in ranks]

    docs = []
    for i in np.unique(all_ids):
        s = 0
        rank = []
        for ranks in rank_lookups:
            position = ranks.get(i)
            if position is not None:
                s += 1.0 / (k + position)
            rank.append(position)
        docs.append({"id": i, "text": text[i], "score": s, "rank": rank})

    # Explicit columns keep sort_values working when there are no documents.
    fused = pd.DataFrame(docs, columns=["id", "text", "score", "rank"])
    return fused.sort_values("score", ascending=False)

In [None]:
# Fuse the semantic and lexical rankings of the example question.
rrf([sdf, ldf])

In [None]:
# Second encoder plus its precomputed embedding matrix.
# NOTE(review): "sentences-mbread.npy" is presumably the embedding of
# `sentences` produced with this same model — confirm against the notebook
# that wrote the file.
model2 = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
with open("sentences-mbread.npy", "rb") as f:
    sembeddings2 = np.load(f)

In [None]:
# Third encoder plus its precomputed embedding matrix.
# NOTE(review): trust_remote_code=True is understood to let the library run
# Python code shipped in the model repository — confirm the source is trusted
# and consider pinning a revision.
# NOTE(review): "sentences-stella.npy" is presumably the embedding of
# `sentences` produced with this same model — confirm.
model3 = SentenceTransformer("NovaSearch/stella_en_1.5B_v5", trust_remote_code=True)
with open("sentences-stella.npy", "rb") as f:
    sembeddings3 = np.load(f)

In [None]:
# Question used for the lexical leg of the multi-retriever fusion below.
# NOTE(review): "clima" looks like a typo for "climate"; an exact-term lexical
# search would then not match documents containing "climate" — confirm whether
# the misspelling is intentional.
question = "Is the clima crisis worse for poorer countries?"

In [None]:
# Lexical search for `question`, passing 200 as the `top` argument.
# Rebinds `ldf`, replacing the frame created for the earlier example.
ldf = search_lexical(question, index, 200).set_index("id")

In [None]:
# Semantic search with the first encoder. The query is phrased as a statement
# rather than as the question stored in `question`.
sdf1 = search_semantic("The climate crisis is worse in poorer countries", 
                       sentences, sembeddings, model).set_index("id")

In [None]:
# Same statement query with the second encoder and its embeddings.
# NOTE(review): "query" is presumably a prompt name defined by this model's
# configuration — confirm it exists for model2.
sdf2 = search_semantic("The climate crisis is worse in poorer countries", 
                       sentences, sembeddings2, model2, query_prompt_name="query").set_index("id")

In [None]:
# Same statement query with the third encoder and its embeddings.
# NOTE(review): "s2p_query" is presumably a prompt name defined by this
# model's configuration — confirm it exists for model3.
sdf3 = search_semantic("The climate crisis is worse in poorer countries", 
                       sentences, sembeddings3, model3, query_prompt_name="s2p_query").set_index("id")

In [None]:
# Fuse the lexical ranking with the three semantic rankings and show the
# 20 best-scoring documents.
rrf([ldf, sdf1, sdf2, sdf3]).head(20)