# Load data

In [None]:
import glob
import os

In [None]:
# Read every speech file of "Session 77 - 2022" into a list of records.
# The "country" value is the file name without its "_77_2022.txt" suffix.
# The paths are sorted so the row order (and every index derived from it
# later on) does not depend on the file system's listing order.
data = []
for n in sorted(glob.glob("un/TXT/Session 77 - 2022/*.txt")):
    # The context manager closes each file immediately instead of leaving
    # one open handle per speech behind.
    with open(n) as f:
        data.append({"country": os.path.basename(n).replace("_77_2022.txt", ""), "text": f.read()})

In [None]:
import pandas as pd

In [None]:
# Allow up to 500 characters per cell so a good part of each speech is shown.
pd.options.display.max_colwidth = 500

# One row per speech file, with the columns "country" and "text".
df = pd.DataFrame(data=data)
df

# Sentence segmentation

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Load the spaCy pipeline; in this notebook it is only used to split the
# speeches into sentences (`doc.sents`).
nlp = spacy.load("en_core_web_sm")

In [None]:
from tqdm.auto import tqdm

In [None]:
# Sentence-split every speech with spaCy and collect the whitespace-trimmed
# sentences of all speeches in one flat list (runs < 1min).
sentences = [
    str(sent).strip()
    for speech in tqdm(df["text"])
    for sent in nlp(speech).sents
]

In [None]:
# Peek at the first 20 sentences to check the segmentation quality.
sentences[:20]

In [None]:
# Total number of sentences extracted from all speeches.
len(sentences)

In [None]:
# Cache the sentences on disk, joined with the separator "@@@".
# The context manager makes sure the file is flushed and closed before the
# next cell reads it back.
with open("sentences.txt", "w") as f:
    f.write("@@@".join(sentences))

In [None]:
# Reload the cached sentences (split on the same "@@@" separator used when
# writing); the file handle is closed as soon as the text has been read.
with open("sentences.txt") as f:
    sentences = f.read().split("@@@")

# Encode sentences

In [None]:
!pip install sentence_transformers -U

In [None]:
from sentence_transformers import SentenceTransformer
# First embedding model; it encodes both the corpus sentences and, later,
# the search queries that are compared against them.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
!pip install huggingface_hub -U

In [None]:
# Embed every sentence with the first model, requesting normalized vectors.
# Expect a minute or two, depending on the CPU/GPU configuration.
sembeddings = model.encode(
    sentences,
    normalize_embeddings=True,
    show_progress_bar=True,
)

In [None]:
# Sanity check: number of embeddings (expected to equal len(sentences)).
len(sembeddings)

In [None]:
# Presumably (number of sentences, embedding dimension of the model).
sembeddings.shape

In [None]:
import numpy as np
# Persist the first model's embeddings; np.save opens and closes the file
# itself when it is given a path.
np.save("sentences-mqa.npy", sembeddings)

Many more models are available on Hugging Face.

Benchmark of models: https://huggingface.co/spaces/mteb/leaderboard

Search for all sentence similarity models: https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=trending

In [None]:
# New, much faster alternative using Model2Vec (new in 3.4);
# speedup ranges from 400x on CPU to 25x on GPU:
# model = SentenceTransformer("minishlab/potion-base-8M", device="cpu")

## Alternative model

In [None]:
# Second embedding model, loaded with default settings.
# Optional constructor arguments that are not used here:
#   truncate_dim=<dimensions>
#   backend="openvino"   (option for CPU)
model2 = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')

In [None]:
# Embed the same sentences with the second model, again normalized.
# Expect a minute or two, depending on the CPU/GPU configuration.
sembeddings2 = model2.encode(
    sentences,
    normalize_embeddings=True,
    show_progress_bar=True,
)

In [None]:
# Presumably (number of sentences, embedding dimension of the second model).
sembeddings2.shape

In [None]:
# Optional: quantize the embeddings to save space and gain performance.
# NOTE: binary_embeddings2 is only inspected here (its shape is displayed);
# the following cells keep saving and searching the unquantized sembeddings2.
from sentence_transformers.quantization import quantize_embeddings
binary_embeddings2 = quantize_embeddings(sembeddings2, precision="ubinary")
binary_embeddings2.shape

In [None]:
# Persist the second model's (unquantized) embeddings; np.save opens and
# closes the file itself when it is given a path.
np.save("sentences-mbread.npy", sembeddings2)

## One more alternative

In [None]:
# Third embedding model.
# NOTE(review): trust_remote_code=True presumably lets the model repository
# run its own Python code on this machine - confirm the source is trusted
# before executing this cell.
model3 = SentenceTransformer("NovaSearch/stella_en_1.5B_v5", trust_remote_code=True)

In [None]:
# Embed the same sentences with the third model, again normalized.
sembeddings3 = model3.encode(
    sentences,
    normalize_embeddings=True,
    show_progress_bar=True,
)

In [None]:
# Presumably (number of sentences, embedding dimension of the third model).
sembeddings3.shape

In [None]:
# Persist the third model's embeddings; np.save opens and closes the file
# itself when it is given a path.
np.save("sentences-stella.npy", sembeddings3)

# Retrieval

In [None]:
def search(query, text, corpus_embeddings, model, query_prompt_name=None, top=20):
    """Return the ``top`` corpus entries that are most similar to ``query``.

    Args:
        query: Query string; it is embedded with ``model.encode``.
        text: Sequence of corpus texts, aligned index by index with
            ``corpus_embeddings``.
        corpus_embeddings: One embedding per entry of ``text``. They are
            assumed to be normalized and produced by the same ``model``.
        model: Object that provides ``encode`` and ``similarity``
            (for example a ``SentenceTransformer``).
        query_prompt_name: Optional prompt name, forwarded to
            ``model.encode`` as ``prompt_name``.
        top: Maximum number of hits to return.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text`` and ``score``,
        ordered by descending score. Both columns exist even when there
        are no hits.

    Raises:
        ValueError: If ``text`` and ``corpus_embeddings`` differ in length;
            the returned texts could otherwise belong to other embeddings.
    """
    if len(text) != len(corpus_embeddings):
        raise ValueError(
            f"text has {len(text)} entries but corpus_embeddings has "
            f"{len(corpus_embeddings)}; they must be aligned one-to-one"
        )

    # Embed the query the same way the corpus was embedded (normalized).
    question_embedding = model.encode(query, normalize_embeddings=True, prompt_name=query_prompt_name)

    # Similarity of the single query against every corpus row. The result is
    # moved to the CPU first so that it can always be converted to NumPy.
    # Alternative: sim = np.dot(corpus_embeddings, question_embedding)
    sim = model.similarity(question_embedding, corpus_embeddings)[0].cpu().numpy()

    # Indices of the highest scores, best first.
    best = sim.argsort()[::-1][:top]
    hits = [{"text": text[i], "score": sim[i]} for i in best]

    # Explicit columns keep the frame's schema stable for an empty result.
    return pd.DataFrame(hits, columns=["text", "score"])

In [None]:
# Show the full text of every hit: None is the documented value for
# "no column width limit".
pd.set_option('display.max_colwidth', None)

In [None]:
# Baseline: run the query against the first model's embeddings.
search("The climate crisis is worse for poorer countries.", sentences, sembeddings, model)

In [None]:
# Same query against the second model's embeddings, without a query prompt.
search("The climate crisis is worse for poorer countries.", sentences, sembeddings2, model2)

In [None]:
# Same query and embeddings, but the query is now encoded with the prompt
# named "query" (forwarded to model2.encode as prompt_name) - presumably a
# retrieval prompt defined by the model; TODO confirm.
search("The climate crisis is worse for poorer countries.", sentences, sembeddings2, model2,
       query_prompt_name="query")

In [None]:
# Same query against the third model's embeddings, without a query prompt.
search("The climate crisis is worse for poorer countries.", sentences, sembeddings3, model3)

In [None]:
# Same query and embeddings, but the query is now encoded with the prompt
# named "s2p_query" (forwarded to model3.encode as prompt_name) - presumably
# a retrieval prompt defined by the model; TODO confirm.
search("The climate crisis is worse for poorer countries.", sentences, sembeddings3, model3, 
       query_prompt_name="s2p_query")