# Load data

Sätze extrahiert aus der UN-Generaldebatte 2023

In [None]:
import os
import urllib.request

# File name and URL
file_name = "sentences.txt"
url = "https://github.com/datanizing/m3-llm-workshop/raw/main/sentences.txt"

# Download the corpus only when it is not already present locally.
if not os.path.isfile(file_name):
    print(f"{file_name} does not exist. Downloading...")
    # Fetch into a temporary ".part" file and rename it only after the
    # download has completed, so an interrupted transfer can never leave a
    # truncated file_name behind that the isfile() check would later accept.
    partial_name = f"{file_name}.part"
    try:
        urllib.request.urlretrieve(url, partial_name)
        os.replace(partial_name, file_name)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        if os.path.exists(partial_name):
            os.remove(partial_name)
    print(f"Downloaded {file_name}.")
else:
    print(f"{file_name} already exists.")

In [None]:
# Read the whole file and split it into individual sentences on the "@@@"
# separator; the context manager closes the file handle deterministically.
with open("sentences.txt", encoding="utf-8") as corpus_file:
    sentences = corpus_file.read().split("@@@")

In [None]:
# Number of entries obtained by splitting the file on "@@@"
len(sentences)

# Encode sentences

Gutes Modell aus dem MTEB-Leaderboard heraussuchen: https://huggingface.co/spaces/mteb/leaderboard

In [None]:
from sentence_transformers import SentenceTransformer
# Candidate embedding models; exactly one assignment is left active.
#model_name = "jinaai/jina-embeddings-v3"                 # fastest
#model_name = "intfloat/multilingual-e5-large-instruct"   # somewhat slower
model_name = "Snowflake/snowflake-arctic-embed-l-v2.0"    # roughly as fast
#model_name = "infly/inf-retriever-v1-1.5b"               # about 3x slower

# Instantiate the selected model
model = SentenceTransformer(model_name)

In [None]:
# May take a while depending on the CPU/GPU configuration
sembeddings = model.encode(sentences, show_progress_bar=True)

In [None]:
import numpy as np
# Persist the embeddings computed with `model` in NumPy's .npy format.
# np.save opens the target itself when handed a path; the name already ends
# in ".npy", so no extension is appended.
np.save("sentences-saev2.npy", sembeddings)

In [None]:
# Inspect the shape of the embeddings computed with `model`
# (presumably (number of sentences, embedding dimension) - TODO confirm)
sembeddings.shape

In [None]:
# Second embedding model for comparison: all-mpnet-base-v2
model2 = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# somewhat faster
sembeddings2 = model2.encode(sentences, show_progress_bar=True)

In [None]:
# Persist the embeddings computed with `model2` in NumPy's .npy format.
# np.save opens the target itself when handed a path; the name already ends
# in ".npy", so no extension is appended.
np.save("sentences-mpnet.npy", sembeddings2)

In [None]:
# new, very fast alternative (new in 3.4; speedup 400x on CPU to 25x on GPU):
model3 = SentenceTransformer("minishlab/potion-base-8M", device="cpu")

In [None]:
# much faster
sembeddings3 = model3.encode(sentences, show_progress_bar=True)

In [None]:
# Persist the embeddings computed with `model3` in NumPy's .npy format.
# np.save opens the target itself when handed a path; the name already ends
# in ".npy", so no extension is appended.
np.save("sentences-potion.npy", sembeddings3)

# Retrieval

In [None]:
def search(query, text, corpus_embeddings, model, prompt_name="query", top=20):
    """Rank the corpus against a query and return the best matches.

    Args:
        query: Query passed to ``model.encode``.
        text: Indexable collection of corpus texts, aligned by position with
            ``corpus_embeddings``.
        corpus_embeddings: Embeddings of the corpus, compared against the
            query embedding via ``model.similarity``.
        model: Object providing ``encode`` and ``similarity``.
        prompt_name: Forwarded to ``model.encode`` as ``prompt_name``.
        top: Number of highest-scoring hits to keep.

    Returns:
        A ``pandas.DataFrame`` with one row per hit, holding the ``text`` and
        its similarity ``score``, ordered from highest to lowest score.
    """
    # Embed the query with the same model that produced the corpus embeddings.
    encoded_query = model.encode(query, prompt_name=prompt_name)

    # First row of the similarity result holds the scores for this query.
    scores = model.similarity(encoded_query, corpus_embeddings)[0].numpy()

    # argsort is ascending, so reverse it and keep the leading `top` indices.
    best_first = scores.argsort()[::-1][:top]

    rows = []
    for idx in best_first:
        rows.append({"text": text[idx], "score": scores[idx]})

    return pd.DataFrame(rows)

In [None]:
import pandas as pd
# NOTE(review): a max_colwidth of 0 is presumably meant to keep pandas from
# truncating long sentence text in displayed results - confirm against the
# pandas options documentation.
pd.set_option('display.max_colwidth', 0)

In [None]:
# Climate question against the embeddings from `model`
# (search() applies its default prompt_name="query" here).
search("Is the climate crisis worse for poorer countries?", 
       sentences, sembeddings, model)

In [None]:
# Same climate question against the embeddings from `model2`; prompt_name=None
# overrides search()'s default of "query" (presumably because this model
# defines no "query" prompt - TODO confirm).
search("Is the climate crisis worse for poorer countries?", 
       sentences, sembeddings2, model2, prompt_name=None)

In [None]:
# Same climate question against the embeddings from `model3`; prompt_name=None
# overrides search()'s default of "query" (presumably because this model
# defines no "query" prompt - TODO confirm).
search("Is the climate crisis worse for poorer countries?", 
       sentences, sembeddings3, model3, prompt_name=None)

In [None]:
# Ukraine question against the embeddings from `model`
# (search() applies its default prompt_name="query" here).
search("Is the war on Ukraine caused by Russia?", 
       sentences, sembeddings, model)

In [None]:
# Same Ukraine question against the embeddings from `model2`; prompt_name=None
# overrides search()'s default of "query" (presumably because this model
# defines no "query" prompt - TODO confirm).
search("Is the war on Ukraine caused by Russia?", 
       sentences, sembeddings2, model2, prompt_name=None)

In [None]:
# Same Ukraine question against the embeddings from `model3`; prompt_name=None
# overrides search()'s default of "query" (presumably because this model
# defines no "query" prompt - TODO confirm).
search("Is the war on Ukraine caused by Russia?", 
       sentences, sembeddings3, model3, prompt_name=None)

In [None]:
# German-language query about the climate crisis and poor countries, run
# against the embeddings from `model` with the default prompt_name="query".
search("Sind arme Länder durch die Klimakrise stärker betroffen?", 
       sentences, sembeddings, model)