In [None]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# Hugging Face model id of the embedding model used to encode search queries.
model_name = "Snowflake/snowflake-arctic-embed-l-v2.0"

# No explicit .cuda(): when no device is given, SentenceTransformer selects a
# GPU if one is available and otherwise falls back to the CPU, so this cell
# also runs on machines without CUDA instead of raising.
# NOTE: trust_remote_code=True permits custom Python code shipped with the
# model repository to be executed locally; keep it only for trusted repos.
model = SentenceTransformer(model_name, trust_remote_code=True)

In [None]:
import numpy as np

# Note from the author: the download is too large. The embeddings are read
# from a .npy file addressed relative to the current working directory.
with open("llm-abstract-sentences-saev2.npy", "rb") as npy_file:
    embeddings = np.load(npy_file)

In [None]:
import json
import lzma

# Read the xz-compressed JSON file in text mode. The encoding is passed
# explicitly because text mode otherwise falls back to the platform default,
# which is not UTF-8 on every system and would mis-decode non-ASCII text.
with lzma.open("llm-abstract-sentences.json.xz", "rt", encoding="utf-8") as f:
    # json.load parses straight from the file object.
    es = json.load(f)

In [None]:
# One searchable string per loaded entry, in the form "<title>: <text>".
sentences = [": ".join((entry["title"], entry["text"])) for entry in es]

# Retrieval

In [None]:
def search(query, text, embeddings, model, top=20):
    """Return the entries of ``text`` most similar to ``query`` as a DataFrame.

    Args:
        query: Search query; encoded via ``model.encode`` with the "query"
            prompt and normalized embeddings.
        text: Indexable collection of texts; position ``i`` is looked up for
            the ``i``-th similarity score (assumes it is aligned with
            ``embeddings`` -- TODO confirm against the caller).
        embeddings: Precomputed embeddings the query is compared against.
        model: Object providing ``encode`` and ``similarity`` (a
            SentenceTransformer in this notebook).
        top: Maximum number of hits to return.

    Returns:
        pandas.DataFrame built from one record per hit with the keys
        "text" and "score", ordered from highest to lowest score.
    """
    query_vector = model.encode(query, prompt_name="query", normalize_embeddings=True)

    # Similarity of the query to every stored embedding, as a flat NumPy array.
    scores = model.similarity(query_vector, embeddings).flatten().numpy()

    # Indices ordered from best to worst score, cut down to the first `top`.
    best = scores.argsort()[::-1][:top]

    # Collect the hits as records and present them as a DataFrame.
    records = []
    for idx in best:
        records.append({"text": text[idx], "score": scores[idx]})
    return pd.DataFrame(records)

In [None]:
# Set pandas' "display.max_colwidth" option to 0 -- presumably so the long
# result texts are shown without truncation (TODO confirm on the pandas
# version in use; the documented "no limit" value is None).
pd.options.display.max_colwidth = 0

In [None]:
# Example query: top hits (default top=20) about LLM training duration.
search(
    "How long do I train an LLM?",
    text=sentences,
    embeddings=embeddings,
    model=model,
)

In [None]:
# Example query naming a specific model.
search(
    "How long was Llama 3.2 trained?",
    text=sentences,
    embeddings=embeddings,
    model=model,
)

In [None]:
# Example query about an optimization method.
search(
    "How does SGD work?",
    text=sentences,
    embeddings=embeddings,
    model=model,
)