# Generation

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Checkpoint identifier of the instruction-tuned model used for generation.
gen_model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

# Load the causal LM in bfloat16 onto the GPU with the FlashAttention-2 kernel.
gen_model = AutoModelForCausalLM.from_pretrained(
    gen_model_name,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2',
    low_cpu_mem_usage=True,
)

In [None]:
# Load the tokenizer from the same checkpoint name as the generation model.
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)

In [None]:
# Put the model into evaluation mode; as the last expression of the cell,
# the returned value is also displayed as the cell output.
gen_model.eval()

In [None]:
# Display the chat template stored on the tokenizer (used by apply_chat_template below).
gen_tokenizer.chat_template

In [None]:
# Example conversation: a German system prompt followed by one user question.
messages = [
    {"content": "Du bist ein hilfreicher Assistent.", "role": "system"},
    {"content": "Erkläre den Heise Zeitschriftenverlag!", "role": "user"},
]

In [None]:
# Render the conversation with the chat template (including the generation
# prompt) and tokenize it. The tensor is moved to the device the model lives
# on instead of a hard-coded "cuda", so the cell keeps working if the model's
# device placement changes.
inputs = gen_tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(gen_model.device)

In [None]:
# Decode the token ids back to text to inspect the fully formatted prompt.
print(gen_tokenizer.batch_decode(inputs)[0])

In [None]:
# Sampling is stochastic: seed torch's random number generator first so that
# "Restart Kernel -> Run All" reproduces the same completion.
torch.manual_seed(42)

# Sample up to 512 new tokens with temperature / top-k / top-p filtering.
outputs = gen_model.generate(
    inputs,
    max_new_tokens=512,
    use_cache=True,
    do_sample=True,
    temperature=0.7,
    top_k=25,
    top_p=0.8,
)

In [None]:
# Decode the first returned sequence (batch_decode does not strip special tokens here).
print(gen_tokenizer.batch_decode(outputs)[0])

# Retrieval

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# Bi-encoder used to embed queries for retrieval; moved to the GPU after loading.
model_name = "Snowflake/snowflake-arctic-embed-l-v2.0"
# NOTE(review): trust_remote_code=True permits running code shipped in the model
# repository — confirm it is required for this checkpoint and that the source is trusted.
model = SentenceTransformer(model_name, trust_remote_code=True).cuda()

In [None]:
import numpy as np
# Read the precomputed embedding array from disk.
# presumably one row per entry of the sentence list loaded below — verify against the file
with open("llm-abstract-sentences-saev2.npy", "rb") as npy_file:
    embeddings = np.load(npy_file)

In [None]:
import json
import lzma
# Parse the xz-compressed JSON file holding the sentence records.
with lzma.open("llm-abstract-sentences.json.xz", "rt") as json_file:
    es = json.load(json_file)

In [None]:
# Build the searchable passages as "<title>: <text>", one per record in `es`.
sentences = [
    record["title"] + ": " + record["text"]
    for record in es
]

In [None]:
# Cross-encoder used by search() to re-rank the bi-encoder hits.
from sentence_transformers import CrossEncoder, util
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
import numpy as np
import pandas as pd
def search(query, text, embeddings, bi_encoder, cross_encoder, top=100, top_rerank=20):
    """Retrieve passages for ``query`` with a bi-encoder, then re-rank with a cross-encoder.

    Parameters
    ----------
    query : str
        The search query.
    text : sequence
        Passages; ``text[i]`` must correspond to row ``i`` of ``embeddings``.
    embeddings : array-like
        Precomputed passage embeddings handed to ``bi_encoder.similarity``.
    bi_encoder
        Object providing ``encode(query, ...)`` and ``similarity(a, b)``.
    cross_encoder
        Object providing ``predict(pairs)`` that returns one score per
        ``[query, passage]`` pair.
    top : int, default 100
        Number of best bi-encoder hits passed on to the re-ranker.
    top_rerank : int, default 20
        Number of re-ranked hits to return (20 was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per returned hit with the columns ``text``, ``score``
        (bi-encoder similarity) and ``cross-score``, ordered by
        ``cross-score`` descending.
    """
    question_embedding = bi_encoder.encode(query, normalize_embeddings=True, prompt_name="query")

    # Use the encoder that was passed in. The previous version called the
    # notebook-global `model` here, which ignored the `bi_encoder` argument.
    sim = bi_encoder.similarity(question_embedding, embeddings).flatten().numpy()

    # Indices of the `top` most similar passages, best first.
    best = sim.argsort()[::-1][:top]
    hits = [{"text": text[i], "score": sim[i]} for i in best]

    # Only the top bi-encoder hits are re-ranked (the cross-encoder is the slow part).
    cross_scores = cross_encoder.predict([[query, hit["text"]] for hit in hits])
    for hit, cross_score in zip(hits, cross_scores):
        hit["cross-score"] = cross_score

    # Re-sort by cross-score, best first.
    hits.sort(key=lambda hit: hit["cross-score"], reverse=True)

    return pd.DataFrame(hits[:top_rerank])

In [None]:
def rag(query, text, embeddings, bi_encoder, cross_encoder, gen_model, gen_tokenizer, top=100, top_rag=5):
    """Answer ``query`` with the generation model, grounded in retrieved passages.

    The query is run through ``search``; the ``top_rag`` best re-ranked
    passages are joined with newlines into a context that is placed in the
    user message, and the model generates greedily (``do_sample=False``).

    Parameters
    ----------
    query : str
        The question to answer.
    text, embeddings, bi_encoder, cross_encoder, top
        Forwarded unchanged to ``search``.
    gen_model
        Model providing ``generate(...)`` and a ``device`` attribute.
    gen_tokenizer
        Tokenizer providing ``apply_chat_template``, ``batch_decode`` and ``decode``.
    top_rag : int, default 5
        Number of retrieved passages placed in the context. ``search`` returns
        at most 20 hits by default, so larger values are capped there.

    Returns
    -------
    tuple
        ``(answer, res)``: the decoded assistant turn (special tokens are not
        stripped) and the DataFrame returned by ``search``.
    """
    res = search(query, text, embeddings, bi_encoder, cross_encoder, top)
    context = "\n".join(res["text"].map(str).values[0:top_rag])
    messages = [{ "content": """You are an assistant which answers questions only based on the context.
                            If the answer is not in the context, say that you can't answer the question.
                            Use correct scientific terms.""", 
              "role": "system" }, 
            { "content": f"Answer the question '{query}' based on the context: {context}", 
              "role": "user" },
            ]
    # Follow the model's device instead of assuming "cuda".
    inputs = gen_tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(gen_model.device)
    outputs = gen_model.generate(inputs, max_new_tokens=512, use_cache=True, do_sample=False)
    gen = gen_tokenizer.batch_decode(outputs)[0]

    # The generation prompt is the last assistant marker in the decoded text.
    # Searching from the right avoids cutting at a marker that happens to
    # appear inside the query or the retrieved context.
    marker = "<|im_start|>assistant"
    start = gen.rfind(marker)
    if start == -1:
        # The chat template does not use this marker (find() returning -1
        # previously sliced off everything but the last character): decode
        # only the tokens generated after the prompt.
        answer = gen_tokenizer.decode(outputs[0][inputs.shape[-1]:])
    else:
        answer = gen[start:]
    return answer, res

In [None]:
# Generic question: show the generated answer; `df` keeps the retrieved hits for inspection.
res, df = rag(
    "How long do I train an LLM?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
    gen_model=gen_model,
    gen_tokenizer=gen_tokenizer,
)
print(res)

In [None]:
# Question about one specific model: show the generated answer; `df` keeps the retrieved hits.
res, df = rag(
    "How long was Llama 3.2 trained?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
    gen_model=gen_model,
    gen_tokenizer=gen_tokenizer,
)
print(res)

In [None]:
# Conceptual question: show the generated answer; `df` keeps the retrieved hits.
res, df = rag(
    "How does SGD work?",
    text=sentences,
    embeddings=embeddings,
    bi_encoder=model,
    cross_encoder=cross_encoder,
    gen_model=gen_model,
    gen_tokenizer=gen_tokenizer,
)
print(res)