In [None]:
import numpy as np
import re
import pandas as pd
from pathlib import Path

In [None]:
from src.config import DOCUMENTS_DIR

In [None]:
# Directory (taken from src.config) that is scanned for markdown files below.
path = Path(DOCUMENTS_DIR)

In [None]:
# Read every markdown document once. Sorting makes the document order (and
# therefore every chunk index derived from it) stable across runs, and the
# explicit encoding avoids depending on the platform default.
texts = [
    filename.read_text(encoding="utf-8")
    for filename in sorted(path.glob("*.md"))
]
assert texts, f"no markdown files found in {path}"

# Preview only the beginning of the first document instead of dumping it whole.
texts[0][:500]

# Chunking

In [None]:
def parse_class(text):
    """Split a markdown document on every ``##`` marker.

    Returns a dict with two entries:
      - "title": the part of the first piece that precedes its first ``#``
        (an empty string when the text starts with ``#``);
      - "chunks": every piece produced by the split, the first one included.
    """
    pieces = text.split("##")
    heading, _, _ = pieces[0].partition("#")
    return {"title": heading, "chunks": pieces}

In [None]:
def parse_class_add_title(text):
    """Split a markdown document on ``##`` and prefix each piece with the title.

    The title is the part of the first piece that precedes its first ``#``
    (an empty string when the text starts with ``#``). Every piece, the first
    one included, is returned as ``"<title>: <piece>"``.

    Returns a dict with the "title" and the list of prefixed "chunks".
    """
    pieces = text.split("##")
    heading = pieces[0].partition("#")[0]
    labelled = []
    for piece in pieces:
        labelled.append(f"{heading}: {piece}")
    return {"title": heading, "chunks": labelled}

In [None]:
# Flatten the per-document chunk lists into one corpus. The nested
# comprehension is linear, whereas sum(..., []) re-copies the accumulated
# list for every document.
chunks = [
    chunk
    for txt in texts
    for chunk in parse_class_add_title(txt)["chunks"]
]

In [None]:
# Show a short sample rather than dumping the whole corpus into the output.
chunks[:5]

## Embedding

In [None]:
from FlagEmbedding import FlagModel

In [None]:
# Embedding model used for both the corpus chunks and the queries.
# NOTE(review): query_instruction_for_retrieval is presumably only applied by
# the query-specific encoding method, not by plain encode() — confirm against
# the FlagEmbedding documentation.
model = FlagModel(
    'BAAI/bge-base-en-v1.5',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

In [None]:
# Embed every chunk once; the result is reused by the similarity computations below.
corpus_embedding = model.encode(chunks)

In [None]:
# Example query used for a quick manual check of the retrieval.
queries = ["How to make a sum of tensor"]


In [None]:
# encode_queries() prepends the query_instruction_for_retrieval configured on
# the model; plain encode() embeds the raw text and ignores that instruction.
query_embedding = model.encode_queries(queries)

In [None]:
# Score of every query against every chunk: matrix product of the query
# embeddings with the transposed corpus embeddings (one row per query).
sim_scores = query_embedding @ corpus_embedding.T

In [None]:
# For every query, walk its five highest-scoring chunks (best first) and
# print those whose similarity exceeds 0.5.
for query, score in zip(queries, sim_scores):
    print(" ---- ")
    print("Query: ", query)
    best_first = np.argsort(score)[::-1][:5]
    print("Sources:")
    for position, idx in enumerate(best_first, start=1):
        if not score[idx] > .5:
            continue
        print(f"{position} -- similarity {score[idx]:.2f} -- \"", chunks[idx], '"')

# Retrieval evaluation

In [None]:
# Evaluation set: each question is paired by position with a substring that
# an acceptable chunk must contain.
list_question = ["How to make a sum of tensor", "what is burn"]
list_answer = ["fn add", "crate"]

## Mean reciprocal rank (MRR)

In [None]:
# One row per evaluation pair: the question and its expected answer substring.
df  = pd.DataFrame({"question": list_question, "answer": list_answer})

In [None]:
# Display the evaluation table.
df

In [None]:
# Embed the evaluation questions. encode_queries() prepends the
# query_instruction_for_retrieval configured on the model; plain encode()
# embeds the raw text and ignores that instruction.
query_embedding = model.encode_queries(list(df["question"]))
query_embedding

In [None]:
# For each expected answer, collect the indexes of all chunks that contain
# it as a substring; the outer list follows the row order of df.
acceptable_chunks = [
    {position for position, candidate in enumerate(chunks) if answer in candidate}
    for answer in df["answer"]
]
acceptable_chunks

In [None]:
def compute_mrr(sim_score, acceptable_chunks, max_rank=5):
    """Compute the mean reciprocal rank of the first acceptable chunk per query.

    Args:
        sim_score: 2-D array-like; row ``q`` holds the similarity of query
            ``q`` to every chunk (higher means more similar).
        acceptable_chunks: One collection of chunk indexes per query, aligned
            with the rows of ``sim_score``.
        max_rank: Ranks beyond this value contribute 0 to the score. The
            default of 5 matches the previously hard-coded ``r < 6`` cutoff.

    Returns:
        A dict with:
          - "score": mean over queries of ``1 / rank`` (0 for queries ranked
            beyond ``max_rank`` or without any acceptable chunk); ``0.0`` when
            there are no queries.
          - "ranks": per-query 1-based rank of the best-ranked acceptable
            chunk, or ``None`` when the query has no acceptable chunk.
    """
    ranks = []
    for this_score, this_acceptable_chunks in zip(sim_score, acceptable_chunks):
        # Chunk indexes ordered from most to least similar.
        ordered = np.argsort(this_score)[::-1]
        # A default of None avoids StopIteration when nothing is acceptable.
        rank = next(
            (
                position
                for position, idx in enumerate(ordered, start=1)
                if idx in this_acceptable_chunks
            ),
            None,
        )
        ranks.append(rank)

    reciprocal_ranks = [
        1 / r if r is not None and r <= max_rank else 0 for r in ranks
    ]
    return {
        # Guard the empty evaluation set instead of dividing by zero.
        "score": sum(reciprocal_ranks) / len(ranks) if ranks else 0.0,
        "ranks": ranks,
    }

In [None]:
# Similarity of each evaluation question to every chunk (one row per question).
sim_scores = query_embedding @ corpus_embedding.T

In [None]:
# Score the retrieval on the evaluation set and display the aggregate value.
res = compute_mrr(sim_scores, acceptable_chunks)
res["score"]

## Text generation

In [None]:
def get_context(query, corpus, corpus_embeddings, top_k=5):
    """Return the ``top_k`` corpus entries most similar to ``query``.

    Args:
        query: Query string; it is embedded with the module-level ``model``.
        corpus: Sequence of text chunks, indexed like the rows of
            ``corpus_embeddings``.
        corpus_embeddings: 2-D array of chunk embeddings, one row per entry
            of ``corpus``.
        top_k: Number of entries to return. The default of 5 matches the
            previously hard-coded value.

    Returns:
        A list of at most ``top_k`` entries of ``corpus``, ordered by
        increasing similarity (the best match is last).
    """
    if top_k <= 0:
        # A slice of [-0:] would select the whole corpus, so stop here.
        return []
    # encode_queries() prepends the retrieval instruction configured on the model.
    query_embedding = model.encode_queries([query])
    # Use the embeddings passed by the caller; the previous code ignored this
    # argument and silently read the global corpus_embedding instead.
    sim_scores = query_embedding @ corpus_embeddings.T
    indexes = np.argsort(sim_scores[0])[-top_k:]
    return [corpus[i] for i in indexes]

In [None]:
# Try the context retrieval on a sample query.
get_context("how to sum two tensor", chunks, corpus_embedding)

# SmolLM2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face checkpoint used as the text generator.
checkpoint = "HuggingFaceTB/SmolLM2-360M-Instruct"
# Alternative checkpoint, kept for reference:
# checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

device = "cpu"  # device the model is moved to; NOTE(review): presumably "cuda" selects a GPU — confirm

# Load the tokenizer and the causal language model for the checkpoint, then
# move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model_generator = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)