In this notebook I will use the CroissantLLM chat model to answer my questions.

In [None]:
from transformers import LlamaForCausalLM, AutoTokenizer, set_seed
import torch

In [None]:
# Seed the random number generators through transformers' `set_seed`; generation
# below samples (`do_sample=True`), so a fixed seed keeps runs comparable.
set_seed(42)

In [None]:
# Identifier of the chat checkpoint used throughout the notebook.
model_name = "croissantllm/CroissantLLMChat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights in float16. device_map="auto" leaves device placement to the
# library, and "offload" is the folder name passed for any offloaded weights.
model = LlamaForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto", offload_folder="offload")

In [None]:
from src.retriever.retrieval import HybridRetriever
# Identifiers handed to HybridRetriever in the next cell.
# NOTE(review): presumably `model_id` is the embedding model and `spacy_model`
# the spaCy pipeline for French text — confirm in src.retriever.retrieval.
model_id = "camembert-base"
spacy_model = 'fr_core_news_md'

In [None]:
# Build the retriever whose `run(question)` result supplies the context documents for each prompt.
retriever = HybridRetriever(model_id=model_id, spacy_model=spacy_model)

In [None]:
# French questions used to exercise the retrieval + generation pipeline.
questions = ["Quand l’Ordonnance présidentielle a-t-elle été lue sur le plateau de la Radiotélévision nationale congolaise (RTNC)?",
             "Qui a été nommé pour remplacer Emmanuel Ramazani Shadary au poste de vice-Premier ministre et ministre de l’Intérieur et sécurité?",
             "Où et quand Henri Mova Sakanyi est-il né?",
             "Quelle est la carrière politique de Henri Mova Sakanyi en République démocratique du Congo?",
             "Quel est le poste actuel de Henri Mova Sakanyi au sein du Parti du peuple pour la Reconstruction et la Démocratie (PPRD)?"]

In [None]:
# Jinja-style prompt skeleton: loops over `documents`, emitting each
# `document.content`, then appends the question.
# NOTE(review): no later cell visible here reads this notebook-level `template`;
# `generate_chat_input` defines its own prompt text. Confirm whether it is still needed.
template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

In [None]:
from jinja2 import Template

In [None]:
def generate_chat_input(query:str, documents:list) -> list:
    """Build the chat messages (system + user) for a retrieval-augmented question.

    The retrieved documents and the question are rendered into a Jinja2 prompt
    that becomes the user message; a fixed system message precedes it.

    Args:
        query (str): The question to answer.
        documents (list): Retrieved passages. Each item is rendered with
            ``{{ document }}``, i.e. through its string form.

    Returns:
        list: Two ``{"role": ..., "content": ...}`` dicts, the system message
        followed by the user message, in the form later passed to
        ``tokenizer.apply_chat_template``.
    """
    prompt_template  = """
    Given the following information Context:
        {% for document in documents %}
            {{ document }}
        {% endfor %}
    answer the question : {{question}} in French.
    Answer:
    """
    # Named `prompt_renderer` rather than `template` so the notebook-level
    # `template` string is not shadowed inside this function.
    prompt_renderer = Template(prompt_template)
    prompt = prompt_renderer.render(documents=documents, question=query)

    return [
      {"role": "system", "content": "You answer questions about news in Democratic Republic of the Congo in French."},
      {"role": "user", "content": prompt},
    ]

In [None]:
def parse_response(text):
    """Extract the assistant's reply from decoded model output.

    The reply is the text that follows the ``<|im_start|> assistant`` marker, cut
    at the first ``<|im_end|>`` marker. Either marker may be missing: without the
    start marker the whole text is treated as the reply (this is the case when
    only the newly generated tokens were decoded), and without the end marker
    the reply runs to the end of the text.

    Args:
        text (str): Decoded model output.

    Returns:
        str: The reply text, with surrounding whitespace left untouched.
    """
    start_marker = "<|im_start|> assistant"
    end_marker = "<|im_end|>"

    segments = text.split(start_marker)
    # Indexing [1] unconditionally raised IndexError when the start marker was
    # absent; fall back to the full text in that case.
    reply = segments[1] if len(segments) > 1 else text
    return reply.split(end_marker)[0]

In [None]:
def generate_answer(chat_input:str) -> str:
    """Generate a completion for an already-templated chat prompt.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        chat_input (str): Prompt text, tokenized here as-is.

    Returns:
        str: The decoded newly generated tokens only; the prompt tokens are
        sliced off before decoding.
    """
    sampling_config = dict(
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.90,
        top_k=40,
        repetition_penalty=1.05,
        # NOTE(review): 32000 is presumably the id of the chat end-of-turn
        # token — confirm against the tokenizer vocabulary.
        eos_token_id=[tokenizer.eos_token_id, 32000],
    )

    encoded = tokenizer(chat_input, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    output_ids = model.generate(**encoded, **sampling_config)
    # Keep only what comes after the prompt in the first returned sequence.
    completion_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids)

In [None]:
from unicodedata import normalize

In [None]:
# Display the second question (index 1).
questions[1]

In [None]:
for question in questions[:1]:
    # Retrieve the context documents for the current question.
    documents = retriever.run(question)
    # Build the chat messages and render them to one prompt string, asking the
    # chat template to append the generation prompt.
    chat_input = tokenizer.apply_chat_template(
        generate_chat_input(question, documents),
        tokenize=False,
        add_generation_prompt=True,
    )
    print(chat_input)
    answer = generate_answer(chat_input)
    # parse_response(answer) is left disabled here, so the raw decoded text is printed.
    print(f"the answer to the question {question} is: __ \n {answer}")

    print("-" * 50)

The model still hallucinates; it may need some fine-tuning, but let's move on to its deployment.

I have managed to run the model. Now let me think about the deployment: for this I will use either llama.cpp or Triton Inference Server.

 https://github.com/ggerganov/llama.cpp?tab=readme-ov-file

In [None]:
# Number of entries in the tokenizer vocabulary.
len(tokenizer.vocab)

In [None]:
# Keep the tokenizer's vocabulary mapping (inverted as token -> id in the next cell).
tokens_to_id = tokenizer.vocab

In [None]:
# Invert the vocabulary so a token can be looked up from its id.
id_to_token = {
    token_id: token
    for token, token_id in tokens_to_id.items()
}

In [None]:
# Sanity check: the inverted mapping has as many entries as the original,
# i.e. no two tokens collapsed onto the same id.
assert len(id_to_token) == len(tokens_to_id)

### testing the tokenizer

In [None]:
from transformers import LlamaTokenizer

### Code for prediction

Below is the code that calls the llama server API to get the message.

In [None]:
import json


def encode_header(message):
    """Return the header string for a chat message.

    The header is the message's ``role`` wrapped between the
    ``<|start_header_id|>`` and ``<|end_header_id|>`` markers, followed by two
    newlines. Only ``message["role"]`` is read.

    NOTE(review): these markers look like the Llama 3 chat format, which may
    differ from what the tokenizer loaded earlier in the notebook expects —
    confirm.
    """
    return "<|start_header_id|>{}<|end_header_id|>\n\n".format(message["role"])


def encode_message(message):
    """Return one chat message as text.

    The result is the header from ``encode_header``, then the message content
    with surrounding whitespace stripped, then the ``<|eot_id|>`` marker.
    """
    header = encode_header(message)
    return "".join([header, message["content"].strip(), "<|eot_id|>"])


def encode_dialog_prompt(dialog):
    """Return a whole dialog as a single prompt string.

    The prompt starts with ``<|begin_of_text|>``, contains every message of
    ``dialog`` encoded in order, and ends with an empty assistant header so
    that the model's continuation is the assistant's reply.
    """
    pieces = ["<|begin_of_text|>"]
    pieces.extend(encode_message(turn) for turn in dialog)
    # The trailing assistant header has no content and no end marker.
    pieces.append(encode_header({"role": "assistant", "content": ""}))
    return "".join(pieces)




In [None]:




# Completion endpoint of the locally running llama server.
url = 'http://localhost:8001/completion'

# The request headers are defined in the cell that builds and sends the payload.


In [None]:
batched_prompts = []

# Retrieve the context for the same question that is being asked. Reusing the
# `documents` left over from the loop above would pair this question with
# passages retrieved for a different one (that loop only ran on questions[:1]).
dialog_question = questions[1]
dialog_documents = retriever.run(dialog_question)
dialog = generate_chat_input(dialog_question, dialog_documents)
batched_prompts.append(dialog)

In [None]:
# Display the collected chat messages.
batched_prompts

In [None]:
# Content of the user message (index 1) of the first dialog.
batched_prompts[0][1]["content"]

In [None]:
import requests

In [None]:
# Display the tokenizer's end-of-sequence token.
tokenizer.eos_token

In [None]:
headers = {
    'Content-Type': 'application/json',
}
# Sampling settings mirror the ones passed to `model.generate` earlier in the notebook.
# NOTE(review): only the user message text is sent; the system message in
# batched_prompts[0][0] and the chat template are not applied — confirm this is intended.
data = {
    "prompt": batched_prompts[0][1]["content"],
    "n_predict": 512,
    "temperature": 0.3,
    "top_k": 40,
    "top_p": 0.90,
    "repeat_penalty": 1.05,
    "stop": [],
}
# `stopped_eos` is not sent: the server reports it in the response; it is not a request option.

json_data = json.dumps(data)

# Send the POST request. The (connect, read) timeout keeps the cell from hanging
# forever if the server is down or stalls; generation can be slow, hence the
# long read timeout.
response = requests.post(url, headers=headers, data=json_data, timeout=(10, 600))
# Fail here on an HTTP error instead of later when the body is parsed.
response.raise_for_status()

In [None]:
# Show the prompt text that was placed in the request payload.
print(data["prompt"])

In [None]:
# Full JSON body returned by the server.
response.json()

In [None]:
# Print the `content` field of the response — presumably the generated text.
print(response.json()["content"])