## Inference with the LLama Model

In this notebook, we call the CroissantLLM chat model to generate answers to our questions. We compare two approaches: the first calls the model directly through the `transformers` library, and the second sends requests to the llama.cpp server API.


In [None]:
from transformers import LlamaForCausalLM, AutoTokenizer, set_seed
import torch

In [None]:
# Seed the random number generators up front; generation below samples
# (do_sample=True), so this is what keeps repeated runs comparable.
set_seed(42)

In [None]:
# Checkpoint identifier shared by the tokenizer and the model weights.
model_name = "croissantllm/CroissantLLMChat-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Half-precision weights, automatic device placement, and an "offload" folder
# handed to from_pretrained for anything that has to be offloaded.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload",
)

In [None]:
from src.rag.components.retriever import HybridRetriever
# Settings forwarded to HybridRetriever as its `model_id` and `spacy_model`
# keyword arguments in the cells that construct it.
model_id = "camembert-base"
spacy_model = 'fr_core_news_md'

In [None]:
# Retriever whose .run(question) supplies the context documents for each prompt.
retriever = HybridRetriever(model_id=model_id, spacy_model=spacy_model)

In [None]:
# French evaluation questions; every section below indexes into or iterates
# over this list.
questions = ["Quand l’Ordonnance présidentielle a-t-elle été lue sur le plateau de la Radiotélévision nationale congolaise (RTNC)?",
             "Qui a été nommé pour remplacer Emmanuel Ramazani Shadary au poste de vice-Premier ministre et ministre de l’Intérieur et sécurité?",
             "Où et quand Henri Mova Sakanyi est-il né?",
             "Quelle est la carrière politique de Henri Mova Sakanyi en République démocratique du Congo?",
             "Quel est le poste actuel de Henri Mova Sakanyi au sein du Parti du peuple pour la Reconstruction et la Démocratie (PPRD)?"]

In [None]:
# NOTE(review): no later cell shown in this notebook reads this module-level
# `template`; generate_chat_input builds its own prompt string and binds a
# local `template`. This version also renders `document.content`, whereas that
# function renders `document` directly. Candidate for removal — confirm that
# nothing outside this notebook relies on it.
template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

In [None]:
from jinja2 import Template

### Using The Transformer Model

In [None]:
def generate_chat_input(query:str, documents:list) -> list:
    """Build the chat messages (system + user) for one question.

    The context documents and the question are rendered into a single
    "Context / Question / Answer" prompt, which becomes the user message.

    Args:
        query (str): the question to answer.
        documents (list): context items; each one is interpolated into the
            prompt as-is through ``{{ document }}``.

    Returns:
        list: two ``{"role": ..., "content": ...}`` dicts, system message
        first. The calling cells pass this list to
        ``tokenizer.apply_chat_template``.
    """
    # The leading indentation inside the literal is part of the rendered prompt.
    prompt_template = """
        Context:
        {% for document in documents %}
            {{ document }}
        {% endfor %}

        Question: {{question}}
        Answer:
        """
    template = Template(prompt_template)
    prompt = template.render(documents=documents, question=query)

    chat_input = [
        {"role": "system", "content": "Given the Context:, answer the question in french."},
        {"role": "user", "content": prompt},
    ]

    return chat_input

In [None]:
def parse_response(text):
    """Extract the assistant's reply from decoded model output.

    If the ``<|im_start|> assistant`` marker is present, the reply is the text
    following its first occurrence; otherwise the whole input is treated as the
    reply. In both cases the result is cut at the first ``<|im_end|>`` tag.
    No whitespace is stripped.

    Args:
        text (str): decoded model output, with or without the chat tags.

    Returns:
        str: the reply text without the surrounding chat tags.
    """
    assistant_marker = "<|im_start|> assistant"
    end_marker = "<|im_end|>"

    segments = text.split(assistant_marker)
    # generate_answer decodes only the newly generated tokens, so the assistant
    # marker (part of the prompt) may be missing; indexing [1] unconditionally
    # would raise IndexError in that case.
    reply = segments[1] if len(segments) > 1 else segments[0]
    return reply.split(end_marker)[0]

In [None]:
def generate_answer(chat_input:str) -> str:
    """Run the model on an already formatted prompt and return its completion.

    Args:
        chat_input (str): prompt text; it is tokenized here and moved to the
            model's device.

    Returns:
        str: the decoded tokens produced after the prompt (the prompt tokens
        are sliced off before decoding).
    """
    encoded = tokenizer(chat_input, return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[-1]

    # Sampling configuration. Generation stops on the tokenizer's EOS id or on
    # id 32000 (presumably the chat end-of-turn token — TODO confirm).
    output_ids = model.generate(
        **encoded,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.90,
        top_k=40,
        repetition_penalty=1.05,
        eos_token_id=[tokenizer.eos_token_id, 32000],
    )

    # Keep only what was generated beyond the prompt.
    completion_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids)

In [None]:
from unicodedata import normalize

In [None]:
# Display the second question (the one reused in the llama API section).
questions[1]

In [None]:
# End-to-end run with the in-process model: retrieve context, build the chat
# messages, render them to a prompt string, generate, then extract the reply.
# Only the first question is processed (`questions[:1]`).
for question in questions[:1]:
    documents = retriever.run(question)
    chat_input = generate_chat_input(question, documents)
    # tokenize=False returns the prompt as text; add_generation_prompt=True is
    # requested so the prompt ends where the assistant's turn begins.
    chat_tokens = tokenizer.apply_chat_template(
        chat_input, tokenize=False, add_generation_prompt=True)
    answer = generate_answer(chat_tokens)
    answer = parse_response(answer)
    print("the answer to the question {} is: __ \n {}".format(question, answer))

    print(50 * "-")

In [None]:
# Inspect the rendered prompt string left over from the loop above.
chat_tokens

#### Hit the LLama API

Below is the code that calls the llama server API to get the completion.

In [None]:

# Completion endpoint of the llama server running locally on port 8001;
# generate_response_from_llama_api POSTs to this URL.
API_URL = 'http://localhost:8001/completion'

In [None]:
# Retrieve the context documents for the second question.
documents = retriever.run(questions[1])


In [None]:
# Build the system + user messages for that question and its documents.
chat_input = generate_chat_input(questions[1], documents)

In [None]:
# Render the messages to a plain prompt string (tokenize=False) rather than
# token ids.
chat_tokens = tokenizer.apply_chat_template(
    chat_input, tokenize=False, add_generation_prompt=True)

In [None]:
import requests
import json

In [None]:
# Show the tokenizer's EOS string; it is sent as one of the "stop" sequences
# in the API request below.
tokenizer.eos_token

In [None]:
def generate_response_from_llama_api(prompt:str, timeout:float=120.0) -> "str | None":
    """POST ``prompt`` to the llama server and return the generated text.

    The request goes to the module-level ``API_URL`` (read at call time) with
    fixed sampling settings. Failures are reported best-effort: the error is
    printed and ``None`` is returned instead of raising.

    Args:
        prompt (str): fully formatted prompt text.
        timeout (float): seconds to wait for the server before giving up, so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        str | None: the ``"content"`` field of the JSON response, or ``None``
        when the request fails, the server answers with an HTTP error status,
        or the response body does not have the expected shape.
    """
    payload = {
        "prompt": prompt,
        "n_predict": 128,
        "temperature": 0.3,
        "top_k": 40,
        "top_p": 0.90,
        "stopped_eos": True,
        "repeat_penalty": 1.05,
        "stop": ["assistant", tokenizer.eos_token],
        "seed": 42
    }

    try:
        # `json=` serialises the payload and sets the JSON Content-Type header.
        response = requests.post(API_URL, json=payload, timeout=timeout)
        # Surface 4xx/5xx responses as the real error instead of a confusing
        # missing-key failure further down.
        response.raise_for_status()
        return response.json()["content"]
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        # Network/HTTP problems, or a body that is not the expected JSON object.
        print(e)
        return None

In [None]:
# Same pipeline as the transformers section, but generation is delegated to
# the llama server: retrieve context, build messages, render the prompt string,
# then POST it. `answer` is None when the API call failed.
for question in questions:
    documents = retriever.run(question)
    chat_input = generate_chat_input(question, documents)
    prompt = tokenizer.apply_chat_template(
        chat_input, tokenize=False, add_generation_prompt=True)
    answer = generate_response_from_llama_api(prompt)
    print("the answer to the question {} is: __ \n {}".format(question, answer))

    print(50 * "-")

### Using the Class to generate the response

In [None]:
from src.rag.components.generator import LLamaCppGeneratorComponent
from src.rag.components.retriever import HybridRetriever

In [None]:
# NOTE(review): built with the same arguments as the earlier `retriever`; the
# existing instance could presumably be reused instead — confirm.
hybrid_retriever = HybridRetriever(model_id=model_id, spacy_model=spacy_model)

In [None]:
# Use the last question of the list for the component demo.
sample_question = questions[4]

In [None]:
# Context documents for the sample question (rebinds `documents`).
documents = hybrid_retriever.run(sample_question)

In [None]:
# Base URL of the llama server, handed to LLamaCppGeneratorComponent.
# NOTE(review): this rebinds API_URL, which earlier pointed at the /completion
# endpoint. generate_response_from_llama_api reads API_URL at call time, so
# re-running it after this cell would POST to the base URL instead. Consider a
# distinct name for one of the two values.
API_URL = 'http://localhost:8001'

In [None]:
# Generator component pointed at the llama server URL.
generator_component = LLamaCppGeneratorComponent(api_url=API_URL)

In [None]:
# Stop here unless the component's ping succeeds (presumably a server
# reachability check — confirm). Note that `_ping_api` is an underscore-prefixed
# (private) method of the component.
assert generator_component._ping_api()

In [None]:
# Generate the answer for the sample question from its retrieved documents.
answer = generator_component.run(sample_question, documents)

In [None]:
# Print the question/answer pair in the same format as the earlier sections.
print("the answer to the question {} is: __ \n {}".format(sample_question, answer))