In [None]:
import argilla as rg

In [None]:
import os

# Connect to the Argilla server. The URL and API key can be overridden through
# the ARGILLA_API_URL / ARGILLA_API_KEY environment variables so that real
# credentials never have to be written into the notebook; the fallbacks are
# the values this notebook originally hard-coded.
client = rg.Argilla(
    api_url=os.environ.get("ARGILLA_API_URL", "http://localhost:3000"),
    api_key=os.environ.get("ARGILLA_API_KEY", "argilla.apikey"),
)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Checkpoint id passed to `from_pretrained` for both the model and its tokenizer.
# NOTE: `model` and `tokenizer` are rebound to a different checkpoint further
# down in this notebook, so cells using these names depend on execution order.
generative_model_id = "JDBN/t5-base-fr-qg-fquad"
# device_map="auto" is forwarded to the loader — presumably it lets the library
# decide where the weights are placed (TODO confirm against the transformers docs).
model = T5ForConditionalGeneration.from_pretrained(
    generative_model_id, device_map="auto")
tokenizer = T5Tokenizer.from_pretrained(generative_model_id)

In [None]:
from random import randint

# Draw an integer in [1, 1000] (randint includes both ends); it is used below as an article id.
# NOTE(review): no seed is set here, so each run picks a different article, and
# this assumes ids 1..1000 all exist in the table — TODO confirm.
random_id = randint(1, 1000)

In [None]:
from src.retriever.database import execute_query, generate_database_connection

In [None]:
# Open a connection and fetch the `content` column of the article whose id is `random_id`.
# NOTE(review): the SQL text is assembled with an f-string. `random_id` is an int
# produced by `randint` in this notebook, but if `execute_query` supports bound
# parameters that form would be safer — TODO confirm its signature.
database_connection = generate_database_connection()
random_article = execute_query(
    database_connection, f'SELECT content FROM article WHERE id = {random_id}')

In [None]:
# Unwrap the `content` value: first column of the first returned row.
# This cell rebinds `random_article`, so it is guarded — running it a second
# time would otherwise index into the already-unwrapped text and keep only its
# first character. (Assumes the column value comes back as `str` — TODO confirm.)
if not isinstance(random_article, str):
    if len(random_article) == 0:
        # Same exception type as the bare indexing would raise, with a usable message.
        raise IndexError(f"no article found for id {random_id}")
    random_article = random_article[0][0]

In [None]:
random_article

In [None]:
# Prepend the instruction text to the article; this string is what gets tokenized for the T5 model.
# NOTE(review): presumably "generate question : " is the task prefix this checkpoint expects — TODO confirm.
model_input = f"generate question : {random_article}"

In [None]:
# Tokenize the prompt as a batch of one and move the tensors to the model's device.
encoded_input = tokenizer([model_input], return_tensors='pt').to(model.device)

# Sampling settings for this single generation call.
sampling_options = dict(do_sample=True, top_k=5, temperature=0.6, max_length=256)
generated_ids = model.generate(
    input_ids=encoded_input.input_ids,
    **sampling_options,
)

# Decode the first (and only) generated sequence, dropping special tokens.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
output

In [None]:
random_article

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Switch to the CroissantLLM chat checkpoint.
# NOTE: this rebinds `tokenizer` and `model`, replacing the T5 objects created
# earlier in the notebook; every later cell uses these new objects.
model_name = "croissantllm/CroissantLLMChat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are requested as float16; "offload" is passed as the offload folder.
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto", offload_folder="offload")

In [None]:
# Instruction for the chat model: ask for 5 French questions with answers about the sampled article.
# NOTE: rebinds `model_input`, which previously held the T5 prompt.
model_input = f"generate 5 french questions  and their answers based on the following text : {random_article}"

In [None]:
from transformers import set_seed
# Seed with 42 before the sampled (do_sample=True) generation in the next cell.
set_seed(42)

In [None]:
# Sampling configuration handed to `model.generate`.
generation_args = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.3,
    top_p=0.90,
    top_k=40,
    repetition_penalty=1.05,
    # Two end-of-sequence ids are supplied. NOTE(review): 32000 is presumably
    # the chat end-of-turn token id — TODO confirm against the tokenizer.
    eos_token_id=[tokenizer.eos_token_id, 32000],
)

# Single-turn conversation carrying the instruction built above.
chat = [{"role": "user", "content": model_input}]

# Render the conversation as prompt text (tokenize=False) with the generation prompt appended.
chat_input = tokenizer.apply_chat_template(
    chat,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize the rendered prompt, generate, and print the decoded first sequence
# (special tokens are kept in the printed text).
inputs = tokenizer(chat_input, return_tensors="pt").to(model.device)
tokens = model.generate(**inputs, **generation_args)

print(tokenizer.decode(tokens[0]))


At this point, we have a good French model that runs properly on CPU. It is slow, but it works. Let's now use this model to generate 5k questions that can be answered with our dataset.

In [None]:
def parse_response(text):
    """Return the assistant's reply extracted from a decoded chat transcript.

    The reply is the text that follows the first ``<|im_start|> assistant``
    marker, cut at the next ``<|im_end|>`` tag (or at the end of the text when
    there is no closing tag). Surrounding whitespace is left untouched and no
    question/answer splitting is done here.

    Args:
        text: Decoded model output that still contains the chat special tokens.

    Returns:
        The raw assistant segment as a single string.

    Raises:
        ValueError: If ``text`` contains no ``<|im_start|> assistant`` marker.
    """
    start_marker = "<|im_start|> assistant"
    end_marker = "<|im_end|>"

    _, found, after_marker = text.partition(start_marker)
    if not found:
        # Fail with an explicit message instead of an opaque IndexError.
        raise ValueError(
            f"no {start_marker!r} marker found in the decoded text")

    # Stop at a later assistant marker, if any, then at the closing tag.
    reply = after_marker.partition(start_marker)[0]
    return reply.partition(end_marker)[0]

In [None]:
# Decode the first generated sequence and keep only the assistant's part of it.
responses  = parse_response(tokenizer.decode(tokens[0]))

In [None]:
responses

In [None]:
import re


def extract_questions_and_responses(text):
    """Format the numbered question/answer pairs found in ``text``.

    A pair is a numbered question ending in ``?`` (``"1. ...?"``) followed by
    ``Réponse:`` and the answer. The answer runs until the next numbered item
    (``"<digits>. "``) or the end of the text, so answers may contain digits
    such as years or counts. ``Reponse`` without the accent and optional
    spaces around the colon are also accepted.

    Args:
        text: Assistant reply containing the numbered pairs.

    Returns:
        One ``"Question: ... \\n Réponse: ..."`` entry per pair, joined by blank
        lines; an empty string when no pair is found.
    """
    # Collapse runs of newlines so a pair is never split by blank lines.
    text = re.sub(r'\n+', '\n', text)
    pattern = re.compile(
        r'(\d+\.\s[^?]+?\?)\s+R[ée]ponse\s*:(.+?)(?=\s*\d+\.\s|$)',
        re.DOTALL)

    pairs = [(question.strip(), answer.strip())
             for question, answer in pattern.findall(text)]
    return "\n\n".join(
        f"Question: {question} \n Réponse: {answer}"
        for question, answer in pairs)


# Turn the assistant reply into one formatted "Question / Réponse" string.
questions_and_responses = extract_questions_and_responses(responses)

In [None]:
print(questions_and_responses)

In [None]:
# Text field named "content"; the record built later fills it with the article text.
paragraph_text = rg.TextField(name="content")

In [None]:
# Free-text question attached to the dataset settings (required, markdown enabled).
# NOTE(review): the title wording looks off — it probably means to ask whether the
# question/answer pairs are correct for the *content*; confirm before changing the string.
annotation_question = rg.TextQuestion(
    name="text",
    title="Are this the correct question_answer pairs based on the questions",
    description="Please provide feedback on the response",
    required=True,
    use_markdown=True
)

# Text field named "answers"; the record built later fills it with the generated pairs.
answers = rg.TextField(name="answers")

In [None]:
# Dataset settings: two text fields (article content, generated pairs) and the
# single free-text question defined above, plus the guideline text.
annotation_setting = rg.Settings(
    guidelines="Please provide feedback on the response",
    fields=[paragraph_text, answers],
    questions = [annotation_question]

)

In [None]:
# Dataset object named "annotation_dataset" in the "argilla" workspace, bound to
# the client and settings defined above.
annotation_dataset = rg.Dataset(
    name="annotation_dataset",
    settings=annotation_setting,
    client=client,
    workspace = "argilla"
)

In [None]:
# NOTE(review): presumably this registers the dataset on the Argilla server, in which
# case re-running the cell against an already existing dataset may fail — TODO confirm.
annotation_dataset.create()

In [None]:
# Keep a handle on the client's workspaces so the next cell can display them.
workspaces = client.workspaces

In [None]:
workspaces

In [None]:
# One record pairing the sampled article with the formatted question/answer string;
# the keys match the field names declared earlier ("content" and "answers").
record = rg.Record(
    fields={"content": random_article, "answers": questions_and_responses},
)

In [None]:
# Log the single record to the annotation dataset.
annotation_dataset.records.log([record])

### Using the whole dataset

In [None]:
def generate_prompt_template(content):
    """Build the chat-formatted prompt asking for 5 French Q/A pairs about ``content``.

    The instruction is wrapped in a single user turn and rendered with the
    notebook-level ``tokenizer``'s chat template as text (``tokenize=False``),
    with the generation prompt appended.

    Args:
        content: Text the questions should be based on.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for that turn.
    """
    instruction = f"generate 5 french questions , and for each question one answer in the format (question, answer) based on the following text : {content}"
    messages = [{"role": "user", "content": instruction}]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
from pathlib import Path

In [None]:
# Despite the name, this is the *parent* of the current working directory.
current_directory = Path.cwd().parent

In [None]:
# Parquet file with the subset of articles to label, located next to the notebook folder.
subset_path = current_directory / "subset_to_label.parquet"

In [None]:
from transformers import set_seed
# Seed again with 42 ahead of the batch generation further down.
set_seed(42)

In [None]:
from datasets import load_dataset

In [None]:
# Load the parquet subset as the "train" split.
dataset = load_dataset(
    "parquet",
    data_files={"train": str(subset_path)},
)

In [None]:
def _add_prompt_template(row):
    """Return the extra column holding the chat prompt built from the row's ``content``."""
    return {"prompt_template": generate_prompt_template(row["content"])}


# Row-by-row map (batched=False): every example gains a "prompt_template" column.
dataset = dataset.map(_add_prompt_template, batched=False)

In [None]:
# Output directory for the generated shards; created (with parents) when missing.
dataset_path = current_directory / "datasets" / "congo_news_qa"
dataset_path.mkdir(parents=True, exist_ok=True)

In [None]:

import json


def write_list_to_json_file(list_of_strings, file_path):
    """Write ``list_of_strings`` to ``file_path`` as indented JSON.

    The file is written as UTF-8 and non-ASCII characters are kept as-is
    (``ensure_ascii=False``), so accented French text stays readable in the
    file instead of being turned into ``\\uXXXX`` escapes. The content loads
    back to the same value either way.

    Args:
        list_of_strings: JSON-serialisable value to store (a list of strings here).
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(list_of_strings, f, indent=4, ensure_ascii=False)

In [None]:
def extract_question_and_answers(examples):
    """Generate question/answer text for a batch of prompts.

    Written for ``Dataset.map(..., batched=True)``. Uses the notebook-level
    ``tokenizer`` and ``model``.

    Args:
        examples: Batch mapping whose ``"prompt_template"`` entry is a list of
            chat-formatted prompt strings.

    Returns:
        The same ``examples`` mapping with a new ``"question_answers"`` entry:
        one decoded string per prompt, containing only the newly generated
        text (the prompt itself is already stored in ``"prompt_template"``).
    """
    generation_args = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.90,
        "top_k": 40,
        "repetition_penalty": 1.05,
        # NOTE(review): 32000 is presumably the chat end-of-turn token id — TODO confirm.
        "eos_token_id": [tokenizer.eos_token_id, 32000],
    }

    # The model is a causal LM, which continues from the last position of each
    # row. A padded batch therefore has to be padded on the left, otherwise the
    # shorter prompts would be continued after their pad tokens. The previous
    # setting is restored so other cells are not affected.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(examples["prompt_template"],
                           return_tensors="pt", padding=True).to(model.device)
    finally:
        tokenizer.padding_side = previous_padding_side

    outputs = model.generate(**inputs, **generation_args)

    # generate() returns prompt + continuation for each row; with left padding
    # every prompt occupies the first `prompt_length` positions, so slicing
    # them off leaves only the generated tokens.
    prompt_length = inputs["input_ids"].shape[1]
    examples["question_answers"] = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True)

    return examples

In [None]:
# Number of shards used by the batch loop below: 1000 // 60 == 16.
# NOTE(review): 1000 and 60 look like the subset's row count and a target shard
# size — TODO confirm and consider naming them as constants.
numbers_of_shards = 1000 // 60

In [None]:
numbers_of_shards

In [None]:
16 * 60

In [None]:
# Process the train split shard by shard, saving each finished shard to its own folder.
for shard_index in range(numbers_of_shards):
    shard_output_dir = dataset_path.joinpath(f"shard_{shard_index}")
    shard = (
        dataset["train"]
        .shard(num_shards=numbers_of_shards, index=shard_index)
        .map(extract_question_and_answers, batched=True, batch_size=2)
    )
    shard.save_to_disk(shard_output_dir)
    print("done processing shard ", shard_index)

### Lessons learned


In this section I discovered how powerful the CroissantLLM model can be on non-English tasks. It was able to generate questions in both French and English.

I also discovered a good annotation tool other than Prodigy: Argilla. It has the same capabilities as Prodigy.

T5 is also a good, powerful model for question answering.

However, annotation can take a huge amount of time, which is why I decided to focus my effort on deploying the Croissant model.