In [None]:
import argilla as rg

In [None]:
client = rg.Argilla(
    api_url = 'http://localhost:3000',
    api_key='argilla.apikey',
)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
generative_model_id = "JDBN/t5-base-fr-qg-fquad"
model = T5ForConditionalGeneration.from_pretrained(
    generative_model_id, device_map="auto")
tokenizer = T5Tokenizer.from_pretrained(generative_model_id)

In [None]:
from random import randint

random_id = randint(1, 1000)

In [None]:
from src.retriever.database import execute_query, generate_database_connection

In [None]:
database_connection = generate_database_connection()
random_article = execute_query(
    database_connection, f'SELECT content FROM article WHERE id = {random_id}')

In [None]:
random_article = random_article[0][0]

In [None]:
random_article

In [None]:
model_input = f"generate question : {random_article}"

In [None]:
encoded_input = tokenizer([model_input],
                              return_tensors='pt').to(model.device)
output = model.generate(input_ids=encoded_input.input_ids,
                        do_sample=True, 
                            top_k=5, 
                            temperature=0.6,
                            max_length=256)
output = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
output

In [None]:
random_article

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
model_name = "croissantllm/CroissantLLMChat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto", offload_folder="offload")



tokenizer_config.json:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/397M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
model_input = f"generate 5 french questions  and their answers based on the following text : {random_article}"

In [None]:
from transformers import set_seed
set_seed(42)

In [None]:
generation_args = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.3,
    "top_p": 0.90,
    "top_k": 40,
    "repetition_penalty": 1.05,
    "eos_token_id": [tokenizer.eos_token_id, 32000],
}

chat = [
    {"role": "user", "content": f"{model_input}"},
]

chat_input = tokenizer.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(chat_input, return_tensors="pt").to(model.device)
tokens = model.generate(**inputs, **generation_args)

print(tokenizer.decode(tokens[0]))


At this point, we have a good french model that can be run properly on CPU. It slow, but it works. Let now use this model to generate 5k questions that can be answered with our dataset.

In [None]:
def parse_response(text):
    """ Split the text inside the  <|im_start|> assistant <|im_end|> tags and then split the new line text  and return the pair question and response"""
   
    # use regex to get the text inside the <|im_start|> assistant <|im_end|> tags
    text = text.split("<|im_start|> assistant")[1].split("<|im_end|>")[0]
    return text

In [None]:
responses  = parse_response(tokenizer.decode(tokens[0]))

In [None]:
responses

In [None]:
import re


def extract_questions_and_responses(text):
    # Define a regex pattern to match questions and their responses
    text = re.sub(r'\n+', '\n', text)
    pattern = re.compile(
        r'(\d+\.\s[^?]+?\?)\sRéponse:\s([^0-9]+?)(?=\d+\.|$)', re.DOTALL)

    # Find all matches in the text
    matches = pattern.findall(text)

    # Create a list of tuples with (question, response)
    result = [(match[0].strip(), match[1].strip()) for match in matches]
    result = [
        f"Question: {question_answer[0]} \n Réponse: {question_answer[1]}" for question_answer in result]
    return  "\n\n".join(result)


# Extract the questions and responses
questions_and_responses = extract_questions_and_responses(responses)

In [None]:
print(questions_and_responses)

In [None]:
paragraph_text = rg.TextField(name="content")

In [None]:
annotation_question = rg.TextQuestion(
    name="text",
    title="Are this the correct question_answer pairs based on the questions",
    description="Please provide feedback on the response",
    required=True,
    use_markdown=True
)

answers = rg.TextField(name="answers")

In [None]:
annotation_setting = rg.Settings(
    guidelines="Please provide feedback on the response",
    fields=[paragraph_text, answers],
    questions = [annotation_question]

)

In [None]:
annotation_dataset = rg.Dataset(
    name="annotation_dataset",
    settings=annotation_setting,
    client=client,
    workspace = "argilla"
)

In [None]:
annotation_dataset.create()

In [None]:
workspaces = client.workspaces

In [None]:
workspaces

In [None]:
record = rg.Record(
    fields={"content": random_article, "answers": questions_and_responses},
)

In [None]:
annotation_dataset.records.log([record])

### Using the whole dataset

In [3]:
def generate_prompt_template(content):
  prompt = f"generate 5 french questions , and for each question one answer in the format (question, answer) based on the following text : {content}"
  chat = [
      {"role": "user", "content": prompt},
  ]
  chat_input = tokenizer.apply_chat_template(
      chat, tokenize=False, add_generation_prompt=True)
  return chat_input

In [4]:
from pathlib import Path

In [5]:
current_directory = Path().cwd().parent

In [6]:
subset_path = current_directory.joinpath("subset_to_label.parquet")

In [7]:
from transformers import set_seed
set_seed(42)

2024-07-17 06:29:32.131545: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-17 06:29:32.248679: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 06:29:32.248708: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 06:29:32.248715: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-17 06:29:32.302087: I tensorflow/core/platform/cpu_feature_g

In [8]:
from datasets import load_dataset

In [9]:
dataset = load_dataset("parquet", data_files={'train': subset_path.__str__()})

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
dataset = dataset.map(lambda x: {
                      "prompt_template": generate_prompt_template(x["content"])}, batched=False)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
dataset_path = current_directory.joinpath("datasets", "congo_news_qa")
dataset_path.mkdir(parents=True, exist_ok=True)

In [12]:

import json


def write_list_to_json_file(list_of_strings, file_path):
    with open(file_path, 'w') as f:
        json.dump(list_of_strings, f, indent=4)

In [13]:
def extract_question_and_answers(examples):
    """
    take a batch of example compute the embeddings and save the subset of the embeddings
    Add a new columns named embedding to the subsets of example and save the subset locally.
    """
    generation_args = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.90,
        "top_k": 40,
        "repetition_penalty": 1.05,
        "eos_token_id": [tokenizer.eos_token_id, 32000],
    }

    inputs = tokenizer(examples["prompt_template"],
                       return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(**inputs, **generation_args)
    questions_and_answers = tokenizer.batch_decode(
        outputs, skip_special_tokens=True)
    examples["question_answers"] = questions_and_answers
    
    return examples

In [14]:
numbers_of_shards = 1000 // 60

In [15]:
numbers_of_shards

16

In [16]:
16 * 60

960

In [None]:
for shard_index in range(0, numbers_of_shards):
    shard = dataset["train"].shard(num_shards=numbers_of_shards, index=shard_index)
    shard = shard.map(extract_question_and_answers, batched=True, batch_size=2)
    shard.save_to_disk(dataset_path.joinpath(f"shard_{shard_index}"))
    print("done processing shard ", shard_index)

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  0


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  1


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  2


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  3


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  4


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  5


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene

Saving the dataset (0/1 shards):   0%|          | 0/63 [00:00<?, ? examples/s]

done processing shard  6


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end gene