## Question Answering: Chat Bot
**Utilisation du modèle T5 de google**

- Le jeu de données contient 200 avis, questions et réponses collectés auprès de visiteurs à Marrakech

In [20]:
!pip install -U transformers



In [4]:
# Installation des librairies libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import logging
logging.basicConfig(level=logging.ERROR)

In [5]:
# Vérification de la disponibilité du GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [6]:
pip install datasets



### Conversion au format json

In [8]:
# chargement de la data set contenant les questions et réponses
file_path = "QA.xlsx"
df = pd.read_excel(file_path)


if 'Question' not in df.columns or 'Answer' not in df.columns:
    raise ValueError("The Excel file must have 'Question' and 'Answer' columns.")

# Conversion au format json
data = []
for _, row in df.iterrows():
    data.append({
        "instruction": row['Question'],
        "input": row['Review'],
        "output": row['Answer']
    })

# enrégistrement au format json
output_path = "qa_dataset.json"
with open(output_path, "w") as json_file:
    json.dump(data, json_file, indent=4)

print(f"Data saved to {output_path}")

Data saved to qa_dataset.json


In [41]:
df.head()

Unnamed: 0,Review,Question,Answer
0,You can’t go to Marrakech without going to Jem...,Can you describe the ambiance of Jemaa el-Fnaa?,It’s crazy and wild and loud and busy - it’s a...
1,You can’t go to Marrakech without going to Jem...,How would you characterize the atmosphere of J...,It’s crazy and wild and loud and busy - it’s a...
2,You can’t go to Marrakech without going to Jem...,What sets the experience of visiting Jemaa el-...,It’s hard to imagine until you are actually th...
3,"Haggle Haggle Haggle, that's what you must do ...",What is the key approach to shopping in this a...,"Haggle Haggle Haggle, that's what you must do ..."
4,"Haggle Haggle Haggle, that's what you must do ...",How would you describe the pricing culture in ...,nothing is expensive.. just make sure you neve...


In [9]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer
import torch
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from datasets import Dataset

# Lire le fichier JSON manuellement pour éviter les message d'erreus
with open("qa_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convertir la liste en Dataset Hugging Face
dataset = Dataset.from_list(data)


# Split train/val
train_test_split = dataset.train_test_split(test_size=0.1)
train_data = train_test_split['train']
val_data = train_test_split['test']

# train/test(val)
train_test_split = dataset.train_test_split(test_size=0.1)
train_data = train_test_split['train']
val_data = train_test_split['test']

# clé d'authentification permettant le chargement du modèle de google sur hugging face
auth_token = "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"
model_name = "google/flan-t5-small" # prendre le grand model si GPU disponible
model = T5ForConditionalGeneration.from_pretrained(model_name, use_auth_token=auth_token)
tokenizer = T5Tokenizer.from_pretrained(model_name, use_auth_token=auth_token)

# Tokenize Data
def preprocess_function(examples):
    inputs = [f"question: {q} answer:" for q, r in zip(examples["instruction"], examples["input"])] # les questions et le context
    targets = examples["output"] # les réponses
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length") # récupération des ids, les mask d'attention
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"] # récupération des ids des réponses
    return model_inputs

train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)

# Accuracy, precision, F-measure
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1).numpy()
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1}

# Definition des arguments
training_args = TrainingArguments(
    output_dir="fine_tuned_t5",
    save_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="steps",
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)





config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

  trainer = Trainer(


### Entrainement du model

In [10]:
# suppression de wandb pour éviter les messages introduisnt l'achat du GPU
import os
os.environ["WANDB_DISABLED"] = "true"
import wandb
wandb.init(mode="disabled")
# Train the Model
trainer.train()

# Save the Fine-Tuned Model
trainer.save_model("fine_tuned_t5")
tokenizer.save_pretrained("fine_tuned_t5")
print("Model saved to fine_tuned_t5/")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
50,21.8966,12.175153


Model saved to fine_tuned_t5/


In [11]:
# Chargement du fine-tuned model et le tokenizer
model = T5ForConditionalGeneration.from_pretrained("fine_tuned_t5") # model de google
tokenizer = T5Tokenizer.from_pretrained("fine_tuned_t5")

# Test tu tokenizer
input_text = "What can I visit in Morocco ?"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
print("Tokenized Input:", input_ids)

# Generate an answer using the fine-tuned model
output_ids = model.generate(input_ids)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Answer:", output_text)

Tokenized Input: tensor([[  363,    54,    27,   719,    16, 25559,     3,    58,     1]])
Generated Answer: Morocco


In [37]:
# paramétrage du type de réponses
def generate_answer(question, model, tokenizer, max_length=256, min_length=50):
    # input (questions)
    input_text = f"question: {question} answer:"
    # Tokenize input
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    # Generate l' output avec des séquences longues
    output_ids = model.generate(
        input_ids,
        max_length=128,   # permet au chat de générer plus de token (mots)
        min_length=64,
        num_beams=5,
        do_sample=True,
        temperature=0.3,   # paramètre qui permet de gérer l'allucination.
        top_p=1.,
        no_repeat_ngram_size=3
    )

    # Decode l'output
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer


In [40]:
question = "What can I visit in Marrakech?"
answer = generate_answer(question, model, tokenizer)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What can I visit in Marrakech?
Answer: Mozambique Museum of Contemporary Arts and Crafts (Morocco) Museum of Modern Art and Architecture (Mozambia) Museums of Contemporary Art and Design (Micropolitan Art) Museum (Madrid, Morocco) Museum and Art Gallery (Muzambica, Morocco


In [None]:
# Comme j'ai augmenter le max-length 128, c'est pour cela le chat à plus bavarder.
# l'allucination peut s'expliquer par le fait qu'il y a eu mois d'input et d'output lors d' l'entrainement