# Retrieval expérimentation

## 0. Setting up

### 0.1. Import

In [37]:
import os
import random 
import csv

In [38]:
import pandas as pd

In [39]:
from transformers import pipeline

### 0.2. Paths

In [40]:
# All dataset artifacts live under one data root, relative to the notebook.
_data_root = '../data'
_csv_root = f'{_data_root}/csv'

# Folder whose files are listed and read as chunk texts.
chunks_folder_path = f'{_data_root}/chunks'
# CSV template: sampled chunks with empty question / reference_answer columns.
chunks_dataset_csv_file_path = f'{_csv_root}/validation_template.csv'
# Presumably the destination of the auto-generated QA pairs (not used in the
# visible cells) - TODO confirm.
qa_dataset_csv_file_path = f'{_csv_root}/validation_auto_qg.csv'

## 1. Create a dataset for question-answering testing

### 1.1. A bunch of chunks dataset

#### commentaire : ressort la liste entière de tous mes chunks

In [41]:
# List the chunk files in a stable (sorted) order.
# os.listdir also returns sub-directories (e.g. a stray checkpoint folder),
# which the later open() on each entry cannot read, so only regular files
# are kept.
chunks = sorted(
    entry
    for entry in os.listdir(chunks_folder_path)
    if os.path.isfile(os.path.join(chunks_folder_path, entry))
)
nb_chunks = len(chunks)

#### commentaire :
- on prend 1/10 des chunks **`au hasard`**, comme pour du testing classique
- à voir plus tard si une analyse de la distribution des données et des thématiques ne pourrait pas aider

In [42]:
# Hold out one tenth of the chunks, drawn at random, as the validation sample.
# A dedicated seeded generator makes the draw identical on every
# "Restart & Run All" without touching the global `random` state.
SAMPLE_SEED = 42
sample = random.Random(SAMPLE_SEED).sample(chunks, nb_chunks // 10)

#### commentaire : génération du fichier csv

In [53]:
# Write the validation template: one row per sampled chunk, with the
# question and reference_answer columns left empty.
with open(chunks_dataset_csv_file_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['chunk_id', 'text', 'question', 'reference_answer'])
    for fname in sample:
        # Context manager so each chunk file is closed right after reading
        # (a bare open(...).read() leaves the handle to the garbage collector).
        with open(os.path.join(chunks_folder_path, fname), encoding='utf-8') as chunk_file:
            text = chunk_file.read()
        # Flatten the chunk onto one line; the trailing marker is kept unchanged.
        snippet = text.replace('\n', ' ') + '‚Ä¶'
        writer.writerow([fname, snippet, '', ''])
print(f"fichier {chunks_dataset_csv_file_path} g√©n√©r√© avec {len(sample)} chunks.")

fichier ../data/csv/validation_template.csv g√©n√©r√© avec 153 chunks.


### 1.2. Génération des questions

##### 1.2.1. QG avec un modèle pré-entraîné

In [49]:
# Hugging Face checkpoint identifiers; each model uses the tokenizer of the
# same name.
model_qa_1 = tokenizer_qa_1 = "valhalla/t5-base-e2e-qg"
model_qa_2 = tokenizer_qa_2 = "t5-small"

In [50]:
# Text2text-generation pipeline used by the question-generation cell below.
# NOTE(review): this loads model_qa_2, not the "e2e-qg" checkpoint held in
# model_qa_1 - confirm that this is the intended model.
qg = pipeline(task="text2text-generation", model=model_qa_2, tokenizer=tokenizer_qa_2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [54]:
# Load the chunk template CSV so the generation cells can iterate over its rows.
df = pd.read_csv(chunks_dataset_csv_file_path, encoding='utf-8')

In [55]:
# Sanity check: display the column names of the loaded template.
df.columns

Index(['chunk_id', 'text', 'question', 'reference_answer'], dtype='object')

In [56]:
# Number of chunks sent through the model. 1 keeps this cell a quick smoke
# test (it replaces an unconditional `break` at the end of the loop body);
# set to None to process every row of `df`.
MAX_CHUNKS = 1

rows = []
df_subset = df if MAX_CHUNKS is None else df.head(MAX_CHUNKS)
for chunk_id, snippet, _, _ in df_subset.itertuples(index=False):
    # Simple, direct prompt
    prompt = f"generate question: {snippet}"

    # Generation parameters tuned for better outputs
    out = qg(
        prompt,
        max_length=100,  # upper bound on the generated length
        min_length=10,   # lower bound on the generated length
        num_beams=5,     # more beams
        num_return_sequences=1,
        early_stopping=True,
        do_sample=True,  # sampling enabled, so runs are not deterministic
        temperature=0.7, # sampling temperature
        clean_up_tokenization_spaces=True
    )[0]['generated_text'].strip()

    print(f"PROMPT: {prompt[:100]}...")
    print(f"OUTPUT: {out}")
    print("-" * 50)

    # Normalise the output so it always ends with a question mark
    question = out
    if not question.endswith('?'):
        question += '?'

    # Reject near-empty outputs *before* paying for the answer generation
    if len(question) <= 5:
        continue

    # Generate the answer separately, from the question plus its chunk
    answer_prompt = f"answer the question based on context: {question} Context: {snippet}"
    answer_out = qg(
        answer_prompt,
        max_length=80,
        num_beams=3,
        num_return_sequences=1,
        early_stopping=True
    )[0]['generated_text'].strip()

    rows.append({
        'chunk_id': chunk_id,
        'question': question,
        'reference_answer': answer_out
    })

# Build the result DataFrame and show one example.
df_out = pd.DataFrame(rows)
df_valid = df_out.dropna()
if df_valid.empty:
    # No row survived: report it instead of failing on .iloc[0]
    print("Aucune QA valide obtenue.")
else:
    print("Exemple QA g√©n√©r√©e :", df_valid.iloc[0][['question','reference_answer']].to_dict())

PROMPT: generate question: real-world, in-home environments, as this is where most seizures progressinthefie...
OUTPUT: non-EEG datastrictlyforneurologicalresearchanddoesnotstoreanynon-EEG detectionsystemscanbemodelledtoaccuratelyreflectthereal-world resources
--------------------------------------------------
Exemple QA g√©n√©r√©e : {'question': 'non-EEG datastrictlyforneurologicalresearchanddoesnotstoreanynon-EEG detectionsystemscanbemodelledtoaccuratelyreflectthereal-world resources?', 'reference_answer': 'complexities encountered by PWE in their day to day lives'}


##### 1.2.2. mistral

In [60]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from dotenv import load_dotenv

In [61]:
# Load variables from a local .env file (if present), then read the
# Hugging Face access token from the environment.
load_dotenv()
HF_TOKEN = os.getenv('HUGGINGFACE_API_TOKEN')
if not HF_TOKEN:
    # Treat an empty value like a missing one, and make the situation visible:
    # gated checkpoints are refused without valid credentials.
    HF_TOKEN = None
    print(
        "Attention : variable HUGGINGFACE_API_TOKEN vide ou absente ; "
        "sans `huggingface-cli login`, les repos gated seront inaccessibles."
    )

In [None]:
# Checkpoint to load; the access token is sent with every download.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
hub_credentials = {"token": HF_TOKEN}

print("Chargement de Mistral avec token...")

# Tokenizer first, then the weights in float16; device_map="auto" leaves
# device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_credentials)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    **hub_credentials,
)

# Wrap both in a text-generation pipeline: the callable used by generer_qa.
mistral = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

print("‚úÖ Mistral charg√© avec succ√®s!")

Chargement de Mistral...




OSError: You are trying to access a gated repo.
Make sure to request access at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 and pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`.

In [None]:
# 2. Simple helper that generates one Q&A pair
def generer_qa(texte_medical):
    """Ask the `mistral` pipeline for one question/answer pair about a text.

    The model is instructed to reply with a ``Q: ...`` line and an ``R: ...``
    line; the reply is then parsed line by line.

    Args:
        texte_medical: Text inserted into the prompt as the source passage.

    Returns:
        A ``(question, reponse)`` tuple. Each element is the text following
        the first ``Q:`` / ``R:`` marker found, or ``None`` when that marker
        is absent from the reply.
    """
    prompt = f"""[INST] Tu es un m√©decin sp√©cialis√© en √©pilepsie. 
Lis ce texte et cr√©√© UNE question et sa r√©ponse.

Texte: {texte_medical}

R√©ponds exactement dans ce format:
Q: [ta question]
R: [ta r√©ponse]
[/INST]"""

    response = mistral(
        prompt,
        max_new_tokens=120,
        temperature=0.7,
        do_sample=True,
        return_full_text=False
    )

    texte_genere = response[0]['generated_text']

    question = None
    reponse = None

    for ligne in texte_genere.splitlines():
        # Replies often carry leading whitespace; strip before matching.
        ligne = ligne.strip()
        # Keep only the FIRST marker of each kind: a second, possibly
        # truncated, "Q:" must not be paired with the first answer.
        # Slice the prefix off rather than str.replace, which would also
        # delete "Q:" / "R:" occurring inside the text itself.
        if question is None and ligne.startswith('Q:'):
            question = ligne[len('Q:'):].strip()
        elif reponse is None and ligne.startswith('R:'):
            reponse = ligne[len('R:'):].strip()

    return question, reponse

In [None]:
# 3. Trial run on the chunks
rows = []

# Walk the template chunk by chunk, asking the model for one Q&A pair each.
for chunk_id, snippet in zip(df['chunk_id'], df['text']):
    print(f"\nTraitement chunk {chunk_id}...")
    print(f"Texte: {snippet[:100]}...")

    question, reponse = generer_qa(snippet)

    if not (question and reponse):
        print("‚ùå √âchec g√©n√©ration")
    else:
        rows.append(dict(chunk_id=chunk_id, question=question, reference_answer=reponse))
        print(f"‚úÖ Q: {question}")
        print(f"‚úÖ R: {reponse}")

    # Stop once 3 Q&A pairs have been collected (failed chunks do not count)
    if len(rows) >= 3:
        break

# 4. Final result
df_out = pd.DataFrame(rows)
print(f"\nüéâ {len(df_out)} Q&A g√©n√©r√©es!")

# Show the first pair, if at least one was produced.
if rows:
    print("\nExemple final:")
    exemple = df_out.iloc[0]
    print(f"Question: {exemple['question']}")
    print(f"R√©ponse: {exemple['reference_answer']}")