# Retrieval expérimentation

## 0. Setting up

### 0.1. Import

In [1]:
import os
import random 
import csv

In [2]:
import pandas as pd

In [19]:
from langchain.evaluation.qa.generate_chain import QAGenerateChain
from langchain_ollama import ChatOllama

### 0.2. Paths

In [3]:
chunks_folder_path = '../data/chunks'
chunks_dataset_csv_file_path = '../data/csv/validation_template.csv'
qa_dataset_csv_file_path = '../data/csv/validation_auto_qg.csv'

## 1.Create a data set for Question answering testing

### 1.1. A bunch of chunks dataset

#### commentaire : resort la liste entière de tout mes chunks

In [4]:
chunks = sorted(os.listdir(chunks_folder_path))
nb_chunks = len(chunks)

#### commentaire: 
-  on prends le 1/10 **`au hasard`** des chunk comme pour du testing classique
- avoir plus tard si une analyse sur la distribution des données et des thématique ne pourrais pas aidé

In [5]:
sample = random.sample(chunks, int(nb_chunks/10))

#### commentaire: génration du fichier csv

In [6]:
with open(chunks_dataset_csv_file_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['chunk_id', 'text', 'question', 'reference_answer'])
    for fname in sample:
        text = open(os.path.join(chunks_folder_path, fname), encoding='utf-8').read()
        snippet = text.replace('\n', ' ') + '…'
        writer.writerow([fname, snippet, '', ''])
print(f"fichier {chunks_dataset_csv_file_path} généré avec {len(sample)} chunks.")

fichier ../data/csv/validation_template.csv généré avec 153 chunks.


### 1.2. Generations des questions

##### 1.2.1. QG avec un modèle pré-entrainé

In [49]:
model_qa_1, tokenizer_qa_1 = "valhalla/t5-base-e2e-qg", "valhalla/t5-base-e2e-qg"
model_qa_2, tokenizer_qa_2 = "t5-small", "t5-small"

In [50]:
# Pipeline end-to-end QG
qg = pipeline(
    "text2text-generation",
    model=model_qa_2,
    tokenizer=tokenizer_qa_2
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [54]:
df = pd.read_csv(chunks_dataset_csv_file_path, encoding='utf-8')

In [55]:
df.columns

Index(['chunk_id', 'text', 'question', 'reference_answer'], dtype='object')

In [56]:
rows = []
for chunk_id, snippet, _, _ in df.itertuples(index=False):
    # Prompt plus simple et direct
    prompt = f"generate question: {snippet}"
    
    # Paramètres ajustés pour de meilleures générations
    out = qg(
        prompt,
        max_length=100,  # Augmenté pour avoir plus de contexte
        min_length=10,   # Longueur minimale
        num_beams=5,     # Plus de beams pour de meilleurs résultats
        num_return_sequences=1,
        early_stopping=True,
        do_sample=True,  # Ajout de sampling
        temperature=0.7, # Contrôle de la créativité
        clean_up_tokenization_spaces=True
    )[0]['generated_text'].strip()

    print(f"PROMPT: {prompt[:100]}...")
    print(f"OUTPUT: {out}")
    print("-" * 50)
    
    # Parsing amélioré
    question = out
    if not question.endswith('?'):
        question += '?'
    
    # Génération de la réponse séparément
    answer_prompt = f"answer the question based on context: {question} Context: {snippet}"
    answer_out = qg(
        answer_prompt,
        max_length=80,
        num_beams=3,
        num_return_sequences=1,
        early_stopping=True
    )[0]['generated_text'].strip()
    
    if question and len(question) > 5:
        rows.append({
            'chunk_id': chunk_id,
            'question': question,
            'reference_answer': answer_out
        })
    
    break  # pour test

# 5. Construire et afficher le DataFrame résultat
df_out = pd.DataFrame(rows)
print("Exemple QA générée :", df_out.dropna().iloc[0][['question','reference_answer']].to_dict())

PROMPT: generate question: real-world, in-home environments, as this is where most seizures progressinthefie...
OUTPUT: non-EEG datastrictlyforneurologicalresearchanddoesnotstoreanynon-EEG detectionsystemscanbemodelledtoaccuratelyreflectthereal-world resources
--------------------------------------------------
Exemple QA générée : {'question': 'non-EEG datastrictlyforneurologicalresearchanddoesnotstoreanynon-EEG detectionsystemscanbemodelledtoaccuratelyreflectthereal-world resources?', 'reference_answer': 'complexities encountered by PWE in their day to day lives'}


##### 1.2.2. mistral from ollama

from langchain_ollama import ChatOllama

response = llm.invoke([{"role": "user", "content": "Bonjour, peux-tu me dire le capital de la France ?"}])
print(response)

#### 1.2.2.1. question answer completion dataset

In [18]:
df = pd.read_csv(chunks_dataset_csv_file_path)
texts = df["text"].tolist()

In [30]:
texts[121]

'openlyaccessibledatabaseencompassingnon-EEGsensordatafrom OpenSeizureDatabase/blob/main/documentation/ multiple sensing modalities. LICENCE.md Thisstudydistinguishesitselfbyutilisingreal-worlddata,provid- ing an accurate depiction of everyday life compared to controlled Informed Consent Statement: The users gave their consent EMU-based datasets. The beta trial’s success led to an indefinite to publish the developed database by agreeing to the Privacy extension of the data collection period, showcasing our commitment Policy at https://github.com/OpenSeizureDetector/ to continually enriching the OSDB and contributing to non-EEG OpenSeizureDatabase/blob/main/documentation/…'

In [20]:
llm = ChatOllama(model="mistral")
qag = QAGenerateChain.from_llm(llm)

In [21]:
results = qag.batch(texts)

In [31]:
print(results[121])

{'doc': 'openlyaccessibledatabaseencompassingnon-EEGsensordatafrom OpenSeizureDatabase/blob/main/documentation/ multiple sensing modalities. LICENCE.md Thisstudydistinguishesitselfbyutilisingreal-worlddata,provid- ing an accurate depiction of everyday life compared to controlled Informed Consent Statement: The users gave their consent EMU-based datasets. The beta trial’s success led to an indefinite to publish the developed database by agreeing to the Privacy extension of the data collection period, showcasing our commitment Policy at https://github.com/OpenSeizureDetector/ to continually enriching the OSDB and contributing to non-EEG OpenSeizureDatabase/blob/main/documentation/…', 'qa_pairs': {'query': 'What is the name of the database used in this study, and where can one find more information about its licence?', 'answer': 'The name of the database used in this study is the OpenSeizureDatabase. More information about its licence can be found at https://github.com/OpenSeizureDetector

In [32]:
for i, res in enumerate(results):
    qap = res.get("qa_pairs", {})
    df.at[i, "question"] = qap.get("query", "")
    df.at[i, "reference_answer"] = qap.get("answer", "")
df.to_csv(qa_dataset_csv_file_path, index=False)

  df.at[i, "question"] = qap.get("query", "")
  df.at[i, "reference_answer"] = qap.get("answer", "")
