## import

In [31]:
import os
import json
import yaml
import random 
import csv
import mlflow
import pandas as pd

In [32]:
from langchain.evaluation.qa.generate_chain import QAGenerateChain
from langchain_ollama import ChatOllama
from tqdm import tqdm
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [33]:
import nltk
# Vérifiez que le répertoire nltk_data existe, sinon créez-le
nltk_data_path = os.path.join(os.getcwd(), 'venv','Lib', 'site-packages' 'nltk_data')
os.makedirs(nltk_data_path, exist_ok=True)

# Téléchargez les données NLTK
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\regis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 1. config

In [18]:
load_dotenv()

True

### 1.1. pinecone

In [19]:
# Load .env variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
REGION = os.getenv("REGION", "us-east-1")

print(f"Using Pinecone API Key: {PINECONE_API_KEY[0:4]}****{PINECONE_API_KEY[-4:]}")
print(f"Using Pinecone Region: {REGION}")

Using Pinecone API Key: pcsk****4o5A
Using Pinecone Region: us-east-1


In [20]:
pc = Pinecone(api_key=PINECONE_API_KEY)

### 1.2. experimentaion

In [None]:
experiment_configs = [
    {
        "name": "exp_0",
        "embedding_model": "all-mpnet-base-v2",
        "chunk_size": 256,
        "top_k": 5,
        "index_name": "index-mpnet-256"
    },
    {
        "name": "exp_1",
        "embedding_model": "multi-qa-mpnet-base-dot-v1",
        "chunk_size": 512,
        "top_k": 10,
        "index_name": "index-multiqa-512"
    }
]

# Global paths
base_path = r"C:\Users\regis\OneDrive\Documents\aiclinique\chatbot-seizure"

# Chemins relatifs à partir du répertoire de base
RAW_TEXT_PATH = os.path.join(base_path, 'data', 'text')
CHUNKS_OUTPUT_PATH = os.path.join(base_path, 'data', 'chunks')
CSV_CHUNKS_FILE = os.path.join(base_path, 'data', 'csv', 'chunks.csv')
CSV_QUESTIONS_FILE = os.path.join(base_path, 'data', 'csv', 'questions.csv')

# Afficher les chemins pour vérifier
print("RAW_TEXT_PATH:", RAW_TEXT_PATH)
print("CHUNKS_OUTPUT_PATH:", CHUNKS_OUTPUT_PATH)
print("CSV_CHUNKS_FILE:", CSV_CHUNKS_FILE)
print("CSV_QUESTIONS_FILE:", CSV_QUESTIONS_FILE)


RAW_TEXT_PATH: C:\Users\regis\OneDrive\Documents\aiclinique\chatbot-seizure\data\text
CHUNKS_OUTPUT_PATH: C:\Users\regis\OneDrive\Documents\aiclinique\chatbot-seizure\data\chunks
CSV_QUESTIONS_FILE: C:\Users\regis\OneDrive\Documents\aiclinique\chatbot-seizure\data\csv\questions.csv


## 2. Functions

### 2.1. Chunking Function

In [22]:
def do_chunking(input_folder, output_folder, chunk_size):
    os.makedirs(output_folder, exist_ok=True)

    for fname in os.listdir(input_folder):
        with open(os.path.join(input_folder, fname), encoding='utf-8') as f:
            text = f.read()

        sentences = sent_tokenize(text)
        chunks = []
        current = ""
        for sent in sentences:
            if len(current) + len(sent) < chunk_size:
                current += " " + sent
            else:
                chunks.append(current.strip())
                current = sent
        if current:
            chunks.append(current.strip())

        for idx, chunk in enumerate(chunks):
            out_name = f"{os.path.splitext(fname)[0]}_chunk_{idx}.txt"
            with open(os.path.join(output_folder, out_name), "w", encoding='utf-8') as f:
                f.write(chunk)
    print(f"[INFO] Chunking completed with size ≈ {chunk_size}.")

## 2.2. Indexing Function

In [23]:
def do_indexing(embedding_model_name, index_name, chunks_folder, metric="cosine"):
    model = SentenceTransformer(embedding_model_name)
    dim = model.get_sentence_embedding_dimension()

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dim,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region=REGION)
        )
    index = pc.Index(index_name)

    files = os.listdir(chunks_folder)
    for fname in tqdm(files, desc=f"Indexing to {index_name}"):
        with open(os.path.join(chunks_folder, fname), encoding='utf-8') as f:
            text = f.read()
        vec = model.encode(text).tolist()
        index.upsert([(fname, vec, {"source": fname})])

    return index

## 2.3. create the question answer csv

In [34]:
def question_answering_creation(chunks_folder_path,chunks_dataset_csv_file_path,qa_dataset_csv_file_path):
    # 1. create the dataset csv file
    chunks = sorted(os.listdir(chunks_folder_path))
    nb_chunks = len(chunks)
    sample = random.sample(chunks, int(nb_chunks/10))
    with open(chunks_dataset_csv_file_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['chunk_id', 'text', 'question', 'reference_answer'])
        for fname in sample:
            text = open(os.path.join(chunks_folder_path, fname), encoding='utf-8').read()
            snippet = text.replace('\n', ' ') + '…'
            writer.writerow([fname, snippet, '', ''])
    print(f"fichier {chunks_dataset_csv_file_path} généré avec {len(sample)} chunks.")  
    
    # 2. create the question answer csv file
    df = pd.read_csv(chunks_dataset_csv_file_path)
    texts = df["text"].tolist()
    llm = ChatOllama(model="mistral")
    qag = QAGenerateChain.from_llm(llm)
    results = qag.batch(texts)
    for i, res in enumerate(results):
        qap = res.get("qa_pairs", {})
        df.at[i, "question"] = qap.get("query", "")
        df.at[i, "reference_answer"] = qap.get("answer", "")

    # Save the DataFrame to a CSV file
    df.to_csv(qa_dataset_csv_file_path, index=False)
    

## 2.4. Retrieval Function

In [24]:
def do_retrieval(index_name, embedding_model_name, top_k, csv_file):
    model = SentenceTransformer(embedding_model_name)
    index = pc.Index(index_name)

    df = pd.read_csv(csv_file)
    questions = []
    gold_mapping = {}
    for _, row in df.iterrows():
        qid = str(row["chunk_id"]).strip()
        questions.append({"id": qid, "question": str(row["question"]).strip()})
        gold_mapping[qid] = qid

    total = 0
    hits = 0
    reciprocal_ranks = []

    for q in tqdm(questions, desc=f"Retrieval (top_k={top_k})"):
        q_emb = model.encode(q["question"]).tolist()
        res = index.query(vector=q_emb, top_k=top_k, include_metadata=True)
        retrieved_ids = [m["id"] for m in res["matches"]]
        hit = gold_mapping[q["id"]] in retrieved_ids
        if hit:
            hits += 1
            rank = retrieved_ids.index(gold_mapping[q["id"]]) + 1
            rr = 1 / rank
        else:
            rr = 0
        reciprocal_ranks.append(rr)
        total += 1

    recall = hits / total
    mrr = sum(reciprocal_ranks) / total

    print(f"Recall@{top_k}: {recall:.4f}")
    print(f"MRR: {mrr:.4f}")
    return recall, mrr


# 3. experimentation

## 3.1. MLflow Loop

In [25]:
mlflow.set_experiment("Epilepsy-Full-Retrieval-Experiment")

<Experiment: artifact_location='file:///c:/Users/regis/OneDrive/Documents/aiclinique/chatbot-seizure/src/experimentation/mlruns/599340227410176957', creation_time=1751146716305, experiment_id='599340227410176957', last_update_time=1751146716305, lifecycle_stage='active', name='Epilepsy-Full-Retrieval-Experiment', tags={}>

In [None]:

for config in experiment_configs:
    print(f"\n=== Running {config['name']} ===")
    with mlflow.start_run(run_name=config['name']):
        mlflow.log_param("embedding_model", config["embedding_model"])
        mlflow.log_param("chunk_size", config["chunk_size"])
        mlflow.log_param("top_k", config["top_k"])
        mlflow.log_param("index_name", config["index_name"])

        # --- Chunking ---
        do_chunking(RAW_TEXT_PATH, CHUNKS_OUTPUT_PATH, config["chunk_size"])

        # --- Indexing ---
        do_indexing(config["embedding_model"], config["index_name"], CHUNKS_OUTPUT_PATH)
        
        #create the question index
        question_answering_creation(CHUNKS_OUTPUT_PATH, CSV_CHUNKS_FILE, CSV_QUESTIONS_FILE)

        # --- Retrieval ---
        recall, mrr = do_retrieval(config["index_name"], config["embedding_model"], config["top_k"], CSV_QUESTIONS_FILE)
        mlflow.log_metric("Recall_at_k", recall)
        mlflow.log_metric("MRR", mrr)



=== Running exp_0 ===
[INFO] Chunking completed with size ≈ 256.


Indexing to index-mpnet-256:   6%|▌         | 230/4174 [00:58<16:41,  3.94it/s]


KeyboardInterrupt: 