# Génération d'un ensemble de données pour tester spécifiquement le module d'information retrieval

- À partir d'un chunk, on génère une question avec Gemini 2.5 Flash en mode reasoning.
- Environ une centaine de questions au total.

In [17]:
import pandas as pd

from google import genai
from google.genai.types import GenerateContentConfig

In [None]:
import sys
sys.path.append('../../src')

from entity import extract_entities, match_entity
from embeddings import generate_embeddings

In [None]:
import os

# Model identifier and API client shared by the cells below. The key is read
# from the GOOGLE_API_KEY environment variable; None is passed through when
# the variable is not set.
model_id = "gemini-2.5-flash-preview-04-17"
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
# Load the article corpus and keep a 1% random sample of its rows.
# The fixed seed makes the sample reproducible, so re-running the notebook
# regenerates questions for the same set of articles.
df = pd.read_parquet("../../data/articles.parquet")
df = df.sample(frac=0.01, random_state=42)

In [None]:
def generate_question(doc, system_prompt):
    """Ask the configured Gemini model for one question about a document.

    Args:
        doc: Text sent to the model as the request contents.
        system_prompt: System instruction telling the model what kind of
            question to produce.

    Returns:
        The text returned by the model, with surrounding whitespace removed.

    Raises:
        ValueError: If the response carries no usable text (for example an
            empty candidate list or a candidate without any text part).
    """
    response = client.models.generate_content(
        model=model_id,
        contents=doc,
        config=GenerateContentConfig(
            system_instruction=system_prompt,
        ),
    )

    # Use the SDK's `text` accessor instead of indexing
    # candidates[0].content.parts[0] by hand: manual indexing fails with an
    # opaque TypeError/IndexError on an empty response and can hand back None
    # when the first part is not a text part.
    question = response.text
    if question is None or not question.strip():
        # Fail here so the caller's per-article error handling skips this
        # document instead of storing an empty or None question.
        raise ValueError("Model response did not contain any text")

    return question.strip()

In [None]:
def generate_dataset(df, prompt):
    """Generate one question per article and assemble them into a DataFrame.

    Every row of ``df`` is read through its ``uuid`` and ``article`` fields.
    A row whose uuid already has a question is skipped; a row whose question
    generation raises is reported on stdout and skipped as well.

    Args:
        df: DataFrame exposing ``uuid`` and ``article`` columns.
        prompt: System prompt forwarded to ``generate_question``.

    Returns:
        DataFrame with the columns ``uuid``, ``question``, ``entities`` and
        ``question_embedding``.
    """

    def resolve_entities(extracted):
        # Pass each extracted value through match_entity together with its
        # key; keys whose value is None are left out of the result.
        resolved = {}
        for entity_type, values in extracted.items():
            if values is None:
                continue
            resolved[entity_type] = [
                match_entity(value, entity_type) for value in values
            ]
        return resolved

    question_by_uuid = {}
    for _, article_row in df.iterrows():
        article_id = article_row.uuid
        if article_id in question_by_uuid:
            continue

        # Best effort: one failing article must not abort the whole run.
        try:
            generated = generate_question(article_row.article, prompt)
            print(generated)
            question_by_uuid[article_id] = generated
        except Exception as exc:
            print(f"Error generating question for article: {exc}")

    # The dict keys become the index, which is then turned into a regular
    # 'uuid' column.
    indexed = pd.DataFrame.from_dict(
        question_by_uuid, orient='index', columns=['question']
    )
    questions_df = indexed.reset_index().rename(columns={'index': 'uuid'})

    # Entities found in each question, resolved through match_entity.
    questions_df['entities'] = (
        questions_df.question.apply(extract_entities).apply(resolve_entities)
    )

    # Dense embeddings computed for the full list of questions in one call.
    questions_df["question_embedding"] = generate_embeddings(
        questions_df.question.tolist()
    )

    return questions_df

Premier dataset généré

In [None]:
# Base system prompt: one question per article that names at least one game
# or console, returned without any surrounding text.
system_prompt = (
    "You are helping me creating retrieval dataset. "
    "Extract a question from the article bellow.\n"
    "Make sure every question contains at least one game or console name.\n"
    "Your answer should only be the question, without any additional text."
)

In [None]:
# Build the first question set with the base prompt and save it as Parquet.
questions_df = generate_dataset(df=df, prompt=system_prompt)
questions_df.to_parquet(
    path="../../data/retrieval_questions.parquet",
    index=False,
)

Le premier dataset était trop facile (recall très élevé). J'ai tenté d'en créer un plus difficile.

In [None]:
# Harder variant of the prompt: the question must target a detail of the
# article rather than its general topic.
system_prompt_detail = (
    "You are helping me creating retrieval dataset. "
    "Extract a question from the article bellow.\n"
    "The question should be about a detail mentioned in the article "
    "and not the general topic.\n"
    "Make sure every question contains at least one game or console name.\n"
    "Your answer should only be the question, without any additional text."
)

In [None]:
# Build the detail-oriented question set and save it to its own Parquet file.
questions_df = generate_dataset(df=df, prompt=system_prompt_detail)
questions_df.to_parquet(
    path="../../data/retrieval_questions_detail.parquet",
    index=False,
)