# Génération d'un ensemble de données d'évaluation pour l'information retrieval (multichunk)

- On donne à Gemini 2 chunks (articles) qui parlent du même jeu pour qu'il construise des questions auxquelles on ne peut répondre qu'avec les 2 chunks.

In [76]:
import os

import pandas as pd

from google import genai
from google.genai import types
from google.genai.types import GenerateContentConfig

In [None]:
import sys
sys.path.append('../../src')

from entity import extract_entities, match_entity
from embeddings import generate_embeddings

In [77]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key committed with the notebook is visible to every reader and
# must be treated as compromised (revoke and rotate it).
# Raises KeyError when GEMINI_API_KEY is not set, so a missing key fails fast.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
# Gemini model used to generate the evaluation questions.
model_id = "gemini-2.5-flash-preview-04-17"

In [78]:
# System instruction for the question-generation call. The `{game}` placeholder
# (used twice) is filled in with `str.format(game=...)` for each request.
system_prompt = "\n".join(
    [
        "You are helping me creating retrieval dataset. Extract a question from the 2 articles bellow about the game {game}.",
        "The question should be answerable with the 2 articles, but not by only one of them.",
        "Make sure the question contains the name of the game ({game}).",
        "Do not mention the articles in your answer.",
        "Your answer should only be the question, without any additional text.",
    ]
)

In [None]:
# Article corpus; later cells read its `Game`, `article` and `uuid` columns.
df = pd.read_parquet(path="../../data/articles.parquet")

In [80]:
# `Game` holds several games per article, so explode it to one game per row,
# count occurrences, and keep the 100 most frequent games (most frequent first).
games = (
    df["Game"]
    .explode()
    .value_counts()
    .head(100)
    .index
    .tolist()
)

In [81]:
def generate_question_about_game(df, game, random_state=None):
    """Ask Gemini for one question that needs two articles about ``game``.

    Two distinct rows of ``df`` whose ``Game`` entry contains ``game`` are
    sampled, and their ``article`` texts are sent to the model together with
    ``system_prompt`` formatted for that game.

    Args:
        df: DataFrame with ``Game``, ``article`` and ``uuid`` columns.
        game: Name of the game both articles must mention.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selected article pair can be reproduced. ``None`` (the default)
            keeps the sampling random.

    Returns:
        A ``(uuids, question)`` tuple: the ``uuid`` values of the two sampled
        rows and the generated question with surrounding whitespace removed.

    Raises:
        ValueError: If fewer than two rows mention ``game``, or if the model
            response contains no text.
    """
    # Rows with a missing `Game` value (None/NaN) cannot match; guard them so
    # the membership test does not raise TypeError for the whole column.
    mentions_game = df.Game.apply(
        lambda tags: bool(hasattr(tags, "__contains__") and game in tags)
    ).astype(bool)
    candidates = df[mentions_game]
    if len(candidates) < 2:
        raise ValueError(
            f"Need at least 2 articles about {game!r}, found {len(candidates)}"
        )

    subset = candidates.sample(2, random_state=random_state)

    docs = f"Articles 1:\n{subset.iloc[0].article}\n\nArticles 2:\n{subset.iloc[1].article}"

    response = client.models.generate_content(
        model=model_id,
        contents=docs,
        config=GenerateContentConfig(
            system_instruction=system_prompt.format(game=game),
            # A thinking budget of 0 is requested: only the question is wanted.
            thinking_config=types.ThinkingConfig(thinking_budget=0)
        )
    )

    # `response.text` is None when the response carries no text (e.g. a
    # blocked or empty candidate); report that explicitly instead of failing
    # on an index lookup.
    question = (response.text or "").strip()
    if not question:
        raise ValueError(f"Gemini returned no text for game {game!r}")

    return subset.uuid.tolist(), question

In [82]:
# Accumulates one (article uuids, question) tuple per game for which
# question generation succeeded.
questions: list = []

In [83]:
# Generate one question per game. A failure for a single game (too few
# articles, API error, ...) is reported with the game's name and skipped so
# the remaining games are still processed.
for game in games:
    try:
        uuids, question = generate_question_about_game(df, game)
    except Exception as e:
        print(f"Error generating question for game {game!r}: {e}")
    else:
        questions.append((uuids, question))



In [84]:
# One row per generated question: `target` is the list of uuids of the two
# source articles, `question` is the generated text.
questions_df = pd.DataFrame(
    data=questions,
    columns=["target", "question"],
)

In [87]:
# Preview the generated (target, question) pairs.
questions_df

Unnamed: 0,target,question
0,"[ec5be87e-5d79-46e5-a62f-5814664a798e, ad9be1e...",Which Sonic the Hedgehog game was released aft...
1,"[0a5dc19e-1638-4196-bb90-e85a1759bcd6, bb5c7eb...",What entity was co-owned by Nintendo and named...
2,"[36ea002c-6032-491b-8972-5231df3c721e, 97ce9cd...",What video games are mentioned alongside Final...
3,"[cf4f870a-3a7a-4370-9266-350171379ba4, cfb95b5...",What year did The Legend of Zelda game influen...
4,"[e6f31236-2425-479b-bbc0-91a0bd4a81f6, b5d69db...",Which game featuring Popeye that was released ...
...,...,...
95,"[470ab588-a660-4979-96e2-3718ebf1e1ae, 003d8bf...",Which Mario franchise game is a compilation of...
96,"[b6317f0d-c286-4e8e-8678-9ad59083af06, 3bbaef5...",Was God of War III originally released on the ...
97,"[a042ef3b-9031-485c-849f-c7a68aa57d70, 6c1e234...","Which video games, besides Asteroids, used the..."
98,"[87d0203d-cbd6-411e-9c06-72895213815b, 662a530...",What are some specific connections between Bub...


In [89]:
# Run `extract_entities` once per generated question; the result keeps the
# index of `questions_df`.
entities = questions_df["question"].apply(extract_entities)

2025-05-11T16:02:03.989 [BAML [92mINFO[0m] [35mFunction ExtractEntities[0m:
    [33mClient: GeminiFlash (gemini-2.0-flash) - 774ms. StopReason: STOP. Tokens(in/out): 103/55[0m
    [34m---PROMPT---[0m
    [2m[43muser: [0m[2mExtract all the relevant entities from the user question on the video games and consonles domain.
     - Resolve entities to their canonic form (example: LOTR -> Lord of the Rings).
    
     User question:
     Which Sonic the Hedgehog game was released after Sonic Forces and utilized elements influenced by feedback from it?
    
    Answer with a JSON Array using this schema:
    [
      {
        name: string,
        type: 'Game' or 'Console' or 'Publisher',
      }
    ]
    [0m
    [34m---LLM REPLY---[0m
    [2m```json
    [
      {
        "name": "Sonic the Hedgehog",
        "type": "Game"
      },
      {
        "name": "Sonic Forces",
        "type": "Game"
      }
    ]
    ```[0m
    [34m---Parsed Response (list<class Entity>)---[0m
 

In [90]:
# Store the raw extraction result next to each question.
questions_df["entities"] = entities

In [91]:
def _match_extracted_entities(extracted):
    """Pass every extracted value through ``match_entity``.

    ``extracted`` is the mapping produced by ``extract_entities`` for one
    question (presumably entity type -> list of names; confirm against
    ``entity.extract_entities``). Keys whose value is ``None`` are dropped;
    every other value is iterated and each item is resolved with
    ``match_entity(item, key)``.
    """
    return {
        entity_type: [match_entity(name, entity_type) for name in names]
        for entity_type, names in extracted.items()
        if names is not None
    }


# Build the column from the raw `entities` series rather than from the column
# itself, so re-running this cell does not feed already-matched values through
# `match_entity` a second time.
questions_df['entities'] = entities.apply(_match_extracted_entities)

In [None]:
# Embed every question text in one `generate_embeddings` call and store the
# result alongside the question.
questions_df["question_embedding"] = generate_embeddings(
    questions_df["question"].tolist()
)

In [None]:
# Persist the questions with their target uuids, matched entities and
# embeddings; the DataFrame index is not written.
questions_df.to_parquet(
    "../../data/retrieval_questions_multichunks.parquet",
    index=False,
)