In this notebook we will index the french dump version of the wikipedia dataset, and use with the retrieval for our qa/system

In [None]:
import numpy as np
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers

In [None]:
from datasets import load_dataset

In [None]:
wiki_corpus = load_dataset("wikipedia", "20220301.fr", split="train")

In [None]:
wiki_corpus = wiki_corpus.shuffle(seed=42)

In [None]:
sample_wiki = wiki_corpus.shard(100, index=0)

In [None]:
sample_doc_dict = sample_wiki[1]

In [None]:
from haystack.schema import Document

In [None]:
docs = Document.from_dict(sample_doc_dict, field_map={"text": "content"})

#### Building the Elastic Search Index

In [2]:
from haystack.utils import clean_wiki_text, convert_files_to_docs

In [3]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union, Callable, Dict
from copy import deepcopy
from haystack.nodes import PreProcessor
import re

In [4]:
from gensim.utils import deaccent

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

In [5]:
async def convert_wiki_article_to_docs(
    item: dict,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
) -> List[Document]:
    """
    item

    :param items: dict of items
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.
    """
    documents = []
    processed = 0
    text = item.get("text")
    text = remove_accents(text)
    if clean_func:
        text = clean_func(text)
    if split_paragraphs:
        for para in text.split("\n"):
            if 200 <= len(para.strip()) <= 2000:
                # just pick the paragraph with length between 50 and 1500
                processed += 1
                doc = Document(content=para, meta={"title": item.get("title")}, id=item.get("id"))
                documents.append(doc)
            else:
                continue
    else:
        processed += 1
        doc = Document(content=text, meta={"title": item.get("title")}, id=item.get("id"))
        documents.append(doc)
    
    return documents

In [None]:
import asyncio

#### Saving the Document in the retriever

In [6]:
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from haystack.document_stores import ElasticsearchDocumentStore


In [7]:
INDEX_NAME = 'fr-wikipedia'

In [8]:

document_store = ElasticsearchDocumentStore(index=INDEX_NAME, recreate_index=False, analyzer="french")

INFO - haystack.document_stores.elasticsearch -  Index 'fr-wikipedia' deleted.
INFO - haystack.document_stores.elasticsearch -  Index 'label' deleted.


In [9]:
def split_list_in_chunck(list_, len_):
    chunks = [list_[i:i+len_] for i in range(0, len(list_), len_)]
    return chunks

In [10]:
from elasticsearch.helpers import bulk, parallel_bulk, scan

In [21]:
from collections import deque

In [11]:

def write_documents_parallel(
        self,
        documents: Union[List[dict], List[Document]],
        index: Optional[str] = None,
        batch_size: int = 10_000,
        duplicate_documents: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Indexes documents for later queries in Elasticsearch./ an update for the index that use parralell bluk

        Behaviour if a document with the same ID already exists in ElasticSearch:
        a) (Default) Throw Elastic's standard error message for duplicate IDs.
        b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
        (This is only relevant if you pass your own ID when initializing a `Document`.
        If don't set custom IDs for your Documents or just pass a list of dictionaries here,
        they will automatically get UUIDs assigned. See the `Document` class for details)

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"content": "<the-actual-text>"}.
                          Optionally: Include meta data via {"content": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.content_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
        :param duplicate_documents: Handle duplicates document based on parameter options.
                                    Parameter options : ( 'skip','overwrite','fail')
                                    skip: Ignore the duplicates documents
                                    overwrite: Update any existing documents with the same ID when adding documents.
                                    fail: an error is raised if the document ID of the document being added already
                                    exists.
        :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
        :raises DuplicateDocumentError: Exception trigger on duplicate document
        :return: None
        """
        print("I am inside the function")
        if index and not self.client.indices.exists(index=index, headers=headers):
            self._create_document_index(index, headers=headers)

        if index is None:
            index = self.index
        duplicate_documents = duplicate_documents or self.duplicate_documents
        assert (
            duplicate_documents in self.duplicate_documents_options
        ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}"

        field_map = self._create_document_field_map()
        document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
        document_objects_chunk = split_list_in_chunck(document_objects, 9990)
        document_objects_deduplicated = []
        for doc_chunk in document_objects_chunk:           
            chunk_deduplicated = self._handle_duplicate_documents(documents=doc_chunk, 
                                                                  index=index, 
                                                                  duplicate_documents=duplicate_documents, 
                                                                  headers=headers)
            document_objects_deduplicated.extend(chunk_deduplicated)
        documents_to_index = []
        # split the document in chunk of 10000
        for doc in tqdm(document_objects_deduplicated):
            _doc = {
                "_op_type": "index" if duplicate_documents == "overwrite" else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map()),
            }  # type: Dict[str, Any]

            # cast embedding type as ES cannot deal with np.array
            if _doc[self.embedding_field] is not None:
                if type(_doc[self.embedding_field]) == np.ndarray:
                    _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all value within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)
        documents_to_index_chunk = split_list_in_chunck(documents_to_index, 9990)
        for doc_chunk in tqdm(documents_to_index_chunk):
            pb_ = parallel_bulk(self.client, 
                              doc_chunk, 
                              chunk_size=10000, 
                              thread_count=8, 
                              queue_size=8,
                              refresh=self.refresh_type, 
                              headers=headers)
            deque(pb_, maxlen=0)
            documents_to_index = []

In [12]:
document_store.write_documents_parallel = write_documents_parallel

In [13]:
%%script false --no-raise-error
all_docs = []
for i in range(0, 100):
    shard = wiki_corpus.shard(100, index=i)
    with tqdm(total=shard.shape[0]) as pbar:
        docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
        all_docs.append(docs_in_shard)
    print("done with shard ", i)

In [14]:
from  functools import reduce
from operator import iconcat

In [15]:
%%script false --no-raise-error
with tqdm(total=len(all_docs)) as pbar:
     scan_results = await tqdm_asyncio.gather(*all_docs[90:100])

In [16]:
%%script false --no-raise-error
scan_results = reduce(iconcat, scan_results, [])
scan_results = reduce(iconcat, scan_results, [])

With our document indexted int the elastic searh we can search , use the piaf dataset which have question with answers without paragraph and leverage them.

#### Reading document from File

The firt approach have saved the document in differents files in our project, but now we need to read them to read those files for the second test

In [17]:
import json
from pathlib import Path
DATA_PATH = Path.cwd().parent.joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"
wikipedia_dump = DATA_PATH.joinpath("wikipedia")
assert wikipedia_dump.exists()

In [25]:
 saved_files = []

[PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_97.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_57.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_12.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_73.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_36.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_71.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_34.jsonl'),
 PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/wikipedia/dump_55.j

In [27]:
%%script false --no-raise-error
for index, file in enumerate(wikipedia_dump.iterdir()):
    if file not in saved_files:
        with open(file, "r") as buffer:
            docs = []
            for index_, data in enumerate(buffer.readlines()):
                id_ = f"{index}{index_}"
                json_data = json.loads(json.loads(data))
                json_data["id"] = id_
                document = Document.from_dict(json_data)
                docs.append(document)
            write_documents_parallel(document_store, docs)
            print(f"done saving the {index}th batch document")
        saved_files.append(file)
    else:
        print("I have already processed that file")

#### Writing Congo News Dataset

In [139]:
DATA_PATH

PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data')

In [140]:

DRC_NEWS_DATA_PATH = DATA_PATH.joinpath("corpus", "drc-news-txt")

In [141]:
import pandas as pd

In [142]:
data_file_path = DATA_PATH.joinpath("corpus", "raw", 'drc-news-raws.csv')

In [143]:
cd_news_data = pd.read_csv(data_file_path, names=["content", "posted_at"])

In [144]:
cd_news_data = cd_news_data.fillna(value="")
cd_news_data.head()

Unnamed: 0,content,posted_at
0,Les membres de la Commission tarifaire viennent de proposer des mesures néce...,2022-09-05 00:00:00
1,Les membres de la Commission tarifaire sont en session extraordinaire d...,2022-04-05 00:00:00
2,"Vodacom Congo vient de signer un partenariat avec Kinshasa Digital Academy, ...",2022-04-23 00:00:00
3,"Le sélectionneur des Léopards de la RDC, Hectór Cúper est attendu à Kinshasa...",2022-03-05 00:00:00
4,Le protocole d’accord était déjà signé entre la RDC et la compagnie aérienne...,2022-11-05 00:00:00


In [180]:
pd.to_datetime(cd_news_data.posted_at, errors="coerce").dropna().max()

Timestamp('2022-12-05 00:00:00')

In [145]:
from haystack.nodes import TextConverter

In [146]:
from haystack.schema import Document
from secrets import token_hex

# @Todo: this is not working now , it was supposed to save the document to dataframe
def get_document_from_text(row):
    """numpy row with the text and the date of the post

    Args:
        row (_type_): _description_

    Returns:
        _type_: _description_
    """
    text = row[0].replace(u'\xa0', u' ')
    for paragraph in text.split("   "):
        if not paragraph.strip():  # skip empty paragraphs
            continue
        return Document(content=paragraph, meta={"posted_at":row[1] if row[1] else "" })

In [147]:
all_cd_news_docs = cd_news_data.apply(get_document_from_text, axis="columns")

In [148]:
len(all_cd_news_docs)

140638

In [149]:
all_cd_news_docs = all_cd_news_docs.dropna().to_list()

In [150]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union
from copy import deepcopy
from haystack.nodes import PreProcessor

class CustomPreProcessor(PreProcessor):
    def __init__(self, custom_preprocessor=None, **kwargs):
        super().__init__(**kwargs)
        self.custom_preprocessor = custom_preprocessor
    def clean(
        self,
        document: Union[dict, Document],
        clean_whitespace: bool,
        clean_header_footer: bool,
        clean_empty_lines: bool,
        remove_substrings: List[str],
        id_hash_keys: Optional[List[str]] = None,
    ) -> Document:
        """
        
        Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
        and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
        """
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if isinstance(document, dict):
            document = Document.from_dict(document, id_hash_keys=id_hash_keys)

        # Mainly needed for type checking
        if not isinstance(document, Document):
            raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
        text = document.content
        text = self.custom_preprocessor(text)
        if clean_header_footer:
            text = self._find_and_remove_header_footer(
                text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )

        if clean_whitespace:
            lines = text.splitlines()

            cleaned_lines = []
            for line in lines:
                line = line.strip()
                cleaned_lines.append(line)
            text = "\n".join(cleaned_lines)

        if clean_empty_lines:
            text = re.sub(r"\n\n+", "\n\n", text)

        for substring in remove_substrings:
            text = text.replace(substring, "")

        if text != document.content:
            document = deepcopy(document)
            document.content = text

        return document
    
    

In [151]:
from  functools import reduce
from operator import iconcat

In [152]:
all_cd_news_docs[0]

<Document: {'content': 'Les membres de la Commission tarifaire viennent de proposer des mesures nécessaires visant à corriger les erreurs matérielles pour faciliter la mise en œuvre de la taxe à valeur ajoutée sociale (TVA) et à l’extension du bénéfice du taux réduit à certaines positions tarifaires se rapportant aux produits à vocation sociale et de grande consommation ciblés notamment les produits laitiers pour nourrissons. C’est le résultat des travaux de 4 jours (du 03 au 06 mai dernier) des membres de la Commission tarifaire clôturés le vendredi 06 mai au Romeo Golf de la Gombe par Liévin Chiribagula, conseiller fiscal et représentant du ministre des Finances.', 'content_type': 'text', 'score': None, 'meta': {'posted_at': '2022-09-05 00:00:00'}, 'embedding': None, 'id': '5150ad41ae7d0d36c7fa0b291dbe704b'}>

With our document indexted int the elastic searh we can search , use the piaf dataset which have question with answers without paragraph and leverage them.

In [153]:
from collections import deque

In [154]:
import re
from gensim.utils import deaccent
from unicodedata import normalize as unicode_normalize

In [155]:
def replace_point(document):
    """replace the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    Args:
        document (_type_): _description_
    """
    result = re.sub(r"(\S)\.(\S)", r"\1 . \2", document)
    return result

def replace_website_name(document):
    """sometimes the doucment has the name politico.cd or 7sur7.cd or actualite.cd, we would like to replace them by the 
    actual name of the website. before proper cleaning

    Args:
        document (_type_): _description_
    """
    # @TODO : not sure if this will work but , way better replace by the first line of match.
    
    result = re.sub(r"7SUR7.CD|politico.cd|actualite.cd|mediacongo.net", r"SITE_WEB", document, flags=re.IGNORECASE)
    return result

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

def pre_clean_document(document):
    """pre clean the document by removing the accents and replacing the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    and any other side of cleaning that we want to do .
    Args:
        document (_type_): _description_
    """
    result = remove_accents(document)
    result =  replace_website_name(result)
    result = replace_point(result)
    result = re.sub(r"This post has already been read \d+ times!", "", result) # remove unwanted text
    result = unicode_normalize("NFKD", result)
    return result

In [156]:
preprocessor = CustomPreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    language="fr",
    custom_preprocessor=pre_clean_document,
)


cd_news_docs = preprocessor.process(all_cd_news_docs)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 140235/140235 [07:51<00:00, 297.51docs/s]


In [157]:

print(f"\nn_docs_output: {len(cd_news_docs)}")


n_docs_output: 299449


In [158]:
write_documents_parallel(document_store, cd_news_docs)

I am inside the function


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 299449/299449 [00:04<00:00, 70277.26it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [05:16<00:00, 10.55s/it]


#### Retrieving Documents

In [28]:
%%script false --no-raise-error
write_documents_parallel(document_store, scan_results)

#### PIAF Dataset

In [29]:
from haystack.nodes import BM25Retriever

In [30]:
bm25_retriever = BM25Retriever(document_store=document_store, all_terms_must_match=False)

In [31]:
import pandas as pd

In [33]:
from pathlib import Path
DATA_PATH = Path.cwd().parent.joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"

In [38]:
piaf_file

PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/corpus/raw/piaf/question-reponse.csv')

In [None]:
/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/corpus/raw/piaf/question-reponse.csv

In [187]:
piaf_file_without_context = DATA_PATH.joinpath("corpus", "raw", "piaf", "question-reponse.csv")

In [39]:
assert piaf_file.exists(), "the piaf dataset does not exist"

piaf_question = data

In [40]:
piaf_df_without_context = pd.read_csv(piaf_file_without_context)

In [41]:
sample_question_response = piaf_df_without_context.sample(1)
question = deaccent(sample_question_response.question.values[0])
response = deaccent(sample_question_response.reponse.values[0])


In [42]:
question

"Quel est le principal sponsor de l'equipe cycliste Vacansoleil-DCM?"

In [273]:
def get_positive_context(retriever: BM25Retriever, search_query:str, answer:str, positive_documents: int = 100) -> List[Document]:    
        """given entitity retrieve the positive context
        we will first retrieve the top  100 documents , 
        - if the answer is in the top 40 document the input of the reader is the top 40 documents
        if the top 40 documents does not contain the answer we check whithin the top 41 to 100 document if the anwer is ther and we put it ther.
        other wise we discard the sentence

        Args:
            retriever (BM25Retriever): _description_
            n_ctxs (int, optional): _description_. Defaults to 15.
            entity (Entity, optional): _description_. Defaults to None.
        """
        list_pos_ctxs = []
        retrieved_docs = retriever.retrieve(query=search_query, top_k=positive_documents)
        for index, retrieve_doc in enumerate(retrieved_docs[0:40]):
            if deaccent(answer.lower()) in retrieve_doc.content.lower():
                list_pos_ctxs.append(
                    {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                )
        if len(list_pos_ctxs) == 0:
            for index, retrieve_doc in enumerate(retrieved_docs[40:100]):
                if deaccent(answer.lower()) in retrieve_doc.content.lower():
                    list_pos_ctxs.append(
                        {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                    )
        else:
            pass
        if len(list_pos_ctxs) == 0:
            return []
        return list_pos_ctxs

#### Use Piaf Dataset to query context

In [45]:
from datasets import load_dataset

In [46]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
piaf_dataset = piaf_dataset["train"]

In [54]:
retrieved_docs

[{'title': 'Thurgood Marshall United State Courthouse',
  'content': "Le Thurgood Marshall United States Courthouse est un gratte-ciel de style neoclassique de 180 metres de hauteur construit a New York de 1933 a 1936 qui abrite un tribunal. L'immeuble a ete concu par l'architecte Cass Gilbert, le concepteur du Woolworth Building."},
 {'title': 'Architecture à New York',
  'content': "Le Woolworth Building, œuvre de l'architecte Cass Gilbert (1913), avec ses 60 etages, depassait alors la Metropolitan Life Tower. Les trois premiers niveaux sont pares d'un beau calcaire remplace aux niveaux suivants par de la terre cuite. La tendance neogothique a pousse l'architecte a ajouter des faux contreforts et des gargouilles. Compte tenu du gigantisme de l'edifice, les elements decoratifs ont ete surdimensionnes afin d'etre apercus depuis la rue. En 1924, Raymond Hood s'occupe de l'American Radiator Building de New York qu'il habille de couleurs et qu'il coiffe d'une decoration de terre cuite dor

In [183]:
%%script false --no-raise-error
for index in tqdm(piaf_df_without_context.index):
    question = piaf_df_without_context.loc[index].question
    response = piaf_df_without_context.loc[index].reponse
    question = deaccent(question)
    response = deaccent(response)
    retrieved_docs = get_positive_context(retriever=bm25_retriever, search_query=question, answer=response, positive_documents=100)
    piaf_df_without_context.loc[index, "positive_context"] = json.dumps(retrieved_docs)

By using our wiki corpus we are able to find some question with positive context, we will leverage them to build our qa system

In [188]:
piaf_with_context = piaf_df_without_context.loc[piaf_df_without_context.positive_context.apply(lambda x: len(json.loads(x))) > 0]

In [195]:
piaf_df_without_context = piaf_df_without_context.loc[piaf_df_without_context.positive_context.apply(lambda x: len(json.loads(x))) == 0]

In [196]:
piaf_df_without_context.head()

Unnamed: 0,question,reponse,positive_context
2,Comment fût payé le bâtiment commandé par Franck Woolworth?,en cash,[]
5,Quelle femme devint reine aux côtés de Philippe le Bel ?,Jeanne Ire de Navarre,[]
8,Quelle raison pousse Philippe Le Bel à organiser les premiers Etats généraux ?,pour lever de nouveaux impôts,[]
9,Quel souverain utilise les dévaluations monétaires pour s'enrichir ?,Philippe IV le Bel,[]
10,Quel souverain rattacha le compté de Toulouse au royaume ?,Philippe III le Hardi,[]


In [202]:
piaf_with_context = piaf_with_context.drop_duplicates(subset=["question", "reponse"])

with our dataset , we can find that {{piaf_with_context.shape[0]}} have positive context and that will be usefull to fine tune our model.

In [67]:
piaf_with_context = piaf_with_context.assign(length_positive_context = piaf_with_context["positive_context"].apply(lambda x: len(json.loads(x))))

In [68]:
piaf_with_context = piaf_with_context.loc[piaf_with_context.length_positive_context > 1].sort_values(by="length_positive_context", ascending=False)

In [390]:
piaf_with_context["contexts"] = piaf_with_context.positive_context.apply(lambda x: json.loads(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  piaf_with_context["contexts"] = piaf_with_context.positive_context.apply(lambda x: json.loads(x))


In [393]:
piaf_with_context.index.name = "id"

In [399]:
piaf_with_context = piaf_with_context.reset_index().loc[:, ["id", "question", "reponse", "contexts"]].rename({"reponse":"answer"}, axis="columns")

In [400]:
piaf_with_context.head()

Unnamed: 0,id,question,answer,contexts
0,0,Quel architecte fût à l'origine des plans du Woolworth building?,Cass Gilbert,"[{'title': 'Thurgood Marshall United State Courthouse', 'content': 'Le Thurg..."
1,1,Où se trouvait Franck Woolworth lors de l'inauguration de son immeuble New Y...,Washington,"[{'title': 'Rue Washington', 'content': ' 38-44 (et 29-31, rue de Berri) : i..."
2,3,En quelle année ouvrit le Woolworth Building ?,1913,"[{'title': 'Kay Johnson', 'content': 'Nee Katharine Johnson, elle etait la f..."
3,4,Qui commanda la construction du Woolworth Building ?,Frank Woolworth,"[{'title': 'Barbara Hutton', 'content': 'Edna Hutton se suicida alors que Ba..."
4,6,Quel créancier du roi fut supprimé en 1312 ?,l'ordre du Temple,"[{'title': 'Couvent de l'ordre du Christ', 'content': 'L'ordre du Temple a e..."


In [384]:
piaf_with_context_from_wikipedia = DATA_PATH.joinpath("corpus", "french-qa", "piaf-with-from-wikipedia-bm25")

In [385]:
piaf_with_context_from_wikipedia.mkdir(exist_ok=True)

In [402]:
for element in piaf_with_context.iterrows():
    key = element[0]
    with open(piaf_with_context_from_wikipedia.joinpath(f"{key}.json"), "w") as f:
        element[1].to_json(path_or_buf=f, orient="index", force_ascii=False, indent=4)

In [203]:
piaf_with_context.to_csv(DATA_PATH.joinpath("corpus", "raw", "piaf", "piaf_with_context.csv"))

In [None]:
DATA_PATH.joinpath("corpus", "french-qa",

In [204]:
piaf_with_context.shape

(5565, 3)

within our dataset , 5550 row have more than one context.

At this point , we have a dataset with context and we have contributed to something

### Adding Additional Context to PIAF

With our dataset with context, let us pull more question with context for the original piaf dataset with context

In [72]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [75]:
piaf_df = piaf_dataset.to_pandas()

In [76]:
piaf_df.head()

Unnamed: 0,id,title,context,question,answers
0,p140295443291664,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien de personnes travaillent au ministère des sports,"{'text': ['100 000'], 'answer_start': [472]}"
1,p140295443291520,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien d'employeurs,"{'text': ['20 000'], 'answer_start': [597]}"
2,p140295443291376,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel part du budget des ménages,"{'text': ['50'], 'answer_start': [46]}"
3,p140295443291088,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2003,"{'text': ['14,2 milliards'], 'answer_start': [68]}"
4,p140295443290872,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2019,"{'text': ['12 milliards'], 'answer_start': [102]}"


What are we trying to achive with this ?


We have question on the piaf dataset with context, for those question we will query elastic search to find out  the additional context. That context will will be consider as additional context to train our model with.


The algo : 

- for each question , query the elastic search and keep the top 10 documents context
- we will loop and send queries in a batch of 10 questions to elastic search
- retrieved the context and and save anything on the disk.

In the future to improve the quality of our finding we can consider only documents with named entities in the answers. We can check the paragraph , run the NER model on it and then check if the answer is in the named entities.

In [77]:
sample_questions = np.vectorize(deaccent)(piaf_df.question.loc[1:5])
retrieved_docs = bm25_retriever.retrieve_batch(queries=sample_questions.tolist(), top_k=30)



In [205]:
async def query_batch(retriever, queries, top_k=10):
    return retriever.retrieve_batch(queries=queries, top_k=top_k)

In [206]:
def decent_vectorize(queries):
    return np.vectorize(deaccent)(queries)

In [207]:
question_chunk = [decent_vectorize(piaf_df.question.loc[1:5]), decent_vectorize(piaf_df.question.loc[5:10])]

In [208]:
question_chunk

[array(["Combien d'employeurs", 'Quel part du budget des menages',
        'Quel montant en 2003', 'Quel montant en 2019',
        'En quelle annee Jakob Bohme tombe-t-il malade ?'], dtype='<U47'),
 array(['En quelle annee Jakob Bohme tombe-t-il malade ?',
        'Qui est mort en juillet ?',
        'Quel est le metier de Nicolas Thomas ?',
        'Que doit subir Jakob Bohme avant d’obtenir les derniers sacrements ?',
        'Pourquoi Bohme enfle-t-il ?',
        'Quel pays est surnomme la "perle de l\'Afrique" ?'], dtype='<U68')]

In [209]:
async def main():
    return await tqdm_asyncio.gather(*[query_batch(bm25_retriever, queries) for queries in question_chunk])

In [210]:
def write_to_json(data, path):
    with open(path, "w") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [211]:
question_chunk[0][-1]

'En quelle annee Jakob Bohme tombe-t-il malade ?'

In [212]:
piaf_dataset[0]

{'id': 'p140295443291664',
 'title': 'Sport',
 'context': "Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 milliards d'euros en 2003 et 12 milliards d'euros en 2019), contre 7,9 milliards d'euros pour les collectivités locales, 3,2 pour l'État, et 2,2 pour les entreprises. Parmi les dépenses sportives des ménages en 2003, 3,7 milliards sont consacrés aux vêtements de sport et chaussures, 2 aux biens durables, 2,7 aux autres biens et 5,8 aux services. Le Ministère de la Jeunesse et des Sports estime à 100 000 (58 % d'hommes pour 42 % de femmes) le nombre de salariés travaillant pour le secteur sportif en France pour quelque 20 000 employeurs.",
 'question': 'Combien de personnes travaillent au ministère des sports',
 'answers': {'text': ['100 000'], 'answer_start': [472]}}

In [213]:
import json

In [234]:
def save_to_json(instances, path):
    "saves an iterator of multiple json files to files in the path directory"
    for instance in instances:
        with open(path.joinpath(f"{instance['id']}.json"), "w") as f:
            json.dump(instance, f, indent=4, ensure_ascii=False)

In [316]:
async def process_queries_chunk(retriever:BM25Retriever, queries_chunk: pd.DataFrame, path):
    """tak a query chunk process it , query the elastic, instance and write the document to the file.

    Args:
        queries_chunk (pd.DataFrame): _description_
    """
    questions = decent_vectorize(queries_chunk.question)
    titles = queries_chunk.title
    answers = [answer.get("text")[0] for answer in queries_chunk.answers]
    contexts = queries_chunk.context
    ids = queries_chunk.id
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=100)
    instances = process_answers(ids=ids, 
                    questions=questions,
                    titles=titles,
                    answers=answers,
                    contexts=contexts,
                    retrieved_docs=retrieved_docs)
    save_to_json(instances, path)
    

In [343]:
def process_doc(retrieved_docs, answer, main_doc):
    """
    convert a list of retrieved document to list of array of list and content
    """
    doc_list = []
    for doc in retrieved_docs:
        content = deaccent(doc.content)
        if answer in doc.content and content != deaccent(main_doc):
            doc_ = {
                "title": doc.meta.get("title"),
                "content": deaccent(doc.content),
                }
            doc_list.append(doc_)
        else:
            pass
    return doc_list

In [318]:
piaf_with_multi_context_path = DATA_PATH.joinpath("corpus", "french-qa", "piaf-with-multi-context")
assert piaf_with_multi_context_path.exists()

In [344]:
def process_answers(ids, questions, titles, answers, contexts, retrieved_docs):
    """process answers and write them to the file.

    Args:
        questions (_type_): _description_
        titles (_type_): _description_
        answers (_type_): _description_
        contexts (_type_): _description_
        retrieved_docs (_type_): _description_
    """
    for id_, question, title, answer, context, retrieved_doc in zip(ids, questions, titles, answers, contexts, retrieved_docs):
        contexts = [{"title": title, "content": context}]
        additional_context = process_doc(retrieved_doc, answer, context)
        contexts.extend(additional_context)
        instance_json = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "id": id_,
        }
        yield instance_json

    

In [345]:
async def main():
    await tqdm_asyncio.gather(*[process_queries_chunk(bm25_retriever, queries, piaf_with_multi_context_path) for _, queries in piaf_df.groupby(np.arange(len(piaf_df))//5)])

In [346]:

await main()

100%|█████████████████████████████████████████| 767/767 [03:12<00:00,  3.98it/s]


###  Adding Additional Context to Fquad

In [347]:
frquad_path_train = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "train.json")
frquad_path_valid = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "valid.json")

In [348]:
assert frquad_path_train.exists()
assert frquad_path_valid.exists()

In [349]:
with open(frquad_path_train, "r") as f:
    frquad_train = json.load(f)

In [350]:
frquad_train = frquad_train.get("data")

In [351]:
with open(frquad_path_valid, "r") as f:
    frquad_valid = json.load(f)

In [352]:
frquad_valid = frquad_valid.get("data")

In [353]:
fquad_path = DATA_PATH.joinpath("corpus", "french-qa", "fquad-with-multi-context")
fquad_path_output_train = fquad_path.joinpath("train")
fquad_path_output_valid = fquad_path.joinpath("valid")

In [354]:
for doc in frquad_train[0:2]:
    print(doc.get("title"))

(1)-cérès
american-idiot


In [355]:
async def process_fquad_paragraph(retriever, paragraph, title, path):
    """
    this will loop over the  a paragraph in frquad dataset.
    since each paragraph have a list of question ,
     it will return query the elasticsearch for each batch of question 
    and return the additional related_context

    Args:
        paragraph (_type_): _description_
    """
    context  = paragraph.get("context")
    question_answers = paragraph.get("qas")
    ids = [qa.get("id") for qa in question_answers]
    questions = [qa.get("question") for qa in question_answers]
    answers = [qa.get("answers")[0].get("text") for qa in question_answers]
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=100)
    instances = process_answers(ids=ids, 
                                questions=questions,
                                titles=repeat(title, len(questions)),
                                answers=answers,
                                contexts=repeat(context, len(questions)),
                                retrieved_docs=retrieved_docs)
    save_to_json(instances, path)

In [356]:
assert fquad_path.exists()

In [357]:
def check_answer_in_retrieved_docs(answer, retrieved_docs):
    for doc in retrieved_docs:
        if answer in doc.content:
            print(doc)
            return True
    return False

In [358]:
await process_fquad_paragraph(bm25_retriever, frquad_train[0]["paragraphs"][0], "Cérès", fquad_path_output_train)

In [359]:
async def process_fquad(fquad, path, retriever):
    all_article_processor = list()
    for document in tqdm_asyncio(fquad):
        title = document.get("title")
        paragraphs = document.get("paragraphs")
        doc_coroutine = tqdm_asyncio.gather(*[process_fquad_paragraph(retriever=bm25_retriever, paragraph=paragraph, title=title, path=path) for paragraph in paragraphs])
        all_article_processor.append(doc_coroutine)
    return await tqdm_asyncio.gather(*all_article_processor)

In [360]:
await process_fquad(frquad_train, fquad_path_output_train, bm25_retriever)

100%|███████████████████████████████████████| 117/117 [00:00<00:00, 3601.32it/s]
  0%|                                                   | 0/117 [00:00<?, ?it/s]
  0%|                                                     | 0/9 [00:00<?, ?it/s][A

  0%|                                                    | 0/61 [00:00<?, ?it/s][A[A


  0%|                                                    | 0/34 [00:00<?, ?it/s][A[A[A



  0%|                                                    | 0/44 [00:00<?, ?it/s][A[A[A[A




  0%|                                                   | 0/145 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                    | 0/24 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                    | 0/64 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                    | 0/27 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                         

100%|███████████████████████████████████████████| 21/21 [17:27<00:00, 49.86s/it][A


100%|███████████████████████████████████████████| 22/22 [17:27<00:00, 47.59s/it][A[A



100%|███████████████████████████████████████████| 76/76 [17:27<00:00, 13.78s/it][A[A[A




100%|███████████████████████████████████████████| 52/52 [17:27<00:00, 20.14s/it][A[A[A[A





100%|███████████████████████████████████████████| 28/28 [17:27<00:00, 37.39s/it][A[A[A[A[A






100%|███████████████████████████████████████████| 38/38 [17:27<00:00, 27.55s/it][A[A[A[A[A[A







100%|███████████████████████████████████████████| 41/41 [17:27<00:00, 25.54s/it][A[A[A[A[A[A[A








100%|████████████████████████████████████████████| 7/7 [17:27<00:00, 149.58s/it][A[A[A[A[A[A[A[A









100%|███████████████████████████████████████████| 16/16 [17:27<00:00, 65.44s/it][A[A[A[A[A[A[A[A[A










100%|███████████████████████████████████████████| 96/96 [17:27<00:00, 10.91s/it][

100%|████████████████████████████████████████████| 9/9 [17:27<00:00, 116.34s/it][A[A[A[A[A






100%|███████████████████████████████████████████| 54/54 [17:27<00:00, 19.39s/it][A[A[A[A[A[A







100%|███████████████████████████████████████████| 34/34 [17:27<00:00, 30.80s/it][A[A[A[A[A[A[A








100%|███████████████████████████████████████████| 53/53 [17:27<00:00, 19.76s/it][A[A[A[A[A[A[A[A









100%|███████████████████████████████████████████| 32/32 [17:27<00:00, 32.72s/it][A[A[A[A[A[A[A[A[A










100%|███████████████████████████████████████████| 64/64 [17:27<00:00, 16.36s/it][A[A[A[A[A[A[A[A[A[A











100%|███████████████████████████████████████████| 23/23 [17:27<00:00, 45.52s/it][A[A[A[A[A[A[A[A[A[A[A












100%|███████████████████████████████████████████| 27/27 [17:27<00:00, 38.78s/it][A[A[A[A[A[A[A[A[A[A[A[A













100%|███████████████████████████████████████████| 11/11 [17:27<00:00, 95

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  Non

In [361]:
await process_fquad(frquad_valid, fquad_path_output_valid, bm25_retriever)

100%|█████████████████████████████████████████| 18/18 [00:00<00:00, 2691.82it/s]
  0%|                                                    | 0/18 [00:00<?, ?it/s]
  0%|                                                    | 0/84 [00:00<?, ?it/s][A

  0%|                                                    | 0/35 [00:00<?, ?it/s][A[A


  0%|                                                    | 0/38 [00:00<?, ?it/s][A[A[A



  0%|                                                    | 0/31 [00:00<?, ?it/s][A[A[A[A




  0%|                                                    | 0/12 [00:00<?, ?it/s][A[A[A[A[A





  0%|                                                    | 0/59 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|                                                    | 0/42 [00:00<?, ?it/s][A[A[A[A[A[A[A







  0%|                                                    | 0/39 [00:00<?, ?it/s][A[A[A[A[A[A[A[A








  0%|                                         

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None

In [362]:
def save_doc_to_json(doc, file):
    "saves an iterator of multiple json files to files in the path directory"
    with open(file, "w") as f:
        json.dump(doc, f, indent=4, ensure_ascii=False)

In [363]:
def count_positive_context(path):
    json_files = path.glob('**/*.json')
    document_with_more = 0
    for file in json_files:
        with open(file, "r") as buffer:
            json_doc = json.load(buffer)
            contexts = json_doc["contexts"]
            if len(contexts) >1 :
                document_with_more +=1
    return document_with_more

In [364]:
count_piaf = count_positive_context(piaf_with_multi_context_path)

In [365]:
count_train_fquad = count_positive_context(fquad_path_output_train)

In [366]:
count_valid_fquad = count_positive_context(fquad_path_output_valid)

In [367]:
count_train_fquad

7418

In [368]:
count_valid_fquad

1130

In [369]:
count_piaf

1134

### Exetat_questions

At this point we have the piaf dataset with context, we have the frquad with context, let us look now the exetat questions with context.

In [264]:
exetat_questions_path = DATA_PATH.joinpath("corpus", "french-qa", "exetat-questions", "questions.json")

In [265]:
assert exetat_questions_path.exists()

In [266]:
with open(exetat_questions_path, "r") as f:
    exetat_questions = json.load(f)

In [267]:
len(exetat_questions)

173

for now we can say that we have 173 exams questions, we need more and get more context to train.frquad_train
But for now we are going to train our model on the conbinaision of piaf and frenchquad dataset.

In [274]:
def process_exetat_question(retriever, questions):
    """
    retrieve paragraph containing the answers from the extetat question

    Args:
        paragraph (_type_): _description_
    """
    processed_questions = []
    for data in tqdm(questions):
        question = data.get("question")
        correct_index = int(data.get("correct_index"))
        options = data.get("options")
        answer = options[correct_index]
        contexts = get_positive_context(retriever, question, answer, positive_documents=300)
        processed_object = {"question": question, "answer": answer, "contexts": contexts}
        processed_questions.append(processed_object)
    return processed_questions
        
        

In [275]:
exetat_questions_processed = process_exetat_question(bm25_retriever, exetat_questions)


  0%|                                                   | 0/173 [00:00<?, ?it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'attribution qui est exclusivement réservée au parlement", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  1%|▏                                          | 1/173 [00:00<00:43,  3.91it/s][A
  1%|▍                                          | 2/173 [00:00<00:31,  5.46it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez les éléments constitutifs du nationalisme', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le Facilitateur Eden Kodjo désigné par l'Union Africaine (U.A.) en vue d'amorcer le dialogue national inclusif en R.D.C. est de nationalité", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  2%|▋                                          | 3/173 [00:00<00:38,  4.40it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez le seul pays au monde qui a supprimé l'armée en 1949 et dont l'ordre publique est assurée par la Garde royale", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  2%|▉                                          | 4/173 [00:01<01:16,  2.21it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez le pays d'origine de Moussa Faki actuel Président de la Commission de l'Union Africaine", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  3%|█▏                                         | 5/173 [00:01<01:06,  2.54it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le système politique par lequel le peuple exerce librement le pouvoir à travers ses élus est appelé:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  3%|█▍                                         | 6/173 [00:02<00:59,  2.79it/s][A
  4%|█▋                                         | 7/173 [00:02<00:47,  3.48it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'ONU est née sur les cendres de (de la) (de l').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition exacte (III) où les organisation ci-après (I) sont associées correctement à leurs sièges respectifs (II).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  5%|█▉                                         | 8/173 [00:02<00:44,  3.70it/s][A
  5%|██▏                                        | 9/173 [00:02<00:37,  4.33it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'En Afrique, le Drakensberg est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi ces propositions, indiquez le bief non navigable du fleuve Congo ayant comme handicap naturel: les chutes des Portes d'Enfer.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  6%|██▍                                       | 10/173 [00:02<00:44,  3.62it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'Economie de la République Démocratique du Congo présente diverses caractéristiques dont l'une se traduit par le manque de croissance et s'accompagne d'une dégradation du tissu économique. une telle économie est dite:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  6%|██▋                                       | 11/173 [00:03<00:47,  3.40it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des propositions ci-dessous associe une espèce animale à son milieu naturel localisé dans la province du Katanga.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  7%|██▉                                       | 12/173 [00:03<00:50,  3.16it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le type d'homme ci-après a vécu exclusivement en Afrique.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  8%|███▏                                      | 13/173 [00:04<01:03,  2.54it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des personnalités ci-dessous commandait les troupes de la Force publique lors de la prise de kigali le 06 mai 1916. Il s'agit de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  8%|███▍                                      | 14/173 [00:04<01:04,  2.46it/s][A
  9%|███▋                                      | 15/173 [00:04<00:51,  3.07it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Dans la composition chimique de l'orge, l'eau a un pourcentage de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Un commercant dispose de 140.650 carats de diamant dans son comptoir. S'il veut exprimer le carat en once au moment au moment de la commercialisation, le nombre de onces sera de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



  9%|███▉                                      | 16/173 [00:05<00:49,  3.20it/s][A
 10%|████▏                                     | 17/173 [00:05<00:39,  3.98it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'airain est un:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Dans un laboratoire, les défauts de surface des produits métallurgiques sont détectés par:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 10%|████▎                                     | 18/173 [00:05<00:33,  4.68it/s][A
 11%|████▌                                     | 19/173 [00:05<00:28,  5.46it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le jus extrait par le broyage de la canne à sucre est appelé:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le pacte républicain des acteurs politiques congolais était signé à Gaborone en présence de:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 12%|████▊                                     | 20/173 [00:05<00:27,  5.51it/s][A
 12%|█████                                     | 21/173 [00:05<00:27,  5.53it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Déterminez l'organisme spécialisé de l'ONU qui s'occupe de l'éducation, la science et la culture.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition où les partis politiques (I) et leurs présidents (II) sont correctement associés (III).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 13%|█████▎                                    | 22/173 [00:06<00:33,  4.56it/s][A
 13%|█████▌                                    | 23/173 [00:06<00:31,  4.84it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "En Afrique, l'esclavage est une des causes des migrations résultant de facteurs:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le thé est un(e):', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 14%|█████▊                                    | 24/173 [00:06<00:28,  5.30it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des propositions ci-dessous cadre mieux avec les réussites de l'Union Européenne. Il s'agit de (le, la, l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 14%|██████                                    | 25/173 [00:06<00:30,  4.87it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Les frontières de la République Démocratique du Congo sont très longues (9.045km). Pour rentabiliser la fiscalité, le Gouvernement doit au préalable:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 15%|██████▎                                   | 26/173 [00:06<00:31,  4.62it/s][A
 16%|██████▌                                   | 27/173 [00:07<00:30,  4.77it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les restes de Zinjanthrope découverts par Lewis Leakey dans la gorge d'Oldoway se classent dans la catégorie des sources dites:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La conférence de Yalta (1955) a vu la participation de l'un des pays ci-dessous. Il s'agit de, (du, des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 16%|██████▊                                   | 28/173 [00:07<00:31,  4.67it/s][A
 17%|███████                                   | 29/173 [00:07<00:28,  5.06it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La politique coloniale française fut caractérisée en Afrique par le système:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le gas-oil est utilisé comme (pour)', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 17%|███████▎                                  | 30/173 [00:07<00:27,  5.28it/s][A
 18%|███████▌                                  | 31/173 [00:07<00:26,  5.44it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le nom scientifique du palmier à huile.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le métal le plus ductible de tous les métaux s'appelle le(l').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 18%|███████▊                                  | 32/173 [00:07<00:23,  5.96it/s][A
 19%|████████                                  | 33/173 [00:08<00:23,  5.98it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La peinture est fabriqué à base d'huile de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les aciers au chrome sont utilisés pour la fabrication de (d', des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 20%|████████▎                                 | 34/173 [00:08<00:21,  6.48it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le système politique où le pouvoir appartient aux femmes.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 20%|████████▍                                 | 35/173 [00:08<00:25,  5.39it/s][A
 21%|████████▋                                 | 36/173 [00:08<00:25,  5.35it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "En rapport avec le terme Xénophobie, indiquez l'affirmation correcte.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le(s) nom(s) de(s) principale(s) espèce(s) animale(s) protégée(s) au Parc National de Salonga Nord et Sud en République Démocratique du Congo.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 21%|████████▉                                 | 37/173 [00:09<00:32,  4.13it/s][A
 22%|█████████▏                                | 38/173 [00:09<00:30,  4.49it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'indiquez le fleuve européen qui se jette dans la mer Noire.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les provinces administratives de la République démocratique du Congo citées ci-dessous, la plus grande productrice de la cassitérite est le(la,l')", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 23%|█████████▍                                | 39/173 [00:09<00:35,  3.76it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le lac Mai-Ndombe appartient à la catégorie:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 23%|█████████▋                                | 40/173 [00:09<00:35,  3.80it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de la République démocratique du Congo ci-dessous, le dessinateur a indiqué par le chiffre 3 et la lettre E, respectivement le port et la rivière (de, d'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 24%|█████████▉                                | 41/173 [00:10<00:42,  3.08it/s][A


Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Considérant la notion de date en Histoire, que signifie la 'Chronologie relative'", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


 24%|██████████▏                               | 42/173 [00:10<00:37,  3.47it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Dégagez ci-dessous l'affirmation correcte relative au déclin de l'empire Kanem-Bornou", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 25%|██████████▍                               | 43/173 [00:10<00:37,  3.51it/s][A
 25%|██████████▋                               | 44/173 [00:10<00:32,  3.94it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le but des expéditions des explorateurs portugais vers le sud', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez ci- dessous la (les) station(s) fondée(s) par l'explorateur Dr Wolf.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 26%|██████████▉                               | 45/173 [00:11<00:32,  3.98it/s][A
 27%|███████████▏                              | 46/173 [00:11<00:27,  4.56it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'industrie lithique appartenant à l'homo sapiens", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Lorsqu'un historien se livre aux opérations d'analyse du contenu en vue de s'assurer de la valeur du témoignage, il fait de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 27%|███████████▍                              | 47/173 [00:11<00:29,  4.31it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'auteur qui pense que ''le philosophe est celui qui possède la totalité du savoir dans la mesure du possible'':", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 28%|███████████▋                              | 48/173 [00:11<00:33,  3.70it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'auteur de cette phrase ''Je ne puis être libre que si tous le sont'':", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 28%|███████████▉                              | 49/173 [00:12<00:34,  3.57it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Parmi les propositions ci-dessous, indiquez celle qui correspond au nombre de Titres de la Constitution de la République Démocratique du Congo du 18.02.2006', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 29%|████████████▏                             | 50/173 [00:12<00:40,  3.05it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "D'Aout 2203 à Mai 2007, la Chambre haute du parlement de la République Démocratique du Congo a été dirigée par:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 29%|████████████▍                             | 51/173 [00:13<00:41,  2.93it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Concernant l'étude économique des entités administratives de la République Démocratique du Congo, désignez la paire des produits agricole et minier exploités dans la province de Bandundu.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 30%|████████████▌                             | 52/173 [00:13<00:46,  2.59it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les epsèces animales protégées dans les aires gérées par l'Institut Congolais pour la Conservation de la Nature (ICCN) en République Démocratique du Congo, indiquez celle appartenant au Parc National de Salongo", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 31%|████████████▊                             | 53/173 [00:14<00:48,  2.46it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'indiquez le nom du barrage africain construit sur le fleuve Niger.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 31%|█████████████                             | 54/173 [00:14<00:43,  2.76it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les propositions ci-dessous, déterminez le groupe de produits dont l'Egypt est premier producteur.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 32%|█████████████▎                            | 55/173 [00:14<00:47,  2.50it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'institution de l'Union Européenne (U.E.) qui gère le budget est:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 32%|█████████████▌                            | 56/173 [00:15<00:43,  2.71it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de la République Démocratique du Congo ci-contre, le dessinateur a indiqué par le chiffre 5 et la lettre d respectivement le port et la rivière (de,d'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 33%|█████████████▊                            | 57/173 [00:15<00:47,  2.46it/s][A
 34%|██████████████                            | 58/173 [00:15<00:38,  3.02it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Sur le plan de la subdivision administrative, en 1914, le Congo Belge comptait:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "En rapport avec l'histoire économique de la République Démocratique du Congo, l'année 1969 nous rappelle:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 34%|██████████████▎                           | 59/173 [00:16<00:36,  3.15it/s][A
 35%|██████████████▌                           | 60/173 [00:16<00:30,  3.75it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'En 1993, le prix Nobel de la paix a été attribué à:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le statut politique de la Zambie est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 35%|██████████████▊                           | 61/173 [00:16<00:30,  3.65it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le Premier Ministre de la République Démocratique du Congo du 23 Avril 1981 au 5 Novembre 1982 est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 36%|███████████████                           | 62/173 [00:16<00:36,  3.04it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'organe de l'union Africaine qui a pour mission de préparer la conférence des chefs d'Etat et d'approuver le budget est:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 36%|███████████████▎                          | 63/173 [00:17<00:36,  3.03it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "''Les filles sont capricieuses; Or Kingombe est une fille; Donc Kingombe est capricieuse''. Ce syllogisme est la meilleure expression:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 37%|███████████████▌                          | 64/173 [00:17<00:34,  3.17it/s][A
 38%|███████████████▊                          | 65/173 [00:17<00:29,  3.69it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "''Vous ne payez pas, vous n'entrez pas''. Ce raisonnement constitue:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Ce qui est important, c'est ''de savoir si en général nous possédons la vérité et quel moyen nous avons pour nous en assurer''. Indiquez l'auteur de cette préoccupation", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 38%|████████████████                          | 66/173 [00:18<00:31,  3.37it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La nature particulière d'agir s'appelle:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 39%|████████████████▎                         | 67/173 [00:18<00:31,  3.38it/s][A
 39%|████████████████▌                         | 68/173 [00:18<00:27,  3.82it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Après examen du carré logique, indiquez les propositions contraires', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "''Certains enfants sont malins; Or quelques élèves sont enfant''. Ce syllogisme est invalide parce que:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 40%|████████████████▊                         | 69/173 [00:18<00:29,  3.47it/s][A


Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le pacte républicain des auteurs politiques congolais était signé à Gaborone en présence de:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


 40%|████████████████▉                         | 70/173 [00:19<00:27,  3.81it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Déterminez l'organisme spécialisé de l'ONU qui s'occupe de l'éducation, la science et la culture.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 41%|█████████████████▏                        | 71/173 [00:19<00:27,  3.70it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition où les partis politiques (I) et leurs présidents (II) sont correctement associés (III).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 42%|█████████████████▍                        | 72/173 [00:19<00:28,  3.57it/s][A
 42%|█████████████████▋                        | 73/173 [00:19<00:22,  4.40it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "En Afrique, l'esclavage est une des causes des migrations résultant de facteurs:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le thé est un(e):', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 43%|█████████████████▉                        | 74/173 [00:19<00:18,  5.26it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des propositions ci-dessous cadre mieux avec les réussites de l'Union Européenne. Il s'agit de (le, la, l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 43%|██████████████████▏                       | 75/173 [00:20<00:21,  4.60it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Les frontières de la République Démocratique du Congo sont très longues (9.045km). Pour rentabiliser la fiscalité, le Gouvernement doit au préalable:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 44%|██████████████████▍                       | 76/173 [00:20<00:22,  4.41it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les restes de Zinjanthrope découverts par Lewis Leakey dans la gorge d'Oldoway se classent dans la catégorie des sources dites:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 45%|██████████████████▋                       | 77/173 [00:20<00:21,  4.49it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La conférence de Yalta (1955) a vu la participation de l'un des pays ci-dessous. Il s'agit de, (du, des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 45%|██████████████████▉                       | 78/173 [00:21<00:25,  3.69it/s][A
 46%|███████████████████▏                      | 79/173 [00:21<00:23,  4.08it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La politique coloniale française fut caractérisée en Afrique par le système:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le gas-oil est utilisé comme (pour)', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 47%|███████████████████▋                      | 81/173 [00:21<00:16,  5.61it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le nom scientifique du palmier à huile.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le métal le plus ductible de tous les métaux s'appelle le(l').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 47%|███████████████████▉                      | 82/173 [00:21<00:15,  5.92it/s][A
 48%|████████████████████▏                     | 83/173 [00:21<00:14,  6.04it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La peinture est fabriqué à base d'huile de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les aciers au chrome sont utilisés pour la fabrication de (d', des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 49%|████████████████████▍                     | 84/173 [00:21<00:13,  6.54it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition où les partis politiques (I) et leurs présidents (II) sont correctement associés (III).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 49%|████████████████████▋                     | 85/173 [00:22<00:18,  4.78it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le système politique qui concentre le pouvoir entre les mains d'une seule personne est appelé:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 50%|████████████████████▉                     | 86/173 [00:22<00:24,  3.54it/s][A
 50%|█████████████████████                     | 87/173 [00:22<00:20,  4.21it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'En Afrique, le Nyiragongo est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les moyens de télécommunication cités ci-dessous, un seul assure le plus d'informations à plus large spectre en République Démocratique du Congo. Il s'agit de (du):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 51%|█████████████████████▎                    | 88/173 [00:23<00:31,  2.72it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de l'Afrique ci-contre, les chiffres 2 et 4 représentent respectivement les climats:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 51%|█████████████████████▌                    | 89/173 [00:23<00:32,  2.61it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La pyramide des âges en République Démocratique du Congo a un sommet rétréci. Cela veut dire que', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 52%|█████████████████████▊                    | 90/173 [00:24<00:29,  2.84it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le territoire de Yakoma situé dans la Province de l'Equateur est traversé par la rivière", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 53%|██████████████████████                    | 91/173 [00:24<00:26,  3.10it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La politique coloniale anglaise fut caractérisée en Afrique par le système:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 53%|██████████████████████▎                   | 92/173 [00:24<00:24,  3.32it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez ci-dessous le nom du souverain du royaume Kongo:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 54%|██████████████████████▌                   | 93/173 [00:24<00:22,  3.53it/s][A
 54%|██████████████████████▊                   | 94/173 [00:25<00:18,  4.20it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'étude des sceaux apposés sur les documents est la (l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition exacte (III) où les pays ci-après (I) sont associés correctement aux leaders de leur décolonisation respective (II).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 55%|███████████████████████                   | 95/173 [00:25<00:18,  4.24it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le type d'hommes ci-après constitue le deuxième chaînon de l'hominisation. Il s'agit de (l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 55%|███████████████████████▎                  | 96/173 [00:25<00:17,  4.31it/s][A
 56%|███████████████████████▌                  | 97/173 [00:25<00:16,  4.65it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'expression verbale d'un jugement s'appelle:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Né en 470 avant notre ère, maître de PLATON, donnait cours sans salaire, fondateur de la ''Maieutique'', mort en 399 sans laisser d'écrits. Cette courte biographie est de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 57%|███████████████████████▊                  | 98/173 [00:26<00:22,  3.31it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La branche de la philosophie qui essaie d'ériger une échelle de valeurs qui permet de distinguer le bien du mal est:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 57%|████████████████████████                  | 99/173 [00:26<00:24,  3.02it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'état de mentalité du Congolais responsable dans le service de recettes qui est tenté surtout de voler l'argent du trésor public a pour origine:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 58%|███████████████████████▋                 | 100/173 [00:26<00:25,  2.81it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le sens du Cinquantenaire de la République Démocratique du Congo quant à la détermination, au verdict et aux aactions à entreprendre prochainement.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 58%|███████████████████████▉                 | 101/173 [00:27<00:24,  2.95it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez celui qui en République Démocratique du Congo, est à la tête de l'organe de gestion du pouvoir judiciaire", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 59%|████████████████████████▏                | 102/173 [00:27<00:22,  3.18it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le nom du détroit européen qui est le passage de la Manche à la Mer du Nord.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 60%|████████████████████████▍                | 103/173 [00:27<00:19,  3.53it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Parmi les handicaps naturels cités ci-dessous et parsemés le long du fleuve Congo, indiquez celui entre Ubundu et Kisangani.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 60%|████████████████████████▋                | 104/173 [00:28<00:23,  2.98it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition (III) où la province congolaise (I) et son site touristique (II) sont correctement associés.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 61%|████████████████████████▉                | 105/173 [00:28<00:20,  3.25it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de l'Afrique ci-contre, le dessinateur a indiqué par le chiffre 3 le mont ou massif (de,d').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 61%|█████████████████████████                | 106/173 [00:28<00:22,  2.95it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez la portée exacte de la procédure d' ''émendatio''.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 62%|█████████████████████████▎               | 107/173 [00:29<00:19,  3.32it/s][A
 62%|█████████████████████████▌               | 108/173 [00:29<00:16,  3.84it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez ce qui caractérise le ''Paléolithique moyen''.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "A partir de 1880, les européens vont s'intéresser à l'Afrique pour ''raison scientifique'', c'est-à-dire:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 63%|█████████████████████████▊               | 109/173 [00:29<00:19,  3.33it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez la province de la République Démocratique du Congo qui contient à la fois les gisements diamantifères et l'or.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 64%|██████████████████████████               | 110/173 [00:29<00:18,  3.32it/s][A
 64%|██████████████████████████▎              | 111/173 [00:30<00:15,  3.92it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'alliage de l'étain utilisé dans les conserveries.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le rendement en métal pur d'uranium naturel est de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 65%|██████████████████████████▌              | 112/173 [00:30<00:13,  4.37it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les pays de culture vivrière et industrielle, indiquez celui qui est originaire de l'Afrique.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 65%|██████████████████████████▊              | 113/173 [00:30<00:14,  4.23it/s][A
 66%|███████████████████████████              | 114/173 [00:30<00:12,  4.63it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez le premier pays producteur mondial d'uranium.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le système politique par lequel le peuple exerce librement le pouvoir à travers ses élus est appelé:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 66%|███████████████████████████▎             | 115/173 [00:30<00:13,  4.26it/s][A
 67%|███████████████████████████▍             | 116/173 [00:31<00:11,  5.04it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'ONU est née sur les cendres de (de la) (de l').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition exacte (III) où les organisation ci-après (I) sont associées correctement à leurs sièges respectifs (II).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 68%|███████████████████████████▋             | 117/173 [00:31<00:12,  4.49it/s][A
 68%|███████████████████████████▉             | 118/173 [00:31<00:10,  5.29it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'En Afrique, le Drakensberg est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi ces propositions, indiquez le bief non navigable du fleuve Congo ayant comme handicap naturel: les chutes des Portes d'Enfer.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 69%|████████████████████████████▏            | 119/173 [00:31<00:12,  4.16it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'Economie de la République Démocratique du Congo présente diverses caractéristiques dont l'une se traduit par le manque de croissance et s'accompagne d'une dégradation du tissu économique. une telle économie est dite:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 69%|████████████████████████████▍            | 120/173 [00:32<00:14,  3.59it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des propositions ci-dessous associe une espèce animale à son milieu naturel localisé dans la province du Katanga.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 70%|████████████████████████████▋            | 121/173 [00:32<00:14,  3.52it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le type d'homme ci-après a vécu exclusivement en Afrique.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 71%|████████████████████████████▉            | 122/173 [00:32<00:14,  3.49it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des personnalités ci-dessous commandait les troupes de la Force publique lors de la prise de kigali le 06 mai 1916. Il s'agit de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 71%|█████████████████████████████▏           | 123/173 [00:33<00:15,  3.30it/s][A
 72%|█████████████████████████████▍           | 124/173 [00:33<00:12,  3.98it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Dans la composition chimique de l'orge, l'eau a un pourcentage de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Un commercant dispose de 140.650 carats de diamant dans son comptoir. S'il veut exprimer le carat en once au moment au moment de la commercialisation, le nombre de onces sera de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 72%|█████████████████████████████▌           | 125/173 [00:33<00:11,  4.13it/s][A
 73%|█████████████████████████████▊           | 126/173 [00:33<00:09,  4.85it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'airain est un:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Dans un laboratoire, les défauts de surface des produits métallurgiques sont détectés par:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 73%|██████████████████████████████           | 127/173 [00:33<00:09,  5.07it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le jus extrait par le broyage de la canne à sucre est appelé:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le pacte républicain des acteurs politiques congolais était signé à Gaborone en présence de:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 75%|██████████████████████████████▌          | 129/173 [00:34<00:07,  6.07it/s][A
 75%|██████████████████████████████▊          | 130/173 [00:34<00:07,  6.00it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Déterminez l'organisme spécialisé de l'ONU qui s'occupe de l'éducation, la science et la culture.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition où les partis politiques (I) et leurs présidents (II) sont correctement associés (III).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 76%|███████████████████████████████          | 131/173 [00:34<00:09,  4.38it/s][A
 77%|███████████████████████████████▌         | 133/173 [00:34<00:06,  5.89it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "En Afrique, l'esclavage est une des causes des migrations résultant de facteurs:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le thé est un(e):', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'une des propositions ci-dessous cadre mieux avec les réussites de l'Union Européenne. Il s'agit de (le, la, l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 77%|███████████████████████████████▊         | 134/173 [00:35<00:08,  4.79it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Les frontières de la République Démocratique du Congo sont très longues (9.045km). Pour rentabiliser la fiscalité, le Gouvernement doit au préalable:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 78%|███████████████████████████████▉         | 135/173 [00:35<00:08,  4.71it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les restes de Zinjanthrope découverts par Lewis Leakey dans la gorge d'Oldoway se classent dans la catégorie des sources dites:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 79%|████████████████████████████████▏        | 136/173 [00:35<00:08,  4.48it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La conférence de Yalta (1955) a vu la participation de l'un des pays ci-dessous. Il s'agit de, (du, des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 79%|████████████████████████████████▍        | 137/173 [00:35<00:09,  3.76it/s][A
 80%|████████████████████████████████▋        | 138/173 [00:36<00:08,  4.08it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La politique coloniale française fut caractérisée en Afrique par le système:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Le gas-oil est utilisé comme (pour)', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 80%|████████████████████████████████▉        | 139/173 [00:36<00:06,  4.90it/s][A
 81%|█████████████████████████████████▏       | 140/173 [00:36<00:06,  5.46it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le nom scientifique du palmier à huile.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le métal le plus ductible de tous les métaux s'appelle le(l').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 82%|█████████████████████████████████▍       | 141/173 [00:36<00:05,  6.17it/s][A
 82%|█████████████████████████████████▋       | 142/173 [00:36<00:04,  6.47it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La peinture est fabriqué à base d'huile de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Les aciers au chrome sont utilisés pour la fabrication de (d', des):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 83%|█████████████████████████████████▉       | 143/173 [00:36<00:05,  5.27it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition où les partis politiques (I) et leurs présidents (II) sont correctement associés (III).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 83%|██████████████████████████████████▏      | 144/173 [00:37<00:06,  4.57it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le système politique qui concentre le pouvoir entre les mains d'une seule personne est appelé:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 84%|██████████████████████████████████▎      | 145/173 [00:37<00:07,  3.60it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'En Afrique, le Nyiragongo est:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les moyens de télécommunication cités ci-dessous, un seul assure le plus d'informations à plus large spectre en République Démocratique du Congo. Il s'agit de (du):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 85%|██████████████████████████████████▊      | 147/173 [00:38<00:07,  3.30it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de l'Afrique ci-contre, les chiffres 2 et 4 représentent respectivement les climats:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 86%|███████████████████████████████████      | 148/173 [00:38<00:07,  3.30it/s][A
 86%|███████████████████████████████████▎     | 149/173 [00:38<00:06,  3.80it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La pyramide des âges en République Démocratique du Congo a un sommet rétréci. Cela veut dire que', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le territoire de Yakoma situé dans la Province de l'Equateur est traversé par la rivière", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 87%|███████████████████████████████████▌     | 150/173 [00:38<00:05,  4.35it/s][A
 87%|███████████████████████████████████▊     | 151/173 [00:39<00:04,  4.79it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'La politique coloniale anglaise fut caractérisée en Afrique par le système:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez ci-dessous le nom du souverain du royaume Kongo:', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 88%|████████████████████████████████████     | 152/173 [00:39<00:04,  5.02it/s][A
 88%|████████████████████████████████████▎    | 153/173 [00:39<00:03,  5.40it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'étude des sceaux apposés sur les documents est la (l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition exacte (III) où les pays ci-après (I) sont associés correctement aux leaders de leur décolonisation respective (II).', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 89%|████████████████████████████████████▍    | 154/173 [00:39<00:03,  4.96it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le type d'hommes ci-après constitue le deuxième chaînon de l'hominisation. Il s'agit de (l'):", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 90%|████████████████████████████████████▋    | 155/173 [00:39<00:03,  4.84it/s][A
 90%|████████████████████████████████████▉    | 156/173 [00:39<00:03,  5.14it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'expression verbale d'un jugement s'appelle:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Né en 470 avant notre ère, maître de PLATON, donnait cours sans salaire, fondateur de la ''Maieutique'', mort en 399 sans laisser d'écrits. Cette courte biographie est de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 91%|█████████████████████████████████████▏   | 157/173 [00:40<00:04,  3.76it/s][A
 91%|█████████████████████████████████████▍   | 158/173 [00:40<00:03,  4.05it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "La branche de la philosophie qui essaie d'ériger une échelle de valeurs qui permet de distinguer le bien du mal est:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "L'état de mentalité du Congolais responsable dans le service de recettes qui est tenté surtout de voler l'argent du trésor public a pour origine:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 92%|█████████████████████████████████████▋   | 159/173 [00:40<00:03,  3.59it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le sens du Cinquantenaire de la République Démocratique du Congo quant à la détermination, au verdict et aux aactions à entreprendre prochainement.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 92%|█████████████████████████████████████▉   | 160/173 [00:41<00:03,  3.72it/s][A


Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez celui qui en République Démocratique du Congo, est à la tête de l'organe de gestion du pouvoir judiciaire", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


 93%|██████████████████████████████████████▏  | 161/173 [00:41<00:02,  4.01it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez le nom du détroit européen qui est le passage de la Manche à la Mer du Nord.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 94%|██████████████████████████████████████▍  | 162/173 [00:41<00:02,  4.13it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Parmi les handicaps naturels cités ci-dessous et parsemés le long du fleuve Congo, indiquez celui entre Ubundu et Kisangani.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 94%|██████████████████████████████████████▋  | 163/173 [00:41<00:02,  3.74it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': 'Indiquez la proposition (III) où la province congolaise (I) et son site touristique (II) sont correctement associés.', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 95%|██████████████████████████████████████▊  | 164/173 [00:42<00:02,  3.88it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Sur la carte muette de l'Afrique ci-contre, le dessinateur a indiqué par le chiffre 3 le mont ou massif (de,d').", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 95%|███████████████████████████████████████  | 165/173 [00:42<00:02,  3.32it/s][A
 97%|███████████████████████████████████████▌ | 167/173 [00:42<00:01,  4.87it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez la portée exacte de la procédure d' ''émendatio''.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez ce qui caractérise le ''Paléolithique moyen''.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "A partir de 1880, les européens vont s'intéresser à l'Afrique pour ''raison scientifique'', c'est-à-dire:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 97%|███████████████████████████████████████▊ | 168/173 [00:43<00:01,  4.54it/s][A
 98%|████████████████████████████████████████ | 169/173 [00:43<00:00,  4.68it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez la province de la République Démocratique du Congo qui contient à la fois les gisements diamantifères et l'or.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez l'alliage de l'étain utilisé dans les conserveries.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 98%|████████████████████████████████████████▎| 170/173 [00:43<00:00,  5.13it/s][A
 99%|████████████████████████████████████████▌| 171/173 [00:43<00:00,  5.49it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Le rendement en métal pur d'uranium naturel est de:", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}
Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Parmi les pays de culture vivrière et industrielle, indiquez celui qui est originaire de l'Afrique.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}



 99%|████████████████████████████████████████▊| 172/173 [00:43<00:00,  5.18it/s][A
100%|█████████████████████████████████████████| 173/173 [00:43<00:00,  3.94it/s][A

Retriever query: {'size': '300', 'query': {'bool': {'must': [{'multi_match': {'query': "Indiquez le premier pays producteur mondial d'uranium.", 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}





In [276]:
exetat_with_answers = list(filter(lambda x: len(x["contexts"]) >0, exetat_questions_processed))
exetat_without_anwers = list(filter(lambda x: len(x["contexts"]) == 0, exetat_questions_processed))

In [277]:
for element in piaf_with_context.iterrows():
    key = element[0]
    with open(piaf_with_context_from_wikipedia.joinpath(f"{key}.json"), "w") as f:
        element[1].to_json(path_or_buf=f, orient="index", force_ascii=False, indent=4)

35

Only 27 question from our extetat dataset have an answer after querying using bm25

In [404]:
exetat_with_context_bm25 = exetat_questions_path.parent.parent.joinpath("exetat-with-context-from-wikipedia-bm25")

In [408]:
exetat_with_context_bm25.mkdir(exist_ok=True)

In [413]:
for key, element in enumerate(exetat_with_answers):
    with open(exetat_with_context_bm25.joinpath(f"{key}.json"), "w") as f:
        json.dump(element, f, ensure_ascii=False, indent=4)

In [286]:
with open(exetat_questions_path.parent.joinpath("exetat-with-answers.json"), "w+") as buffer:
    json.dump(exetat_with_answers, buffer)

In [287]:
with open(exetat_questions_path.parent.joinpath("exetat-without-answers.json"), "w+") as buffer:
    json.dump(exetat_without_anwers, buffer)