In this notebook we will index the french dump version of the wikipedia dataset, and use with the retrieval for our qa/system

In [40]:
import numpy as np
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers

In [2]:
from datasets import load_dataset

In [3]:
wiki_corpus = load_dataset("wikipedia", "20220301.fr", split="train")



In [4]:
wiki_corpus = wiki_corpus.shuffle(seed=42)



In [5]:
sample_wiki = wiki_corpus.shard(100, index=0)

In [6]:
sample_doc_dict = sample_wiki[1]

In [7]:
sample_doc_dict

{'id': '12576878',
 'url': 'https://fr.wikipedia.org/wiki/Teinture%20naturelle',
 'title': 'Teinture naturelle',
 'text': "Une teinture naturelle est une matière colorante dérivée de plantes ou d'invertébrés. On parle de teinture quand il s'agit de colorer des textiles au moyen de produits solubles dans l'eau ; les minéraux donnent des colorants insolubles appelés pigments qui s'appliquent mal à cet usage.\n\nLa plupart des teintures naturelles proviennent de racines, de baies, d'écorce, de feuilles ou de bois de plantes tinctoriales ou bien de champignons ou de lichens.\n\nHistoire \n\nLe concept de teinture naturelle se développe par opposition aux teintures issues de la chimie organique dans la deuxième moitié du . Il oppose aux produits de l'industrie chimique, les teintures que des artisans peuvent extraire de plantes, de lichens, d'insectes. Auparavant, on ne considérait comme naturelles que les quelques substances utilisables sans transformation, tandis que la plupart des teintu

In [10]:
from haystack.schema import Document

In [11]:
docs = Document.from_dict(sample_doc_dict, field_map={"text": "content"})

#### Building the Elastic Search Index

In [12]:
from haystack.utils import clean_wiki_text, convert_files_to_docs

In [39]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union, Callable, Dict
from copy import deepcopy
from haystack.nodes import PreProcessor
import re

In [14]:
from gensim.utils import deaccent

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

In [15]:
async def convert_wiki_article_to_docs(
    item: dict,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
) -> List[Document]:
    """
    item

    :param items: dict of items
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.
    """
    documents = []
    processed = 0
    text = item.get("text")
    text = remove_accents(text)
    if clean_func:
        text = clean_func(text)
    if split_paragraphs:
        for para in text.split("\n"):
            if 200 <= len(para.strip()) <= 2000:
                # just pick the paragraph with length between 50 and 1500
                processed += 1
                doc = Document(content=para, meta={"title": item.get("title")}, id=item.get("id"))
                documents.append(doc)
            else:
                continue
    else:
        processed += 1
        doc = Document(content=text, meta={"title": item.get("title")}, id=item.get("id"))
        documents.append(doc)
    
    return documents

In [16]:
import asyncio

#### Saving the Document in the retriever

In [17]:
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from haystack.document_stores import ElasticsearchDocumentStore


In [137]:
INDEX_NAME = 'fr-wikipedia'

In [138]:
%%script false --no-raise-error
document_store = ElasticsearchDocumentStore(index=INDEX_NAME, recreate_index=True, analyzer="french")

In [76]:
%%script false --no-raise-error
all_docs = []
for i in range(0, 100):
    shard = wiki_corpus.shard(100, index=i)
    with tqdm(total=shard.shape[0]) as pbar:
        docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
        all_docs.append(docs_in_shard)
    print("done with shard ", i)

In [None]:
from  functools import reduce
from operator import iconcat

In [77]:
%%script false --no-raise-error
with tqdm(total=len(all_docs)) as pbar:
     scan_results = await tqdm_asyncio.gather(*all_docs[90:100])

In [78]:
%%script false --no-raise-error
scan_results = reduce(iconcat, scan_results, [])
scan_results = reduce(iconcat, scan_results, [])

With our document indexted int the elastic searh we can search , use the piaf dataset which have question with answers without paragraph and leverage them.

In [67]:
from collections import deque

In [68]:
from elasticsearch.helpers import bulk, parallel_bulk, scan
def write_documents_parallel(
        self,
        documents: Union[List[dict], List[Document]],
        index: Optional[str] = None,
        batch_size: int = 10_000,
        duplicate_documents: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Indexes documents for later queries in Elasticsearch./ an update for the index that use parralell bluk

        Behaviour if a document with the same ID already exists in ElasticSearch:
        a) (Default) Throw Elastic's standard error message for duplicate IDs.
        b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
        (This is only relevant if you pass your own ID when initializing a `Document`.
        If don't set custom IDs for your Documents or just pass a list of dictionaries here,
        they will automatically get UUIDs assigned. See the `Document` class for details)

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"content": "<the-actual-text>"}.
                          Optionally: Include meta data via {"content": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
                          Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
                          should be changed to what you have set for self.content_field and self.name_field.
        :param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
        :param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
        :param duplicate_documents: Handle duplicates document based on parameter options.
                                    Parameter options : ( 'skip','overwrite','fail')
                                    skip: Ignore the duplicates documents
                                    overwrite: Update any existing documents with the same ID when adding documents.
                                    fail: an error is raised if the document ID of the document being added already
                                    exists.
        :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
        :raises DuplicateDocumentError: Exception trigger on duplicate document
        :return: None
        """

        if index and not self.client.indices.exists(index=index, headers=headers):
            self._create_document_index(index, headers=headers)

        if index is None:
            index = self.index
        duplicate_documents = duplicate_documents or self.duplicate_documents
        assert (
            duplicate_documents in self.duplicate_documents_options
        ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}"

        field_map = self._create_document_field_map()
        document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
        document_objects = self._handle_duplicate_documents(
            documents=document_objects, index=index, duplicate_documents=duplicate_documents, headers=headers
        )
        documents_to_index = []
        for doc in tqdm(document_objects):
            _doc = {
                "_op_type": "index" if duplicate_documents == "overwrite" else "create",
                "_index": index,
                **doc.to_dict(field_map=self._create_document_field_map()),
            }  # type: Dict[str, Any]

            # cast embedding type as ES cannot deal with np.array
            if _doc[self.embedding_field] is not None:
                if type(_doc[self.embedding_field]) == np.ndarray:
                    _doc[self.embedding_field] = _doc[self.embedding_field].tolist()

            # rename id for elastic
            _doc["_id"] = str(_doc.pop("id"))

            # don't index query score and empty fields
            _ = _doc.pop("score", None)
            _doc = {k: v for k, v in _doc.items() if v is not None}

            # In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
            # we "unnest" all value within "meta"
            if "meta" in _doc.keys():
                for k, v in _doc["meta"].items():
                    _doc[k] = v
                _doc.pop("meta")
            documents_to_index.append(_doc)

            # Pass batch_size number of documents to bulk
            if len(documents_to_index) % batch_size == 0:
                pb_ = parallel_bulk(self.client, 
                              documents_to_index, 
                              chunk_size=10000, 
                              thread_count=8, 
                              queue_size=8,
                              refresh=self.refresh_type, 
                              headers=headers)
                deque(pb_, maxlen=0)
                documents_to_index = []

        if documents_to_index:
            pb_= parallel_bulk(self.client, 
                          documents_to_index, 
                          chunk_size=10000, 
                          thread_count=8, 
                          queue_size=8,
                          refresh=self.refresh_type, 
                          headers=headers)
            deque(pb_, maxlen=0)

In [43]:
document_store.write_documents_parallel = write_documents_parallel

In [79]:
%%script false --no-raise-error
write_documents_parallel(document_store, scan_results)

In [81]:
from haystack.nodes import BM25Retriever

In [82]:
bm25_retriever = BM25Retriever(document_store=document_store, all_terms_must_match=False)

In [83]:
import pandas as pd

In [84]:
from pathlib import Path
DATA_PATH = Path.cwd().joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"

In [85]:
piaf_file = DATA_PATH.joinpath("corpus", "raw", "piaf", "questoin-reponse.csv")

In [86]:
assert piaf_file.exists(), "the piaf dataset does not exist"

piaf_question = data

In [87]:
piaf_df_without_context = pd.read_csv(piaf_file)

In [88]:
sample_question_response = piaf_df_without_context.sample(1)
question = deaccent(sample_question_response.question.values[0])
response = deaccent(sample_question_response.reponse.values[0])


In [89]:
question

'Quels plats de viandes epicees sont prepares a partir de ces ingredients ?'

In [90]:
def get_positive_context(retriever: BM25Retriever, search_query:str, answer:str, positive_documents: int = 100) -> List[Document]:    
        """given entitity retrieve the positive context
        we will first retrieve the top  100 documents , 
        - if the answer is in the top 40 document the input of the reader is the top 40 documents
        if the top 40 documents does not contain the answer we check whithin the top 41 to 100 document if the anwer is ther and we put it ther.
        other wise we discard the sentence

        Args:
            retriever (BM25Retriever): _description_
            n_ctxs (int, optional): _description_. Defaults to 15.
            entity (Entity, optional): _description_. Defaults to None.
        """
        list_pos_ctxs = []
        retrieved_docs = retriever.retrieve(query=search_query, top_k=positive_documents)
        for index, retrieve_doc in enumerate(retrieved_docs[0:40]):
            if answer.lower() in retrieve_doc.content.lower():
                list_pos_ctxs.append(
                    {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                )
        if len(list_pos_ctxs) == 0:
            for index, retrieve_doc in enumerate(retrieved_docs[40:100]):
                if answer.lower() in retrieve_doc.content.lower():
                    list_pos_ctxs.append(
                        {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                    )
        else:
            pass
        if len(list_pos_ctxs) == 0:
            return []
        return list_pos_ctxs

#### Use Piaf Dataset to query context

In [91]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [186]:
piaf_dataset = piaf_dataset["train"]

In [92]:
for index in tqdm(piaf_df_without_context.index):
    question = piaf_df_without_context.loc[index].question
    response = piaf_df_without_context.loc[index].reponse
    question = deaccent(question)
    response = deaccent(response)
    retrieved_docs = get_positive_context(retriever=bm25_retriever, search_query=question, answer=response, positive_documents=100)
    piaf_df_without_context.loc[index, "positive_context"] = retrieved_docs



By using our wiki corpus we are able to find some question with positive context, we will leverage them to build our qa system

In [93]:
piaf_with_context = piaf_df_without_context.loc[piaf_df_without_context.positive_context.apply(lambda x: len(x)) > 0]

In [94]:
piaf_with_context.shape

(2459, 3)

with our dataset , we can find that {{piaf_with_context.shape[0]}} have positive context and that will be usefull to fine tune our model.

In [95]:
piaf_with_context = piaf_with_context.assign(length_positive_context =piaf_with_context["positive_context"].apply(lambda x: len(x)))

In [117]:
piaf_with_context.loc[piaf_with_context.length_positive_context > 1].sort_values(by="length_positive_context", ascending=False)

Unnamed: 0,question,reponse,positive_context,length_positive_context
6756,quel mot ne figure pas dans le webster?,A,"[{'title': 'They singulier', 'content': 'En , le pronom au singulier fait s...",40
2516,Un dirigeant a-t-il séparé sa région de l'empire ?,Le,"[{'title': 'Conquêtes mongoles', 'content': 'Durant ces annees, l'empire se ...",39
6762,quel est le nom court pour République italienne?,Italie,"[{'title': 'Constitution de l'Italie', 'content': 'La constitution italienne...",38
5539,Comment accordait-on le mridang?,on,"[{'title': 'Parchemin', 'content': ' tambours : djembe (chevre, antilope, ze...",38
11413,qui a parachuté des provisions et mitraillé les positions chinoises ?,la,"[{'title': 'Pierre Claude', 'content': 'Le , lors d'un combat aerien contre ...",37
...,...,...,...,...
5891,Dans quelle matières sont façonnés les bijoux algériens ?,argent,"[{'title': 'Bouton de manchette', 'content': 'Il est souvent considere comme...",2
5918,Combien d'écoles anglaises appartiennent aux 100 meilleures universités en 2...,12,"[{'title': 'Flávio Augusto da Silva', 'content': 'Ne et eleve dans la banlie...",2
5919,Combien d'écoles anglaises appartiennent aux 100 meilleures universités en 2...,12,"[{'title': 'Flávio Augusto da Silva', 'content': 'Ne et eleve dans la banlie...",2
5920,où est classée l'université d'oxford en 2018 ?,première,"[{'title': 'Alfred Dürr', 'content': 'Durr etudia la musicologie et la philo...",2


In [190]:
retrieved_docs = bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=30)

Retriever query: {'size': '30', 'query': {'bool': {'must': [{'multi_match': {'query': 'Combien de personnes travaillent au ministere des sports', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}




In [191]:
retrieved_docs

[<Document: {'content': " Memorandum d'accord sur la cooperation dans le domaine du sport entre le ministere de la Jeunesse et des Sports de la Republique d'Azerbaidjan et le ministere de la Culture, des Sports et du Tourisme de la Republique islamique du Pakistan (Departement du sport et du tourisme) le . ", 'content_type': 'text', 'score': 0.8916248110404073, 'meta': {'title': 'Ministère de la Jeunesse et des Sports (Azerbaïdjan)'}, 'embedding': None, 'id': '12511970'}>,
 <Document: {'content': " Un assistant parlementaire ou assistante parlementaire, utilise parfois pour designer une personne travaillant dans une institution parlementaire aupres d'un(e) elu(e). La denomination officielle est cependant collaborateur ou collaboratrice parlementaire, on parle aussi parfois d'attache parlementaire pour ceux qui travaillent dans les Ministeres.", 'content_type': 'text', 'score': 0.891140095656625, 'meta': {'title': 'Assistant'}, 'embedding': None, 'id': '2487015'}>,
 <Document: {'content

In [118]:
piaf_with_context.to_csv(DATA_PATH.joinpath("corpus", "raw", "piaf", "piaf_with_context.csv"))

within our dataset , 740 row have more than one context.

With our dataset with context, let us pull more question with context for the original piaf dataset with context

In [119]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [121]:
piaf_dataset = piaf_dataset["train"]

In [129]:
piaf_df = piaf_dataset.to_pandas()

In [156]:
piaf_df.head()

Unnamed: 0,id,title,context,question,answers
0,p140295443291664,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien de personnes travaillent au ministère des sports,"{'text': ['100 000'], 'answer_start': [472]}"
1,p140295443291520,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien d'employeurs,"{'text': ['20 000'], 'answer_start': [597]}"
2,p140295443291376,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel part du budget des ménages,"{'text': ['50'], 'answer_start': [46]}"
3,p140295443291088,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2003,"{'text': ['14,2 milliards'], 'answer_start': [68]}"
4,p140295443290872,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2019,"{'text': ['12 milliards'], 'answer_start': [102]}"


What are we trying to achive with this ?


We have question on the piaf dataset with context, for those question we will query elastic search to find out  the additional context. That context will will be consider as additional context to train our model with.


The algo : 

- for each question , query the elastic search and keep the top 10 documents context
- we will loop and send queries in a batch of 10 questions to elastic search
- retrieved the context and and save anything on the disk.

In the future to improve the quality of our finding we can consider only documents with named entities in the answers. We can check the paragraph , run the NER model on it and then check if the answer is in the named entities.

In [154]:
sample_questions = np.vectorize(deaccent)(piaf_df.question.loc[1:5])
retrieved_docs = bm25_retriever.retrieve_batch(queries=sample_questions.tolist(), top_k=30)



array(["Combien d'employeurs", 'Quel part du budget des menages',
       'Quel montant en 2003', 'Quel montant en 2019',
       'En quelle annee Jakob Bohme tombe-t-il malade ?'], dtype='<U47')

In [158]:
retrieved_docs[1]

[<Document: {'content': 'La Classification des fonctions de consommation des menages ( COICOP) est une nomenclature internationale des fonctions de consommation des menages. En France, elle est notamment utilisee pour calculer l\'indice des prix a la consommation, l\'enquete "budget de famille" ou le systeme de comptabilite nationale.', 'content_type': 'text', 'score': 0.9016003740616048, 'meta': {'title': 'Classification des fonctions de consommation des ménages'}, 'embedding': None, 'id': '7734046'}>,
 <Document: {'content': "Le budget de la communaute de communes est compose du budget principal et de huit budgets annexes : dechets menagers, spanc, maison de sante de l'Arconce, office de tourisme, port de plaisance, Barbereche, Zac des Muriers, Ligerval. Ensemble leur montant est de 36 880 930,64 € en fonctionnement 10 836 558,61 € en investissements pour l'annee 2019.", 'content_type': 'text', 'score': 0.8936375103809415, 'meta': {'title': 'Communauté de communes Le Grand Charolais'

In [241]:
async def query_batch(retriever, queries, top_k=10):
    return retriever.retrieve_batch(queries=queries, top_k=top_k)

In [242]:
def decent_vectorize(queries):
    return np.vectorize(deaccent)(queries)

In [243]:
question_chunk = [decent_vectorize(piaf_df.question.loc[1:5]), decent_vectorize(piaf_df.question.loc[5:10])]

In [244]:
async def main():
    return await tqdm_asyncio.gather(*[query_batch(bm25_retriever, queries) for queries in question_chunk])

In [269]:
def write_to_json(data, path):
    with open(path, "w") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [246]:
question_chunk[0][-1]

'En quelle annee Jakob Bohme tombe-t-il malade ?'

In [188]:
answers[0][-1]

[<Document: {'content': "Gregor Richter est le fils du dernier forgeron de l'abbaye de Marienthal. A Breslau, il frequente l'ecole secondaire a partir de 1576 et etudie la theologie a l'universite de Francfort-sur-l’Oder. En 1584, Gregor Richter devient professeur au gymnasium de Gorlitz et en 1587 il devient pasteur a Rauscha pres de Gorlitz. Trois ans plus tard, il retourne a Gorlitz, d'abord comme diacre et a partir de 1595 comme archidiacre. En 1606, il devient Pastor primarius a Gorlitz et en tant que tel est l'un des plus grands opposants theologiques de Jakob Bohme, qui est membre de sa congregation a l'eglise Saints-Pierre-et-Paul de Gorlitz. Apres le premier opuscule de Bohme, Aurora, ecrit en 1612, Richter l'accuse d'heresie. Bohme est brievement arrete par le conseil municipal et interdit d'ecrire, ce qu'il accepte pendant plusieurs annees. Lorsque Bohme meurt un quart d'annee apres Richter, les habitants agites cassent la tombe de Bohme au cimetiere Saint-Nicolas de Gorlitz

In [189]:
piaf_dataset[0]

{'id': 'p140295443291664',
 'title': 'Sport',
 'context': "Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 milliards d'euros en 2003 et 12 milliards d'euros en 2019), contre 7,9 milliards d'euros pour les collectivités locales, 3,2 pour l'État, et 2,2 pour les entreprises. Parmi les dépenses sportives des ménages en 2003, 3,7 milliards sont consacrés aux vêtements de sport et chaussures, 2 aux biens durables, 2,7 aux autres biens et 5,8 aux services. Le Ministère de la Jeunesse et des Sports estime à 100 000 (58 % d'hommes pour 42 % de femmes) le nombre de salariés travaillant pour le secteur sportif en France pour quelque 20 000 employeurs.",
 'question': 'Combien de personnes travaillent au ministère des sports',
 'answers': {'text': ['100 000'], 'answer_start': [472]}}

In [283]:
import json

In [284]:
async def process_queries_chunk(retriever:BM25Retriever, queries_chunk: pd.DataFrame):
    """tak a query chunk process it , query the elastic, instance and write the document to the file.

    Args:
        queries_chunk (pd.DataFrame): _description_
    """
    questions = decent_vectorize(queries_chunk.question)
    titles = queries_chunk.title
    answers = [answer.get("text")[0] for answer in queries_chunk.answers]
    contexts = queries_chunk.context
    ids = queries_chunk.id
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=10)
    process_answers(ids=ids, 
                    questions=questions,
                    titles=titles,
                    answers=answers,
                    contexts=contexts,
                    retrieved_docs=retrieved_docs)

In [285]:
def process_doc(retrieved_docs):
    """
    convert a list of retrieved document to list of array of list and content
    """
    doc_list = []
    for doc in retrieved_docs:
        doc_ = {
            "title": doc.meta.get("title"),
            "content": doc.content,
            }
        doc_list.append(doc_)
    return doc_list

In [286]:
piaf_with_multi_context_path = DATA_PATH.joinpath("corpus", "french-qa", "piaf-with-multi-context")
assert piaf_with_multi_context_path.exists()

In [287]:
def process_answers(ids, questions, titles, answers, contexts, retrieved_docs):
    """process answers and write them to the file.

    Args:
        questions (_type_): _description_
        titles (_type_): _description_
        answers (_type_): _description_
        contexts (_type_): _description_
        retrieved_docs (_type_): _description_
    """
    for id_, question, title, answer, context, retrieved_doc in zip(ids, questions, titles, answers, contexts, retrieved_docs):
        contexts = [{"title": title, "content": context}]
        additional_context = process_doc(retrieved_doc)
        contexts.extend(additional_context)
        instance_json = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "id": id_,
        }
        path = piaf_with_multi_context_path.joinpath(f"{id_}.json")
        write_to_json(instance_json, path)

    

In [197]:
for  in piaf_df.loc[:10].groupby(np.arange(len(piaf_df))//20):
    print(k, g)

PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/data/corpus/french_qa/piaf-with-multi-context')

In [212]:
np.arange(10)//5

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [217]:
piaf_df.loc[:9].shape

(10, 5)

In [290]:
async def main():
    return await tqdm_asyncio.gather(*[process_queries_chunk(bm25_retriever, queries) for _, queries in piaf_df.groupby(np.arange(len(piaf_df))//5)])

In [291]:
await main()

100%|██████████| 767/767 [01:27<00:00,  8.73it/s] 


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [294]:
bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=10)

Retriever query: {'size': '10', 'query': {'bool': {'must': [{'multi_match': {'query': 'De quelle langue est issue le mot mycelium ?', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


[<Document: {'content': 'Un mot savant est un neologisme ou un mot dont l’evolution etymologique a ete freinee par le contexte socio-historique et qui reste de ce fait plus proche de son etymon. Par exemple, « fragile » est un mot savant dont le correspondant populaire est « frele ». Plus generalement, la langue savante (marquee de formations savantes) est la langue scientifique issue du latin ou du grec, opposee a la langue populaire ou vulgaire.', 'content_type': 'text', 'score': 0.9178798627418258, 'meta': {'title': 'Mot savant'}, 'embedding': None, 'id': '9713327'}>,
 <Document: {'content': "La croissance du mycelium qui a lieu dans l'espace intercellulaire, est fortement correlee a la croissance de l'hote. Le mycelium colonise les nouvelles feuilles et les nouveaux talles. La transmission par la semence est tres efficace. Dans la graine le mycelium est localise essentiellement dans la couche a aleurone.", 'content_type': 'text', 'score': 0.8908335557594226, 'meta': {'title': 'Epic