### Dense Passage Retriever

In the following notebook , we will train a dense passage retriever model. We will use it to retrieve document on 3 datasets:

- The wikipedia dataset
- The the congo news dataset
- history book dataset

In [2]:
import numpy as np
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers

In [3]:
from datasets import load_dataset

In [3]:
wiki_corpus = load_dataset("wikipedia", "20220301.fr", split="train")



In [4]:
wiki_corpus = wiki_corpus.shuffle(seed=42)



In [5]:
sample_wiki = wiki_corpus.shard(100, index=0)

In [6]:
sample_doc_dict = sample_wiki[1]

In [7]:
from haystack.schema import Document

#### Building the Elastic Search Index

In [8]:
from haystack.utils import clean_wiki_text, convert_files_to_docs

In [9]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union, Callable, Dict
from copy import deepcopy
from haystack.nodes import PreProcessor
import re

In [10]:
from gensim.utils import deaccent

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

In [11]:
async def convert_wiki_article_to_docs(
    item: dict,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
) -> List[Document]:
    """
    item

    :param items: dict of items
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.
    """
    documents = []
    processed = 0
    text = item.get("text")
    text = remove_accents(text)
    if clean_func:
        text = clean_func(text)
    if split_paragraphs:
        for para in text.split("\n"):
            if 200 <= len(para.strip()) <= 2000:
                # just pick the paragraph with length between 50 and 1500
                processed += 1
                doc = Document(content=para, meta={"title": item.get("title")}, id=item.get("id"))
                documents.append(doc)
            else:
                continue
    else:
        processed += 1
        doc = Document(content=text, meta={"title": item.get("title")}, id=item.get("id"))
        documents.append(doc)
    
    return documents

In [4]:
import asyncio

#### Saving the Document in the retriever

In [7]:
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
from haystack.utils import launch_milvus
from haystack.document_stores import MilvusDocumentStore
from haystack.document_stores import FAISSDocumentStore

In [5]:
INDEX_NAME = 'fr-wikipedia'

In [8]:
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

In [16]:
all_docs = []
for i in range(0, 100):
    shard = wiki_corpus.shard(100, index=i)
    with tqdm(total=shard.shape[0]) as pbar:
        docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
        all_docs.append(docs_in_shard)
    print("done with shard ", i)

  0%|          | 0/24021 [00:20<?, ?it/s]


done with shard  0


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  1


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  2


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  3


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  4


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  5


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  6


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  7


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  8


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  9


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  10


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  11


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  12


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  13


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  14


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  15


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  16


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  17


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  18


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  19


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  20


  0%|          | 0/24021 [00:10<?, ?it/s]


done with shard  21


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  22


  0%|          | 0/24021 [00:09<?, ?it/s]


done with shard  23


  0%|          | 0/24021 [00:09<?, ?it/s]


done with shard  24


  0%|          | 0/24021 [00:09<?, ?it/s]


done with shard  25


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  26


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  27


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  28


  0%|          | 0/24021 [00:17<?, ?it/s]


done with shard  29


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  30


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  31


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  32


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  33


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  34


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  35


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  36


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  37


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  38


  0%|          | 0/24021 [00:17<?, ?it/s]


done with shard  39


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  40


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  41


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  42


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  43


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  44


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  45


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  46


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  47


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  48


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  49


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  50


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  51


  0%|          | 0/24021 [00:19<?, ?it/s]


done with shard  52


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  53


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  54


  0%|          | 0/24021 [00:18<?, ?it/s]


done with shard  55


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  56


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  57


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  58


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  59


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  60


  0%|          | 0/24021 [00:11<?, ?it/s]


done with shard  61


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  62


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  63


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  64


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  65


  0%|          | 0/24021 [00:19<?, ?it/s]


done with shard  66


  0%|          | 0/24021 [00:22<?, ?it/s]


done with shard  67


  0%|          | 0/24021 [00:27<?, ?it/s]


done with shard  68


  0%|          | 0/24021 [00:24<?, ?it/s]


done with shard  69


  0%|          | 0/24021 [00:13<?, ?it/s]


done with shard  70


  0%|          | 0/24021 [00:17<?, ?it/s]


done with shard  71


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  72


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  73


  0%|          | 0/24021 [00:17<?, ?it/s]


done with shard  74


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  75


  0%|          | 0/24021 [00:17<?, ?it/s]


done with shard  76


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  77


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  78


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  79


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  80


  0%|          | 0/24021 [00:21<?, ?it/s]


done with shard  81


  0%|          | 0/24021 [00:18<?, ?it/s]


done with shard  82


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  83


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  84


  0%|          | 0/24021 [00:14<?, ?it/s]


done with shard  85


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  86


  0%|          | 0/24021 [00:12<?, ?it/s]


done with shard  87


  0%|          | 0/24021 [00:31<?, ?it/s]


done with shard  88


  0%|          | 0/24021 [00:15<?, ?it/s]


done with shard  89


  0%|          | 0/24021 [00:18<?, ?it/s]


done with shard  90


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  91


  0%|          | 0/24021 [00:16<?, ?it/s]


done with shard  92


  0%|          | 0/24021 [00:18<?, ?it/s]


done with shard  93


  0%|          | 0/24021 [00:22<?, ?it/s]


done with shard  94


  0%|          | 0/24020 [00:17<?, ?it/s]


done with shard  95


  0%|          | 0/24020 [00:14<?, ?it/s]


done with shard  96


  0%|          | 0/24020 [00:15<?, ?it/s]


done with shard  97


  0%|          | 0/24020 [00:13<?, ?it/s]


done with shard  98


  0%|          | 0/24020 [00:16<?, ?it/s]

done with shard  99





In [30]:
with tqdm(total=len(all_docs)) as pbar:
     scan_results = await tqdm_asyncio.gather(*all_docs)

  0%|          | 0/100 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
[A
[A
[A


















100%|██████████| 24021/24021 [52:18<00:00,  7.65it/s]A[A


[A[A

[A[A

100%|██████████| 24021/24021 [52:18<00:00,  7.65it/s]



[A[A[A


[A[A[A


100%|██████████| 24021/24021 [52:11<00:00,  7.67it/s]




[A[A[A[A



[A[A[A[A




In [48]:

scan_results = reduce(iconcat, scan_results, [])
scan_results = reduce(iconcat, scan_results, [])

In [49]:
len(scan_results)

9454610

In [50]:
document_store.write_documents(scan_results)

Writing Documents:   0%|          | 0/9454610 [00:00<?, ?it/s]

#### Writing Congo News Dataset

In [10]:
DATA_PATH

PosixPath('/Users/es.py/Projects/Personal/unsupervised-open-domain-french-question-answering/notebooks/data')

In [11]:
from pathlib import Path
DATA_PATH = Path.cwd().parent.joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"
DRC_NEWS_DATA_PATH = DATA_PATH.joinpath("corpus", "drc-news-txt")

In [12]:
import pandas as pd

In [13]:
data_file_path = DATA_PATH.joinpath("corpus", "raw", 'drc-news-raws.csv')

In [14]:
cd_news_data = pd.read_csv(data_file_path, names=["content", "posted_at"])

In [15]:
cd_news_data = cd_news_data.fillna(value="")
cd_news_data.head()

Unnamed: 0,content,posted_at
0,Les membres de la Commission tarifaire viennent de proposer des mesures néce...,2022-09-05 00:00:00
1,Les membres de la Commission tarifaire sont en session extraordinaire d...,2022-04-05 00:00:00
2,"Vodacom Congo vient de signer un partenariat avec Kinshasa Digital Academy, ...",2022-04-23 00:00:00
3,"Le sélectionneur des Léopards de la RDC, Hectór Cúper est attendu à Kinshasa...",2022-03-05 00:00:00
4,Le protocole d’accord était déjà signé entre la RDC et la compagnie aérienne...,2022-11-05 00:00:00


In [16]:
from haystack.nodes import TextConverter

In [17]:
from haystack.schema import Document
from secrets import token_hex

# @Todo: this is not working now , it was supposed to save the document to dataframe
def get_document_from_text(row):
    """numpy row with the text and the date of the post

    Args:
        row (_type_): _description_

    Returns:
        _type_: _description_
    """
    text = row[0].replace(u'\xa0', u' ')
    for paragraph in text.split("   "):
        if not paragraph.strip():  # skip empty paragraphs
            continue
        return Document(content=paragraph, meta={"posted_at":row[1] if row[1] else "" })

In [18]:
all_cd_news_docs = cd_news_data.apply(get_document_from_text, axis="columns")

In [19]:
len(all_cd_news_docs)

140638

In [20]:
all_cd_news_docs = all_cd_news_docs.dropna().to_list()

In [21]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union
from copy import deepcopy
from haystack.nodes import PreProcessor

class CustomPreProcessor(PreProcessor):
    def __init__(self, custom_preprocessor=None, **kwargs):
        super().__init__(**kwargs)
        self.custom_preprocessor = custom_preprocessor
    def clean(
        self,
        document: Union[dict, Document],
        clean_whitespace: bool,
        clean_header_footer: bool,
        clean_empty_lines: bool,
        remove_substrings: List[str],
        id_hash_keys: Optional[List[str]] = None,
    ) -> Document:
        """
        
        Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
        and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
        """
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if isinstance(document, dict):
            document = Document.from_dict(document, id_hash_keys=id_hash_keys)

        # Mainly needed for type checking
        if not isinstance(document, Document):
            raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
        text = document.content
        text = self.custom_preprocessor(text)
        if clean_header_footer:
            text = self._find_and_remove_header_footer(
                text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )

        if clean_whitespace:
            lines = text.splitlines()

            cleaned_lines = []
            for line in lines:
                line = line.strip()
                cleaned_lines.append(line)
            text = "\n".join(cleaned_lines)

        if clean_empty_lines:
            text = re.sub(r"\n\n+", "\n\n", text)

        for substring in remove_substrings:
            text = text.replace(substring, "")

        if text != document.content:
            document = deepcopy(document)
            document.content = text

        return document
    
    

In [22]:
from  functools import reduce
from operator import iconcat

In [23]:
all_cd_news_docs[0]

<Document: {'content': 'Les membres de la Commission tarifaire viennent de proposer des mesures nécessaires visant à corriger les erreurs matérielles pour faciliter la mise en œuvre de la taxe à valeur ajoutée sociale (TVA) et à l’extension du bénéfice du taux réduit à certaines positions tarifaires se rapportant aux produits à vocation sociale et de grande consommation ciblés notamment les produits laitiers pour nourrissons. C’est le résultat des travaux de 4 jours (du 03 au 06 mai dernier) des membres de la Commission tarifaire clôturés le vendredi 06 mai au Romeo Golf de la Gombe par Liévin Chiribagula, conseiller fiscal et représentant du ministre des Finances.', 'content_type': 'text', 'score': None, 'meta': {'posted_at': '2022-09-05 00:00:00'}, 'embedding': None, 'id': '5150ad41ae7d0d36c7fa0b291dbe704b'}>

With our document indexted int the elastic searh we can search , use the piaf dataset which have question with answers without paragraph and leverage them.

In [24]:
from collections import deque

In [25]:
import re
from gensim.utils import deaccent
from unicodedata import normalize as unicode_normalize

In [26]:
def replace_point(document):
    """replace the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    Args:
        document (_type_): _description_
    """
    result = re.sub(r"(\S)\.(\S)", r"\1 . \2", document)
    return result

def replace_website_name(document):
    """sometimes the doucment has the name politico.cd or 7sur7.cd or actualite.cd, we would like to replace them by the 
    actual name of the website. before proper cleaning

    Args:
        document (_type_): _description_
    """
    # @TODO : not sure if this will work but , way better replace by the first line of match.
    
    result = re.sub(r"7SUR7.CD|politico.cd|actualite.cd|mediacongo.net", r"SITE_WEB", document, flags=re.IGNORECASE)
    return result

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

def pre_clean_document(document):
    """pre clean the document by removing the accents and replacing the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    and any other side of cleaning that we want to do .
    Args:
        document (_type_): _description_
    """
    result = remove_accents(document)
    result =  replace_website_name(result)
    result = replace_point(result)
    result = re.sub(r"This post has already been read \d+ times!", "", result) # remove unwanted text
    result = unicode_normalize("NFKD", result)
    return result

In [27]:
preprocessor = CustomPreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    language="fr",
    custom_preprocessor=pre_clean_document,
)


cd_news_docs = preprocessor.process(all_cd_news_docs)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 140235/140235 [02:56<00:00, 794.07docs/s]


NameError: name 'all_docs' is not defined

In [28]:

print(f"\nn_docs_output: {len(cd_news_docs)}")


n_docs_output: 299449


In [29]:
document_store.write_documents(cd_news_docs)


Writing Documents:   0%|          | 0/299449 [00:00<?, ?it/s]

After saving both the wikipedia articles and the congo news websites articles to the document store , we need to initialize the retriever and use the dense passage retrieval retriever to get the documents.


In [31]:
from haystack.nodes import DensePassageRetriever

In [32]:
dense_passage_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="etalab-ia/dpr-question_encoder-fr_qa-camembert",
    passage_embedding_model="etalab-ia/dpr-ctx_encoder-fr_qa-camembert",
    infer_tokenizer_classes=True,
)


INFO - haystack.modeling.utils -  Using devices: CPU
INFO - haystack.modeling.utils -  Number of GPUs: 0


Downloading:   0%|          | 0.00/518 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find etalab-ia/dpr-question_encoder-fr_qa-camembert locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Automatically detected language from language model name: french
INFO - haystack.modeling.model.language_model -  Loaded etalab-ia/dpr-question_encoder-fr_qa-camembert


Downloading:   0%|          | 0.00/517 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find etalab-ia/dpr-ctx_encoder-fr_qa-camembert locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Automatically detected language from language model name: french
INFO - haystack.modeling.model.language_model -  Loaded etalab-ia/dpr-ctx_encoder-fr_qa-camembert


In [33]:
document_store.update_embeddings(dense_passage_retriever)


INFO - haystack.document_stores.faiss -  Updating embeddings for 297840 docs...


Updating Embedding:   0%|          | 0/297840 [00:00<?, ? docs/s]

KeyboardInterrupt: 

#### Reading the Question Answering Datasets.

In [27]:
import pandas as pd

In [28]:
from pathlib import Path
DATA_PATH = Path.cwd().joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"

In [85]:
piaf_file = DATA_PATH.joinpath("corpus", "raw", "piaf", "questoin-reponse.csv")

In [86]:
assert piaf_file.exists(), "the piaf dataset does not exist"

piaf_question = data

In [87]:
piaf_df_without_context = pd.read_csv(piaf_file)

In [88]:
sample_question_response = piaf_df_without_context.sample(1)
question = deaccent(sample_question_response.question.values[0])
response = deaccent(sample_question_response.reponse.values[0])


In [89]:
question

'Quels plats de viandes epicees sont prepares a partir de ces ingredients ?'

In [29]:
def get_positive_context(retriever: BM25Retriever, search_query:str, answer:str, positive_documents: int = 100) -> List[Document]:    
        """given entitity retrieve the positive context
        we will first retrieve the top  100 documents , 
        - if the answer is in the top 40 document the input of the reader is the top 40 documents
        if the top 40 documents does not contain the answer we check whithin the top 41 to 100 document if the anwer is ther and we put it ther.
        other wise we discard the sentence

        Args:
            retriever (BM25Retriever): _description_
            n_ctxs (int, optional): _description_. Defaults to 15.
            entity (Entity, optional): _description_. Defaults to None.
        """
        list_pos_ctxs = []
        retrieved_docs = retriever.retrieve(query=search_query, top_k=positive_documents)
        for index, retrieve_doc in enumerate(retrieved_docs[0:40]):
            if answer.lower() in retrieve_doc.content.lower():
                list_pos_ctxs.append(
                    {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                )
        if len(list_pos_ctxs) == 0:
            for index, retrieve_doc in enumerate(retrieved_docs[40:100]):
                if answer.lower() in retrieve_doc.content.lower():
                    list_pos_ctxs.append(
                        {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                    )
        else:
            pass
        if len(list_pos_ctxs) == 0:
            return []
        return list_pos_ctxs

#### Use Piaf Dataset to query context

In [30]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [186]:
piaf_dataset = piaf_dataset["train"]

In [92]:
for index in tqdm(piaf_df_without_context.index):
    question = piaf_df_without_context.loc[index].question
    response = piaf_df_without_context.loc[index].reponse
    question = deaccent(question)
    response = deaccent(response)
    retrieved_docs = get_positive_context(retriever=bm25_retriever, search_query=question, answer=response, positive_documents=100)
    piaf_df_without_context.loc[index, "positive_context"] = retrieved_docs



By using our wiki corpus we are able to find some question with positive context, we will leverage them to build our qa system

In [93]:
piaf_with_context = piaf_df_without_context.loc[piaf_df_without_context.positive_context.apply(lambda x: len(x)) > 0]

In [94]:
piaf_with_context.shape

(2459, 3)

with our dataset , we can find that {{piaf_with_context.shape[0]}} have positive context and that will be usefull to fine tune our model.

In [95]:
piaf_with_context = piaf_with_context.assign(length_positive_context =piaf_with_context["positive_context"].apply(lambda x: len(x)))

In [117]:
piaf_with_context.loc[piaf_with_context.length_positive_context > 1].sort_values(by="length_positive_context", ascending=False)

Unnamed: 0,question,reponse,positive_context,length_positive_context
6756,quel mot ne figure pas dans le webster?,A,"[{'title': 'They singulier', 'content': 'En , le pronom au singulier fait s...",40
2516,Un dirigeant a-t-il séparé sa région de l'empire ?,Le,"[{'title': 'Conquêtes mongoles', 'content': 'Durant ces annees, l'empire se ...",39
6762,quel est le nom court pour République italienne?,Italie,"[{'title': 'Constitution de l'Italie', 'content': 'La constitution italienne...",38
5539,Comment accordait-on le mridang?,on,"[{'title': 'Parchemin', 'content': ' tambours : djembe (chevre, antilope, ze...",38
11413,qui a parachuté des provisions et mitraillé les positions chinoises ?,la,"[{'title': 'Pierre Claude', 'content': 'Le , lors d'un combat aerien contre ...",37
...,...,...,...,...
5891,Dans quelle matières sont façonnés les bijoux algériens ?,argent,"[{'title': 'Bouton de manchette', 'content': 'Il est souvent considere comme...",2
5918,Combien d'écoles anglaises appartiennent aux 100 meilleures universités en 2...,12,"[{'title': 'Flávio Augusto da Silva', 'content': 'Ne et eleve dans la banlie...",2
5919,Combien d'écoles anglaises appartiennent aux 100 meilleures universités en 2...,12,"[{'title': 'Flávio Augusto da Silva', 'content': 'Ne et eleve dans la banlie...",2
5920,où est classée l'université d'oxford en 2018 ?,première,"[{'title': 'Alfred Dürr', 'content': 'Durr etudia la musicologie et la philo...",2


In [33]:
retrieved_docs = bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=30)

Retriever query: {'size': '30', 'query': {'bool': {'must': [{'multi_match': {'query': 'De quelle langue est issue le mot mycelium ?', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


In [32]:
retrieved_docs

[<Document: {'content': 'Un mot savant est un neologisme ou un mot dont l’evolution etymologique a ete freinee par le contexte socio-historique et qui reste de ce fait plus proche de son etymon. Par exemple, « fragile » est un mot savant dont le correspondant populaire est « frele ». Plus generalement, la langue savante (marquee de formations savantes) est la langue scientifique issue du latin ou du grec, opposee a la langue populaire ou vulgaire.', 'content_type': 'text', 'score': 0.9178798627418258, 'meta': {'title': 'Mot savant'}, 'embedding': None, 'id': '9713327'}>,
 <Document: {'content': "La croissance du mycelium qui a lieu dans l'espace intercellulaire, est fortement correlee a la croissance de l'hote. Le mycelium colonise les nouvelles feuilles et les nouveaux talles. La transmission par la semence est tres efficace. Dans la graine le mycelium est localise essentiellement dans la couche a aleurone.", 'content_type': 'text', 'score': 0.8908335557594226, 'meta': {'title': 'Epic

In [118]:
piaf_with_context.to_csv(DATA_PATH.joinpath("corpus", "raw", "piaf", "piaf_with_context.csv"))

within our dataset , 740 row have more than one context.

With our dataset with context, let us pull more question with context for the original piaf dataset with context

In [119]:
piaf_dataset = load_dataset("piaf")



  0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
piaf_dataset = piaf_dataset["train"]

In [39]:
piaf_df = piaf_dataset.to_pandas()

In [40]:
piaf_df.head()

Unnamed: 0,id,title,context,question,answers
0,p140295443291664,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien de personnes travaillent au ministère des sports,"{'text': ['100 000'], 'answer_start': [472]}"
1,p140295443291520,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Combien d'employeurs,"{'text': ['20 000'], 'answer_start': [597]}"
2,p140295443291376,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel part du budget des ménages,"{'text': ['50'], 'answer_start': [46]}"
3,p140295443291088,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2003,"{'text': ['14,2 milliards'], 'answer_start': [68]}"
4,p140295443290872,Sport,"Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 mil...",Quel montant en 2019,"{'text': ['12 milliards'], 'answer_start': [102]}"


What are we trying to achive with this ?


We have question on the piaf dataset with context, for those question we will query elastic search to find out  the additional context. That context will will be consider as additional context to train our model with.


The algo : 

- for each question , query the elastic search and keep the top 10 documents context
- we will loop and send queries in a batch of 10 questions to elastic search
- retrieved the context and and save anything on the disk.

In the future to improve the quality of our finding we can consider only documents with named entities in the answers. We can check the paragraph , run the NER model on it and then check if the answer is in the named entities.

In [41]:
sample_questions = np.vectorize(deaccent)(piaf_df.question.loc[1:5])
retrieved_docs = bm25_retriever.retrieve_batch(queries=sample_questions.tolist(), top_k=30)



In [35]:
async def query_batch(retriever, queries, top_k=10):
    return retriever.retrieve_batch(queries=queries, top_k=top_k)

In [36]:
def decent_vectorize(queries):
    return np.vectorize(deaccent)(queries)

In [42]:
question_chunk = [decent_vectorize(piaf_df.question.loc[1:5]), decent_vectorize(piaf_df.question.loc[5:10])]

In [43]:
question_chunk

[array(["Combien d'employeurs", 'Quel part du budget des menages',
        'Quel montant en 2003', 'Quel montant en 2019',
        'En quelle annee Jakob Bohme tombe-t-il malade ?'], dtype='<U47'),
 array(['En quelle annee Jakob Bohme tombe-t-il malade ?',
        'Qui est mort en juillet ?',
        'Quel est le metier de Nicolas Thomas ?',
        'Que doit subir Jakob Bohme avant d’obtenir les derniers sacrements ?',
        'Pourquoi Bohme enfle-t-il ?',
        'Quel pays est surnomme la "perle de l\'Afrique" ?'], dtype='<U68')]

In [244]:
async def main():
    return await tqdm_asyncio.gather(*[query_batch(bm25_retriever, queries) for queries in question_chunk])

In [44]:
def write_to_json(data, path):
    with open(path, "w") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [45]:
question_chunk[0][-1]

'En quelle annee Jakob Bohme tombe-t-il malade ?'

In [47]:
answers[0][-1]

NameError: name 'answers' is not defined

In [48]:
piaf_dataset[0]

{'id': 'p140295443291664',
 'title': 'Sport',
 'context': "Les dépenses des ménages représentent plus de 50 % de ces montants (14,2 milliards d'euros en 2003 et 12 milliards d'euros en 2019), contre 7,9 milliards d'euros pour les collectivités locales, 3,2 pour l'État, et 2,2 pour les entreprises. Parmi les dépenses sportives des ménages en 2003, 3,7 milliards sont consacrés aux vêtements de sport et chaussures, 2 aux biens durables, 2,7 aux autres biens et 5,8 aux services. Le Ministère de la Jeunesse et des Sports estime à 100 000 (58 % d'hommes pour 42 % de femmes) le nombre de salariés travaillant pour le secteur sportif en France pour quelque 20 000 employeurs.",
 'question': 'Combien de personnes travaillent au ministère des sports',
 'answers': {'text': ['100 000'], 'answer_start': [472]}}

In [49]:
import json

In [50]:
async def process_queries_chunk(retriever:BM25Retriever, queries_chunk: pd.DataFrame):
    """tak a query chunk process it , query the elastic, instance and write the document to the file.

    Args:
        queries_chunk (pd.DataFrame): _description_
    """
    questions = decent_vectorize(queries_chunk.question)
    titles = queries_chunk.title
    answers = [answer.get("text")[0] for answer in queries_chunk.answers]
    contexts = queries_chunk.context
    ids = queries_chunk.id
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=10)
    process_answers(ids=ids, 
                    questions=questions,
                    titles=titles,
                    answers=answers,
                    contexts=contexts,
                    retrieved_docs=retrieved_docs)

In [51]:
def process_doc(retrieved_docs):
    """
    convert a list of retrieved document to list of array of list and content
    """
    doc_list = []
    for doc in retrieved_docs:
        doc_ = {
            "title": doc.meta.get("title"),
            "content": doc.content,
            }
        doc_list.append(doc_)
    return doc_list

In [52]:
piaf_with_multi_context_path = DATA_PATH.joinpath("corpus", "french-qa", "piaf-with-multi-context")
assert piaf_with_multi_context_path.exists()

In [101]:
def process_answers(ids, questions, titles, answers, contexts, retrieved_docs):
    """process answers and write them to the file.

    Args:
        questions (_type_): _description_
        titles (_type_): _description_
        answers (_type_): _description_
        contexts (_type_): _description_
        retrieved_docs (_type_): _description_
    """
    for id_, question, title, answer, context, retrieved_doc in zip(ids, questions, titles, answers, contexts, retrieved_docs):
        contexts = [{"title": title, "content": context}]
        additional_context = process_doc(retrieved_doc)
        contexts.extend(additional_context)
        instance_json = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "id": id_,
        }
        yield instance_json

    

In [102]:
def save_to_json(instances, path):
    "saves an iterator of multiple json files to files in the path directory"
    for instance in instances:
        with open(path.joinpath(f"{instance['id']}.json"), "w") as f:
            json.dump(instance, f, indent=4, ensure_ascii=False)

In [54]:
async def main():
    return await tqdm_asyncio.gather(*[process_queries_chunk(bm25_retriever, queries) for _, queries in piaf_df.groupby(np.arange(len(piaf_df))//5)])

In [55]:
%%script false --no-raise-error
await main()

In [58]:
bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=10)

Retriever query: {'size': '10', 'query': {'bool': {'must': [{'multi_match': {'query': 'De quelle langue est issue le mot mycelium ?', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


[<Document: {'content': 'Un mot savant est un neologisme ou un mot dont l’evolution etymologique a ete freinee par le contexte socio-historique et qui reste de ce fait plus proche de son etymon. Par exemple, « fragile » est un mot savant dont le correspondant populaire est « frele ». Plus generalement, la langue savante (marquee de formations savantes) est la langue scientifique issue du latin ou du grec, opposee a la langue populaire ou vulgaire.', 'content_type': 'text', 'score': 0.9178798627418258, 'meta': {'title': 'Mot savant'}, 'embedding': None, 'id': '9713327'}>,
 <Document: {'content': "La croissance du mycelium qui a lieu dans l'espace intercellulaire, est fortement correlee a la croissance de l'hote. Le mycelium colonise les nouvelles feuilles et les nouveaux talles. La transmission par la semence est tres efficace. Dans la graine le mycelium est localise essentiellement dans la couche a aleurone.", 'content_type': 'text', 'score': 0.8908335557594226, 'meta': {'title': 'Epic

##### Processing Fquad

In [59]:
frquad_path_train = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "train.json")
frquad_path_valid = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "valid.json")

In [60]:
assert frquad_path_train.exists()
assert frquad_path_valid.exists()

In [61]:
with open(frquad_path_train, "r") as f:
    frquad_train = json.load(f)

In [63]:
frquad_train = frquad_train.get("data")

In [147]:
with open(frquad_path_valid, "r") as f:
    frquad_valid = json.load(f)

In [149]:
frquad_valid = frquad_valid.get("data")

In [151]:
frquad_valid[0]

{'title': 'anges-musiciens-(national-gallery)',
 'paragraphs': [{'qas': [{'answers': [{'answer_start': 161,
       'text': 'La Vierge aux rochers'}],
     'id': 'a524504f-0816-4f58-9f2d-27f82a85c73d',
     'question': 'Que concerne principalement les documents ?'},
    {'answers': [{'answer_start': 46, 'text': 'documents contemporains'}],
     'id': '8a72ad1c-b2fe-4fe6-9f87-35fcb713cf38',
     'question': 'Par quoi sont décrit les deux tableaux ?'},
    {'answers': [{'answer_start': 204, 'text': 'objets de spéculations'}],
     'id': 'b2db7f77-f6a7-4c3d-9274-9807f9764e97',
     'question': "Quels types d'objets sont les deux tableaux aux yeux des chercheurs ?"}],
   'context': "Les deux tableaux sont certes décrits par des documents contemporains à leur création mais ceux-ci ne le font qu'indirectement car ils concernent principalement La Vierge aux rochers. Aussi demeurent-ils objets de spéculations pour les chercheurs quant à leur statut de première ou seconde version de l'œuvre, leu

In [98]:
from itertools import repeat

In [107]:
fquad_path = DATA_PATH.joinpath("corpus", "french-qa", "fquad-with-multi-context")
fquad_path_output_train = fquad_path.joinpath("train")
fquad_path_output_valid = fquad_path.joinpath("valid")

In [136]:
for doc in frquad_train[0:2]:
    print(doc.get("title"))

(1)-cérès
american-idiot


In [115]:
async def process_fquad_paragraph(retriever, paragraph, title, path):
    """
    this will loop over the  a paragraph in frquad dataset.
    since each paragraph have a list of question ,
     it will return query the elasticsearch for each batch of question 
    and return the additional related_context

    Args:
        paragraph (_type_): _description_
    """
    context  = paragraph.get("context")
    question_answers = paragraph.get("qas")
    ids = [qa.get("id") for qa in question_answers]
    questions = [qa.get("question") for qa in question_answers]
    answers = [qa.get("answers")[0].get("text") for qa in question_answers]
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=10)
    instances = process_answers(ids=ids, 
                                questions=questions,
                                titles=repeat(title, len(questions)),
                                answers=answers,
                                contexts=repeat(context, len(questions)),
                                retrieved_docs=retrieved_docs)
    save_to_json(instances, path)

In [106]:
assert fquad_path.exists()

In [79]:
def check_answer_in_retrieved_docs(answer, retrieved_docs):
    for doc in retrieved_docs:
        if answer in doc.content:
            print(doc)
            return True
    return False

In [86]:
retrieved_docs = bm25_retriever.retrieve(query=deaccent("Combien de fois Piazzi est-il parvenu à observer Cérès?"), top_k=30)


Retriever query: {'size': '30', 'query': {'bool': {'must': [{'multi_match': {'query': 'Combien de fois Piazzi est-il parvenu a observer Ceres?', 'type': 'most_fields', 'fields': ['content'], 'operator': 'OR'}}]}}}


In [119]:
await process_fquad_paragraph(bm25_retriever, frquad_train[0]["paragraphs"][0], "Cérès", fquad_path_output_train)

In [139]:
async def process_fquad(fquad, path, retriever):
    all_article_processor = list()
    for document in fquad:
        title = document.get("title")
        paragraphs = document.get("paragraphs")
        doc_coroutine = tqdm_asyncio.gather(*[process_fquad_paragraph(retriever=bm25_retriever, paragraph=paragraph, title=title, path=path) for paragraph in paragraphs])
        all_article_processor.append(doc_coroutine)
    return await tqdm_asyncio.gather(*all_article_processor)

In [141]:
await process_fquad(frquad_train, fquad_path_output_train, bm25_retriever)

  0%|          | 0/117 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


















100%|██████████| 37/37 [02:02<00:00,  3.32s/it]  [A[A[A

100%|██████████| 70/70 [02:02<00:00,  1.75s/it]  


100%|██████████| 1/1 [02:02<00:00, 122.82s/it]



100%|██████████| 33/33 [02:02<00:00,  3.72s/it]  




100%|██████████| 39/39 [02:02<00:00,  3.15s

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  Non

In [153]:
await process_fquad(frquad_valid, fquad_path_output_valid, bm25_retriever)



















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


















100%|██████████| 35/35 [00:23<00:00,  1.47it/s]A[A[A[A
100%|██████████| 2/2 [00:23<00:00, 11.91s/it]
100%|██████████| 19/19 [00:23<00:00,  1.25s/it]
100%|██████████| 83/83 [00:23<00:00,  3.48it/s]
100%|██████████| 16/16 [00:23<00:00,  1.49s/it]
100%|██████████| 19/19 [00:23<00:00,  1.25s/it]
100%|██████████| 42/42 [00:23<00:00,  1.76it/s]
100%|██████████| 113/113 [00:23<00:00,  4.74it/s]
100%|██████████| 39/39 [00:23<00:00,  1.64it/s]
100%|██████████| 38/38 [00:23<00:00,  1.60it/s]
100%|██████████| 34/34 [00:23<00:00,  1.43it/s]
100%|██████████| 31/31 [00:23<00:00,  1.30it/s]
100%|██████████| 84/84 [00:23<00:00,  3.53it/s]
100%|██████████| 35/35 [00:23<00:00,  1.47it/s]
100%|██████████| 72/72 [00:23<00:00,  3.02it/s]
100%|██████████| 35/35 [00:23<00:00,  1.47it/s]
100%|██████████| 12/12 [00:23<00:00,  1.98s/it]
100%|██████████| 59/59 [00:23<00:00,  2.48it/s]
100%|██████████| 18/18 [00:23<00:00

[[None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None

At this point we have the piaf dataset with context, we have the frquad with context, let us look now the exetat questions with context.

In [173]:
exetat_questions_path = DATA_PATH.joinpath("corpus", "french-qa", "exetat-questions", "questions.json")

In [174]:
assert exetat_questions_path.exists()

In [192]:
with open(exetat_questions_path, "r") as f:
    exetat_questions = json.load(f)

In [195]:
len(exetat_questions)

173

for now we can say that we have 173 exams questions, we need more and get more context to train.frquad_train
But for now we are going to train our model on the conbinaision of piaf and frenchquad dataset.