### Dense Passage Retriever

In the following notebook , we will train a dense passage retriever model. We will use it to retrieve document on 3 datasets:

- The wikipedia dataset
- The the congo news dataset
- history book dataset

In [1]:
import numpy as np
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers

In [2]:
from datasets import load_dataset

In [3]:
wiki_corpus = load_dataset("wikipedia", "20220301.fr", split="train")



In [4]:
wiki_corpus = wiki_corpus.shuffle(seed=42)



In [5]:
sample_wiki = wiki_corpus.shard(100, index=0)

In [6]:
sample_doc_dict = sample_wiki[1]

In [7]:
from haystack.schema import Document

In [8]:
wiki_corpus.shape

(2402095, 4)

##### Processing Documents

In [9]:
from haystack.utils import clean_wiki_text, convert_files_to_docs

In [10]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union, Callable, Dict
from copy import deepcopy
from haystack.nodes import PreProcessor
import re

In [11]:
from gensim.utils import deaccent

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

In [12]:
async def convert_wiki_article_to_docs(
    item: dict,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
) -> List[Document]:
    """
    item

    :param items: dict of items
    :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
    :param split_paragraphs: split text in paragraphs.
    """
    documents = []
    processed = 0
    text = item.get("text")
    text = remove_accents(text)
    if clean_func:
        text = clean_func(text)
    if split_paragraphs:
        for para in text.split("\n"):
            if 200 <= len(para.strip()) <= 2000:
                # just pick the paragraph with length between 50 and 1500
                processed += 1
                doc = Document(content=para, meta={"title": item.get("title")}, id=item.get("id"))
                documents.append(doc.to_json())
            else:
                continue
    else:
        processed += 1
        doc = Document(content=text, meta={"title": item.get("title")}, id=item.get("id"))
        documents.append(doc.to_json())
    
    return documents

In [18]:
import asyncio
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

In [19]:
wiki_corpus.shape

(2402095, 4)

In [21]:
all_docs = []
for i in range(0, 100):
    shard = wiki_corpus.shard(100, index=i)
    with tqdm(total=shard.shape[0]) as pbar:
        docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
        all_docs.append(docs_in_shard)
    print("done with shard ", i)

  all_docs = []
  all_docs = []
Exception ignored in: <coroutine object convert_wiki_article_to_docs at 0x3d58317c0>
Traceback (most recent call last):
    def _warn_unawaited_coroutine(coro):
KeyboardInterrupt: 
  docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
  docs_in_shard = tqdm_asyncio.gather(*[convert_wiki_article_to_docs(item, clean_func=clean_wiki_text, split_paragraphs=True) for item in shard])
  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  0


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  1


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  2


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  3


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  4


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  5


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  6


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  7


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  8


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  9


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  10


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  11


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  12


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  13


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  14


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  15


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  16


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  17


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  18


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  19


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  20


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  21


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  22


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  23


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  24


  0%|                                                                                                                                  | 0/24021 [00:08<?, ?it/s]


done with shard  25


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  26


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  27


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  28


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  29


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  30


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  31


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  32


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  33


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  34


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  35


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  36


  0%|                                                                                                                                  | 0/24021 [00:09<?, ?it/s]


done with shard  37


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  38


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  39


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  40


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  41


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  42


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  43


  0%|                                                                                                                                  | 0/24021 [00:10<?, ?it/s]


done with shard  44


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  45


  0%|                                                                                                                                  | 0/24021 [00:16<?, ?it/s]


done with shard  46


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  47


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  48


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  49


  0%|                                                                                                                                  | 0/24021 [00:17<?, ?it/s]


done with shard  50


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  51


  0%|                                                                                                                                  | 0/24021 [00:26<?, ?it/s]


done with shard  52


  0%|                                                                                                                                  | 0/24021 [01:01<?, ?it/s]


done with shard  53


  0%|                                                                                                                                  | 0/24021 [00:17<?, ?it/s]


done with shard  54


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  55


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  56


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  57


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  58


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  59


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  60


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  61


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  62


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  63


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  64


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  65


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  66


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  67


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  68


  0%|                                                                                                                                  | 0/24021 [00:24<?, ?it/s]


done with shard  69


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  70


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  71


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  72


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  73


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  74


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  75


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  76


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  77


  0%|                                                                                                                                  | 0/24021 [01:12<?, ?it/s]


done with shard  78


  0%|                                                                                                                                  | 0/24021 [00:15<?, ?it/s]


done with shard  79


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  80


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  81


  0%|                                                                                                                                  | 0/24021 [00:14<?, ?it/s]


done with shard  82


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  83


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  84


  0%|                                                                                                                                  | 0/24021 [00:13<?, ?it/s]


done with shard  85


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  86


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  87


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  88


  0%|                                                                                                                                  | 0/24021 [00:11<?, ?it/s]


done with shard  89


  0%|                                                                                                                                  | 0/24021 [00:29<?, ?it/s]


done with shard  90


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  91


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  92


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  93


  0%|                                                                                                                                  | 0/24021 [00:12<?, ?it/s]


done with shard  94


  0%|                                                                                                                                  | 0/24020 [00:11<?, ?it/s]


done with shard  95


  0%|                                                                                                                                  | 0/24020 [00:11<?, ?it/s]


done with shard  96


  0%|                                                                                                                                  | 0/24020 [00:11<?, ?it/s]


done with shard  97


  0%|                                                                                                                                  | 0/24020 [00:10<?, ?it/s]


done with shard  98


  0%|                                                                                                                                  | 0/24020 [00:11<?, ?it/s]

done with shard  99





In [23]:
from functools import reduce
from operator import iconcat

In [22]:
with tqdm(total=len(all_docs)) as pbar:
     results = await tqdm_asyncio.gather(*all_docs)

  0%|                                                                                                                                    | 0/100 [00:00<?, ?it/s]
  0%|                                                                                                                                    | 0/100 [00:00<?, ?it/s][A

  0%|                                                                                                                                  | 0/24021 [00:00<?, ?it/s][A[A


  0%|                                                                                                                                  | 0/24021 [00:00<?, ?it/s][A[A[A



  0%|                                                                                                                                  | 0/24021 [00:00<?, ?it/s][A[A[A[A




  0%|                                                                                                                                  | 0/24021 [00:0

 62%|████████████████████████████████████████████████████████████████████████                                            | 14917/24021 [2:16:29<26:25,  5.74it/s][A[A[A[A[A[A[A[A[A








100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:29<00:00,  2.93it/s][A[A[A[A[A[A[A[A[A










  0%|                                                                                                                | 1/24021 [2:16:29<54642:46:50, 8189.59s/it][A[A[A[A[A[A[A[A[A[A









 23%|██████████████████████████▉                                                                                        | 5638/24021 [2:16:29<5:11:33,  1.02s/it][A[A[A[A[A[A[A[A[A[A









 47%|█████████████████████████████████████████████████████▉                                                            | 11372/24021 [2:16:29<1:27:23,  2.41it/s][A[A[A[A[A[A[A[A[A[A









1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:23<00:00,  2.94it/s][A[A[A[A[A[A[A








  0%|                                                                                                                | 1/24021 [2:16:23<54598:57:17, 8183.02s/it][A[A[A[A[A[A[A[A







 26%|██████████████████████████████▎                                                                                    | 6340/24021 [2:16:23<4:26:15,  1.11it/s][A[A[A[A[A[A[A[A







 60%|█████████████████████████████████████████████████████████████████████▍                                              | 14373/24021 [2:16:23<51:42,  3.11it/s][A[A[A[A[A[A[A[A







100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:23<00:00,  2.94it/s][A[A[A[A[A[A[A[A









  0%|                              

  0%|                                                                                                                | 1/24021 [2:16:22<54595:10:44, 8182.46s/it][A[A[A[A[A[A[A[A







 44%|██████████████████████████████████████████████████▍                                                               | 10625/24021 [2:16:22<2:00:21,  1.85it/s][A[A[A[A[A[A[A[A







100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:22<00:00,  2.94it/s][A[A[A[A[A[A[A[A









  0%|                                                                                                                | 1/24021 [2:16:22<54595:51:45, 8182.56s/it][A[A[A[A[A[A[A[A[A








 48%|██████████████████████████████████████████████████████▊                                                           | 11538/24021 [2:16:22<1:43:17,  2.01it/s][A[A[A[A[A[A[A[A[A








100%|████████████████████

 43%|████████████████████████████████████████████████▋                                                                 | 10260/24021 [2:16:20<2:08:00,  1.79it/s][A[A[A[A[A[A[A[A[A[A[A










100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:20<00:00,  2.94it/s][A[A[A[A[A[A[A[A[A[A[A












  0%|                                                                                                                | 1/24021 [2:16:20<54583:14:55, 8180.67s/it][A[A[A[A[A[A[A[A[A[A[A[A











 45%|███████████████████████████████████████████████████▏                                                              | 10796/24021 [2:16:20<1:56:55,  1.89it/s][A[A[A[A[A[A[A[A[A[A[A[A











100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:20<00:00,  2.94it/s][A[A[A

  0%|                                                                                                                | 1/24021 [2:16:18<54571:00:04, 8178.83s/it][A[A[A


 44%|█████████████████████████████████████████████████▋                                                                | 10458/24021 [2:16:18<2:03:45,  1.83it/s][A[A[A


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:19<00:00,  2.94it/s][A[A[A




  0%|                                                                                                                | 1/24021 [2:16:18<54571:37:26, 8178.93s/it][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:19<00:00,  2.94it/s][A[A[A[A





100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 

  0%|                                                                                                                | 1/24021 [2:16:03<54465:43:48, 8163.06s/it][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:03<00:00,  2.94it/s][A[A[A[A





  0%|                                                                                                                | 1/24021 [2:16:03<54465:40:33, 8163.05s/it][A[A[A[A[A




100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24021/24021 [2:16:03<00:00,  2.94it/s][A[A[A[A[A






  0%|                                                                                                                | 1/24021 [2:16:03<54465:39:05, 8163.05s/it][A[A[A[A[A[A





100%|██████████████████████████████████████████████████████████████████████████████████████████

In [24]:
import json

In [25]:
from pathlib import Path
DATA_PATH = Path.cwd().parent.joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"

In [26]:
wikipedia_dump = DATA_PATH.joinpath("processed", "wikipedia")

In [27]:
assert wikipedia_dump.exists

In [28]:
def write_to_json(data, indice):
    with open(wikipedia_dump.joinpath(f"dump_{indice}"), 'w+') as buffer:
        for item in data:
            buffer.write(json.dumps(item, ensure_ascii=True) + "\n")

In [29]:
for indice, result in enumerate(results):
    results_flatten_ = reduce(iconcat, result, [])
    write_to_json(results_flatten_, indice)

#### Writing Congo News Dataset

In [None]:
DATA_PATH

In [None]:

DRC_NEWS_DATA_PATH = DATA_PATH.joinpath("corpus", "drc-news-txt")

In [None]:
import pandas as pd

In [None]:
data_file_path = DATA_PATH.joinpath("corpus", "raw", 'drc-news-raws.csv')

In [None]:
cd_news_data = pd.read_csv(data_file_path, names=["content", "posted_at"])

In [None]:
cd_news_data = cd_news_data.fillna(value="")
cd_news_data.head()

In [None]:
from haystack.nodes import TextConverter

In [None]:
from haystack.schema import Document
from secrets import token_hex

# @Todo: this is not working now , it was supposed to save the document to dataframe
def get_document_from_text(row):
    """numpy row with the text and the date of the post

    Args:
        row (_type_): _description_

    Returns:
        _type_: _description_
    """
    text = row[0].replace(u'\xa0', u' ')
    for paragraph in text.split("   "):
        if not paragraph.strip():  # skip empty paragraphs
            continue
        return Document(content=paragraph, meta={"posted_at":row[1] if row[1] else "" })

In [None]:
all_cd_news_docs = cd_news_data.apply(get_document_from_text, axis="columns")

In [None]:
len(all_cd_news_docs)

In [None]:
all_cd_news_docs = all_cd_news_docs.dropna().to_list()

In [None]:
from haystack.errors import HaystackError
from haystack.schema import Document
from typing import List, Optional, Generator, Set, Union
from copy import deepcopy
from haystack.nodes import PreProcessor

class CustomPreProcessor(PreProcessor):
    def __init__(self, custom_preprocessor=None, **kwargs):
        super().__init__(**kwargs)
        self.custom_preprocessor = custom_preprocessor
    def clean(
        self,
        document: Union[dict, Document],
        clean_whitespace: bool,
        clean_header_footer: bool,
        clean_empty_lines: bool,
        remove_substrings: List[str],
        id_hash_keys: Optional[List[str]] = None,
    ) -> Document:
        """
        
        Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
        and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().
        """
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if isinstance(document, dict):
            document = Document.from_dict(document, id_hash_keys=id_hash_keys)

        # Mainly needed for type checking
        if not isinstance(document, Document):
            raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")
        text = document.content
        text = self.custom_preprocessor(text)
        if clean_header_footer:
            text = self._find_and_remove_header_footer(
                text, n_chars=300, n_first_pages_to_ignore=1, n_last_pages_to_ignore=1
            )

        if clean_whitespace:
            lines = text.splitlines()

            cleaned_lines = []
            for line in lines:
                line = line.strip()
                cleaned_lines.append(line)
            text = "\n".join(cleaned_lines)

        if clean_empty_lines:
            text = re.sub(r"\n\n+", "\n\n", text)

        for substring in remove_substrings:
            text = text.replace(substring, "")

        if text != document.content:
            document = deepcopy(document)
            document.content = text

        return document
    
    

In [None]:
from  functools import reduce
from operator import iconcat

In [None]:
all_cd_news_docs[0]

With our document indexted int the elastic searh we can search , use the piaf dataset which have question with answers without paragraph and leverage them.

In [None]:
from collections import deque

In [None]:
import re
from gensim.utils import deaccent
from unicodedata import normalize as unicode_normalize

In [None]:
def replace_point(document):
    """replace the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    Args:
        document (_type_): _description_
    """
    result = re.sub(r"(\S)\.(\S)", r"\1 . \2", document)
    return result

def replace_website_name(document):
    """sometimes the doucment has the name politico.cd or 7sur7.cd or actualite.cd, we would like to replace them by the 
    actual name of the website. before proper cleaning

    Args:
        document (_type_): _description_
    """
    # @TODO : not sure if this will work but , way better replace by the first line of match.
    
    result = re.sub(r"7SUR7.CD|politico.cd|actualite.cd|mediacongo.net", r"SITE_WEB", document, flags=re.IGNORECASE)
    return result

def remove_accents(document):
    input_without_accent = deaccent(document)
    return input_without_accent

def pre_clean_document(document):
    """pre clean the document by removing the accents and replacing the point with the wwt.www with space point before tokenizing the document .
    TOdos : this may have a a downside when the point is in the middle of a words
    and any other side of cleaning that we want to do .
    Args:
        document (_type_): _description_
    """
    result = remove_accents(document)
    result =  replace_website_name(result)
    result = replace_point(result)
    result = re.sub(r"This post has already been read \d+ times!", "", result) # remove unwanted text
    result = unicode_normalize("NFKD", result)
    return result

In [None]:
preprocessor = CustomPreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    language="fr",
    custom_preprocessor=pre_clean_document,
)


cd_news_docs = preprocessor.process(all_cd_news_docs)


In [None]:

print(f"\nn_docs_output: {len(cd_news_docs)}")

In [None]:
document_store.write_documents(cd_news_docs)


After saving both the wikipedia articles and the congo news websites articles to the document store , we need to initialize the retriever and use the dense passage retrieval retriever to get the documents.


In [None]:
from haystack.nodes import DensePassageRetriever

In [None]:
dense_passage_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="etalab-ia/dpr-question_encoder-fr_qa-camembert",
    passage_embedding_model="etalab-ia/dpr-ctx_encoder-fr_qa-camembert",
    infer_tokenizer_classes=True,
)


In [None]:
document_store.update_embeddings(dense_passage_retriever)


#### Reading the Question Answering Datasets.

In [None]:
import pandas as pd

In [None]:
from pathlib import Path
DATA_PATH = Path.cwd().joinpath("data")
assert DATA_PATH.exists(), "the data path does not exist"

In [None]:
piaf_file = DATA_PATH.joinpath("corpus", "raw", "piaf", "questoin-reponse.csv")

In [None]:
assert piaf_file.exists(), "the piaf dataset does not exist"

piaf_question = data

In [None]:
piaf_df_without_context = pd.read_csv(piaf_file)

In [None]:
sample_question_response = piaf_df_without_context.sample(1)
question = deaccent(sample_question_response.question.values[0])
response = deaccent(sample_question_response.reponse.values[0])


In [None]:
question

In [None]:
def get_positive_context(retriever: BM25Retriever, search_query:str, answer:str, positive_documents: int = 100) -> List[Document]:    
        """given entitity retrieve the positive context
        we will first retrieve the top  100 documents , 
        - if the answer is in the top 40 document the input of the reader is the top 40 documents
        if the top 40 documents does not contain the answer we check whithin the top 41 to 100 document if the anwer is ther and we put it ther.
        other wise we discard the sentence

        Args:
            retriever (BM25Retriever): _description_
            n_ctxs (int, optional): _description_. Defaults to 15.
            entity (Entity, optional): _description_. Defaults to None.
        """
        list_pos_ctxs = []
        retrieved_docs = retriever.retrieve(query=search_query, top_k=positive_documents)
        for index, retrieve_doc in enumerate(retrieved_docs[0:40]):
            if answer.lower() in retrieve_doc.content.lower():
                list_pos_ctxs.append(
                    {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                )
        if len(list_pos_ctxs) == 0:
            for index, retrieve_doc in enumerate(retrieved_docs[40:100]):
                if answer.lower() in retrieve_doc.content.lower():
                    list_pos_ctxs.append(
                        {"title": retrieve_doc.meta.get("title"), "content": retrieve_doc.content}
                    )
        else:
            pass
        if len(list_pos_ctxs) == 0:
            return []
        return list_pos_ctxs

#### Use Piaf Dataset to query context

In [None]:
piaf_dataset = load_dataset("piaf")

In [None]:
piaf_dataset = piaf_dataset["train"]

In [None]:
for index in tqdm(piaf_df_without_context.index):
    question = piaf_df_without_context.loc[index].question
    response = piaf_df_without_context.loc[index].reponse
    question = deaccent(question)
    response = deaccent(response)
    retrieved_docs = get_positive_context(retriever=bm25_retriever, search_query=question, answer=response, positive_documents=100)
    piaf_df_without_context.loc[index, "positive_context"] = retrieved_docs

By using our wiki corpus we are able to find some question with positive context, we will leverage them to build our qa system

In [None]:
piaf_with_context = piaf_df_without_context.loc[piaf_df_without_context.positive_context.apply(lambda x: len(x)) > 0]

In [None]:
piaf_with_context.shape

with our dataset , we can find that {{piaf_with_context.shape[0]}} have positive context and that will be usefull to fine tune our model.

In [None]:
piaf_with_context = piaf_with_context.assign(length_positive_context =piaf_with_context["positive_context"].apply(lambda x: len(x)))

In [None]:
piaf_with_context.loc[piaf_with_context.length_positive_context > 1].sort_values(by="length_positive_context", ascending=False)

In [None]:
retrieved_docs = bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=30)

In [None]:
retrieved_docs

In [None]:
piaf_with_context.to_csv(DATA_PATH.joinpath("corpus", "raw", "piaf", "piaf_with_context.csv"))

within our dataset , 740 row have more than one context.

With our dataset with context, let us pull more question with context for the original piaf dataset with context

In [None]:
piaf_dataset = load_dataset("piaf")

In [None]:
piaf_dataset = piaf_dataset["train"]

In [None]:
piaf_df = piaf_dataset.to_pandas()

In [None]:
piaf_df.head()

What are we trying to achive with this ?


We have question on the piaf dataset with context, for those question we will query elastic search to find out  the additional context. That context will will be consider as additional context to train our model with.


The algo : 

- for each question , query the elastic search and keep the top 10 documents context
- we will loop and send queries in a batch of 10 questions to elastic search
- retrieved the context and and save anything on the disk.

In the future to improve the quality of our finding we can consider only documents with named entities in the answers. We can check the paragraph , run the NER model on it and then check if the answer is in the named entities.

In [None]:
sample_questions = np.vectorize(deaccent)(piaf_df.question.loc[1:5])
retrieved_docs = bm25_retriever.retrieve_batch(queries=sample_questions.tolist(), top_k=30)

In [None]:
async def query_batch(retriever, queries, top_k=10):
    return retriever.retrieve_batch(queries=queries, top_k=top_k)

In [None]:
def decent_vectorize(queries):
    return np.vectorize(deaccent)(queries)

In [None]:
question_chunk = [decent_vectorize(piaf_df.question.loc[1:5]), decent_vectorize(piaf_df.question.loc[5:10])]

In [None]:
question_chunk

In [None]:
async def main():
    return await tqdm_asyncio.gather(*[query_batch(bm25_retriever, queries) for queries in question_chunk])

In [None]:
def write_to_json(data, path):
    with open(path, "w") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [None]:
question_chunk[0][-1]

In [None]:
answers[0][-1]

In [None]:
piaf_dataset[0]

In [None]:
import json

In [None]:
async def process_queries_chunk(retriever:BM25Retriever, queries_chunk: pd.DataFrame):
    """tak a query chunk process it , query the elastic, instance and write the document to the file.

    Args:
        queries_chunk (pd.DataFrame): _description_
    """
    questions = decent_vectorize(queries_chunk.question)
    titles = queries_chunk.title
    answers = [answer.get("text")[0] for answer in queries_chunk.answers]
    contexts = queries_chunk.context
    ids = queries_chunk.id
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=10)
    process_answers(ids=ids, 
                    questions=questions,
                    titles=titles,
                    answers=answers,
                    contexts=contexts,
                    retrieved_docs=retrieved_docs)

In [None]:
def process_doc(retrieved_docs):
    """
    convert a list of retrieved document to list of array of list and content
    """
    doc_list = []
    for doc in retrieved_docs:
        doc_ = {
            "title": doc.meta.get("title"),
            "content": doc.content,
            }
        doc_list.append(doc_)
    return doc_list

In [None]:
piaf_with_multi_context_path = DATA_PATH.joinpath("corpus", "french-qa", "piaf-with-multi-context")
assert piaf_with_multi_context_path.exists()

In [None]:
def process_answers(ids, questions, titles, answers, contexts, retrieved_docs):
    """process answers and write them to the file.

    Args:
        questions (_type_): _description_
        titles (_type_): _description_
        answers (_type_): _description_
        contexts (_type_): _description_
        retrieved_docs (_type_): _description_
    """
    for id_, question, title, answer, context, retrieved_doc in zip(ids, questions, titles, answers, contexts, retrieved_docs):
        contexts = [{"title": title, "content": context}]
        additional_context = process_doc(retrieved_doc)
        contexts.extend(additional_context)
        instance_json = {
            "question": question,
            "answer": answer,
            "contexts": contexts,
            "id": id_,
        }
        yield instance_json

    

In [None]:
def save_to_json(instances, path):
    "saves an iterator of multiple json files to files in the path directory"
    for instance in instances:
        with open(path.joinpath(f"{instance['id']}.json"), "w") as f:
            json.dump(instance, f, indent=4, ensure_ascii=False)

In [None]:
async def main():
    return await tqdm_asyncio.gather(*[process_queries_chunk(bm25_retriever, queries) for _, queries in piaf_df.groupby(np.arange(len(piaf_df))//5)])

In [None]:
%%script false --no-raise-error
await main()

In [None]:
bm25_retriever.retrieve(query=
    deaccent("De quelle langue est issue le mot mycelium ?"), top_k=10)

##### Processing Fquad

In [None]:
frquad_path_train = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "train.json")
frquad_path_valid = DATA_PATH.joinpath("corpus", "french-qa", "fquad", "valid.json")

In [None]:
assert frquad_path_train.exists()
assert frquad_path_valid.exists()

In [None]:
with open(frquad_path_train, "r") as f:
    frquad_train = json.load(f)

In [None]:
frquad_train = frquad_train.get("data")

In [None]:
with open(frquad_path_valid, "r") as f:
    frquad_valid = json.load(f)

In [None]:
frquad_valid = frquad_valid.get("data")

In [None]:
frquad_valid[0]

In [None]:
from itertools import repeat

In [None]:
fquad_path = DATA_PATH.joinpath("corpus", "french-qa", "fquad-with-multi-context")
fquad_path_output_train = fquad_path.joinpath("train")
fquad_path_output_valid = fquad_path.joinpath("valid")

In [None]:
for doc in frquad_train[0:2]:
    print(doc.get("title"))

In [None]:
async def process_fquad_paragraph(retriever, paragraph, title, path):
    """
    this will loop over the  a paragraph in frquad dataset.
    since each paragraph have a list of question ,
     it will return query the elasticsearch for each batch of question 
    and return the additional related_context

    Args:
        paragraph (_type_): _description_
    """
    context  = paragraph.get("context")
    question_answers = paragraph.get("qas")
    ids = [qa.get("id") for qa in question_answers]
    questions = [qa.get("question") for qa in question_answers]
    answers = [qa.get("answers")[0].get("text") for qa in question_answers]
    retrieved_docs = retriever.retrieve_batch(queries=questions, top_k=10)
    instances = process_answers(ids=ids, 
                                questions=questions,
                                titles=repeat(title, len(questions)),
                                answers=answers,
                                contexts=repeat(context, len(questions)),
                                retrieved_docs=retrieved_docs)
    save_to_json(instances, path)

In [None]:
assert fquad_path.exists()

In [None]:
def check_answer_in_retrieved_docs(answer, retrieved_docs):
    for doc in retrieved_docs:
        if answer in doc.content:
            print(doc)
            return True
    return False

In [None]:
retrieved_docs = bm25_retriever.retrieve(query=deaccent("Combien de fois Piazzi est-il parvenu à observer Cérès?"), top_k=30)


In [None]:
await process_fquad_paragraph(bm25_retriever, frquad_train[0]["paragraphs"][0], "Cérès", fquad_path_output_train)

In [None]:
async def process_fquad(fquad, path, retriever):
    all_article_processor = list()
    for document in fquad:
        title = document.get("title")
        paragraphs = document.get("paragraphs")
        doc_coroutine = tqdm_asyncio.gather(*[process_fquad_paragraph(retriever=bm25_retriever, paragraph=paragraph, title=title, path=path) for paragraph in paragraphs])
        all_article_processor.append(doc_coroutine)
    return await tqdm_asyncio.gather(*all_article_processor)

In [None]:
await process_fquad(frquad_train, fquad_path_output_train, bm25_retriever)

In [None]:
await process_fquad(frquad_valid, fquad_path_output_valid, bm25_retriever)

At this point we have the piaf dataset with context, we have the frquad with context, let us look now the exetat questions with context.

In [None]:
exetat_questions_path = DATA_PATH.joinpath("corpus", "french-qa", "exetat-questions", "questions.json")

In [None]:
assert exetat_questions_path.exists()

In [None]:
with open(exetat_questions_path, "r") as f:
    exetat_questions = json.load(f)

In [None]:
len(exetat_questions)

for now we can say that we have 173 exams questions, we need more and get more context to train.frquad_train
But for now we are going to train our model on the conbinaision of piaf and frenchquad dataset.