## Install cdQA

In [None]:
# One-time setup: uncomment the lines below to install cdQA from source.
# NOTE(review): each `!` line runs in its own subshell, so `! cd cdQA` does not
# change the directory seen by the following `! pip install -e .`. If these
# lines are re-enabled, install with `%pip install -e ./cdQA` (or use `%cd`)
# so the package is installed from the cloned folder into the kernel's env.
#! git clone https://github.com/cdqa-suite/cdQA.git
#! cd cdQA
#! pip install -e .

In [1]:
# cdQA entry points used by this notebook: the question-answering pipeline
# class and the helper that downloads a trained reader model.
# (The original import misspelled the class as `QAPipelineipeline`, which does
# not match the `QAPipeline` name used when the pipeline is constructed.)
from cdqa.pipeline import QAPipeline
from cdqa.utils.converters import pdf_converter
from cdqa.utils.download import download_model

In [13]:
import pandas as pd

# Show full cell contents instead of truncating long strings.
# `None` is the supported "no limit" value in current pandas; the old `-1`
# sentinel is deprecated and rejected by recent releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here; -1 means "no limit".
    pd.set_option('display.max_colwidth', -1)

## Download BERT Model

In [5]:
# Download the trained 'bert-squad_1.1' model into the local ./models folder.
download_model(
    model='bert-squad_1.1',
    dir='./models',
)


Downloading trained model...
100% [......................................................................] 438037911 / 438037911

## Load PDF

In [9]:
import PyPDF2

# Open the source PDF in binary mode and wrap it in a PyPDF2 reader.
# NOTE(review): the file handle is never closed. The page-extraction cell still
# reads through `pdfReader`, so it presumably has to stay open until then;
# confirm, and close it once the text has been extracted.
# NOTE(review): `PdfFileReader` / `numPages` are the legacy PyPDF2 names; check
# them against the installed PyPDF2 version before upgrading.
pdfFileObj = open('./docs/Q3_PDF.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  
# Print the number of pages in the PDF as a quick sanity check.
print(pdfReader.numPages)

10


In [45]:
# Extract the text of every page, trimming surrounding whitespace and
# flattening line breaks into spaces so each page becomes one string.
all_text = [
    pdfReader.getPage(page_index).extractText().strip().replace('\n', ' ')
    for page_index in range(pdfReader.numPages)
]

# Single-row frame: one document titled 'Q3_PDF' whose 'paragraphs' cell holds
# the list of per-page strings.
df = pd.DataFrame({'title': ['Q3_PDF'], 'paragraphs': [all_text]})

## Fit Pipeline to PDF File

In [40]:
# Build the QA pipeline from the serialized reader at ./models/bert_qa.joblib.
# max_df=5 is passed through as given (the fitted pipeline's repr shows it on
# the retriever).
cdqa_pipeline = QAPipeline(
    reader='./models/bert_qa.joblib',
    max_df=5,
)

In [46]:
# Fit the pipeline's retriever on the one-document DataFrame built from the PDF.
# Kept as the cell's last expression so the fitted pipeline's repr is displayed.
cdqa_pipeline.fit_retriever(df=df)

QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po...ch_size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=5, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

## Feed Query

In [55]:
# Questions to ask about the document, in the order they are reported later.
queries = [
    'Which virus variants of covid 19 are detected in Malaysia?',
    'What is the mean age of patient that Brought In Dead?',
    'Which state has the highest number of patients that died in hospital?',
    'Why foreigners have higher number of BID than Malaysians?',
]

# One prediction per question, kept in the same order as `queries`.
answers = [
    cdqa_pipeline.predict(query=question_text)
    for question_text in queries
]
    

In [54]:
# Print each question with its answer and accumulate the same lines in `text`,
# which is later passed to the text-to-speech step.
text = ''
for idx, (query_text, prediction) in enumerate(zip(queries, answers)):
    question = f'Question {idx} : {query_text}\n'
    # The first element of each prediction is the answer string.
    answer = f'Answer : {prediction[0]}\n'
    # Equivalent to printing the two strings one after the other.
    print(question, answer, sep='\n')
    text += question + answer

Question 0 : Which virus variants of covid 19 are detected in Malaysia?
Answer : Delta and Omicron
Question 1 : What is the mean age of patient that Brought In Dead?
Answer : 59.59
Question 2 : Which state has the highest number of patients that died in hospital?
Answer : Sabah
Question 3 : Why foreigners have higher number of BID than Malaysians?
Answer : they might not be well-educated onCOVID-19 symptoms


## Save Audio

In [18]:
import gtts

In [24]:
import os

# Convert the accumulated question/answer text to speech and save it as MP3.
audio_path = './audio/query_answer.mp3'

# Create the output folder first: saving fails if ./audio does not exist yet
# (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(audio_path), exist_ok=True)

tts = gtts.gTTS(text)
tts.save(audio_path)

In [51]:
# Inspect the raw predictions. The displayed output shows each entry as a tuple
# whose first two items are the answer text and the document title, followed by
# the text of the passage the answer was taken from.
answers

[('Delta and Omicron',
  'Q3_PDF',
  '338 Med J Malaysia Vol 77 No 3 May 2022ABSTRACTBackground: Measuring the success of the control ofCOVID-19 in any country includes a review of the mortalityespecially to compare the deaths of those dying in hospitalsand those brought in dead (BID). The objective of this studywas to compare the death groups with the demographicfactors that influenced the type of death.Methods: This was a case-control study (1:1 ratio) looking atCOVID-19 secondary public data from March 2020 toFebruary 2021. Data such as the basic demographic dataand comorbidities were analysed descriptively and thenusing a binary-logistic regression analysis to compare theindependent variables against the outcome of BID. From thedatabase, 120 cases were included as BID (4 excluded dueto insufficient information) and 120 patients from the 1006who passed away in hospital were randomly selected ascomparators. The data was analysed in SPSS v21.0. Results: The mean age for the BID was 59