# Notebook to test the module AnswerManager

In [21]:
import pandas as pd
import os
import sys
from pathlib import Path
from time import time 
from tqdm import tqdm

sys.path.append("../../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 

from narval.utils import get_data_dir, FileSystem
from narval.pdfreader import PDFReader
from narval.pagefinder import PageFinder
from narval.qamodel import T5QuestionAnswering, Llama3QuestionAnswering
from narval.answermanager import AnswerManager



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
pd.options.display.max_columns = None
pd.set_option("max_colwidth", None)
#pd.options.display.max_rows = None

### Extract text

In [3]:
data_dir = get_data_dir()
year = "2021"      # year of the report
file_path = "/data/input/pdfs/"
file_name = f"RPQS_SIDEALF_AC_{year}.pdf"
FILE_PATH = data_dir + file_path + file_name

pdf_reader = PDFReader(FILE_PATH)
pages = pdf_reader.textpages

### Find relevant pages for each question

In [114]:
data_dir = get_data_dir()
question_keyword_path = data_dir + "/data/input/question_keyword_malou.csv"

competence = "assainissement collectif"
pagefinder = PageFinder(question_keyword_path, competence)
pages_df = pagefinder.extract_relevant_pages(pages)

pages_df.head()

Unnamed: 0,indicator,question,keyword_regex,relevant_pages
0,D203.0,Quelle est la valeur de l'indicateur D203.0,D203.0,"[1, 10]"
1,D203.0,Quelle est la quantité de boues évacuées (D203.0),boues,"[1, 2, 10, 14, 15, 18, 19]"
2,D204.0,Quelle est la valeur de l'indicateur D204.0,D204.0,"[1, 13]"
3,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau (D204.0),prix,"[0, 14, 22]"
4,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau ?,prix,"[0, 14, 22]"


In [115]:
# Modify one row with an empty list for testing 
pages_df.at[1, "relevant_pages"] = []
pages_df.head()

Unnamed: 0,indicator,question,keyword_regex,relevant_pages
0,D203.0,Quelle est la valeur de l'indicateur D203.0,D203.0,"[1, 10]"
1,D203.0,Quelle est la quantité de boues évacuées (D203.0),boues,[]
2,D204.0,Quelle est la valeur de l'indicateur D204.0,D204.0,"[1, 13]"
3,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau (D204.0),prix,"[0, 14, 22]"
4,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau ?,prix,"[0, 14, 22]"


### Predict answers with the T5-base model

In [116]:
t5_model = T5QuestionAnswering(model_name="google/flan-t5-base")



Device =  cpu


Ask all questions (or a subset to save time)

In [117]:
prompt_params = {
    "year": year
}

prompt_version="T5_prompt_v1"

In [118]:
answer_list_list = []
question_list = []
for idx in tqdm(pages_df.index[:5]):      # is it really faster than iterrows ?
    _, question, _, relevant_pages_list = pages_df.loc[idx]  # Be careful memory is accessed
    # Compute the answer_list for this question
    answer_list = []
    for page_num in relevant_pages_list:
        context = pages[page_num]
        prompt = t5_model.format_prompt(context, question, prompt_params, prompt_version)
        answer = t5_model.predict(prompt)
        answer_list.append(answer)
    question_list.append(question)
    answer_list_list.append(answer_list)

    print("Answers = ", answer_list)

# Update the dict
question_answer_dict = {"question": question_list, "answer_list": answer_list_list}
    

 20%|██        | 1/5 [01:07<04:28, 67.07s/it]

Answers =  ['D203.0', '1.10.1. Quantites de boue']
Answers =  []


 60%|██████    | 3/5 [02:23<01:31, 45.88s/it]

Answers =  ['2.2. Type of assain', '2.2. Facture of assa']


 80%|████████  | 4/5 [03:41<00:56, 56.71s/it]

Answers =  ['m3 for water (D204.0', "L'objectif est d'aller vers", '0']


100%|██████████| 5/5 [04:43<00:00, 56.73s/it]

Answers =  ['L2224 - 5 du code general', 'en EUR', 'EUR']





### Test the answer cleaning pipeline

Initialize the answer manager

In [119]:
indic_bound_file_path = "../../data/input/indicateurs.csv"
answer_manager = AnswerManager(pages_df, indic_bound_file_path)

In [120]:
answer_manager.indic_bound_df.head()

Unnamed: 0,code_ip,min_warning_ip,max_warning_ip,min_critic_ip,max_critic_ip
0,P205.3,0.0,100.0,0.0,100.0
1,P203.3,0.0,100.0,0.0,100.0
2,P204.3,0.0,100.0,0.0,100.0
3,P253.2,0.0,3.0,0.0,100.0
4,P103.2A,0.0,100.0,0.0,100.0


In [121]:
answer_manager.detailed_answer_df.head(6)

Unnamed: 0,indicator,question,keyword_regex,relevant_pages,answer_list
0,D203.0,Quelle est la valeur de l'indicateur D203.0,D203.0,"[1, 10]","[Not yet asked, Not yet asked]"
1,D203.0,Quelle est la quantité de boues évacuées (D203.0),boues,[],[]
2,D204.0,Quelle est la valeur de l'indicateur D204.0,D204.0,"[1, 13]","[Not yet asked, Not yet asked]"
3,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau (D204.0),prix,"[0, 14, 22]","[Not yet asked, Not yet asked, Not yet asked]"
4,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau ?,prix,"[0, 14, 22]","[Not yet asked, Not yet asked, Not yet asked]"
5,D204.0,Quelle est la valeur de D204.0,D204.0,"[1, 13]","[Not yet asked, Not yet asked]"


In [122]:
print(f"There are {answer_manager.detailed_answer_df.shape[0]} questions to be asked")

There are 41 questions to be asked


Update the detailed_answer_df

In [123]:
answer_manager.fill_detailed_answer_df(question_answer_dict)

In [124]:
answer_manager.detailed_answer_df.head(6)

Unnamed: 0,indicator,question,keyword_regex,relevant_pages,answer_list
0,D203.0,Quelle est la valeur de l'indicateur D203.0,D203.0,"[1, 10]","[D203.0, 1.10.1. Quantites de boue]"
1,D203.0,Quelle est la quantité de boues évacuées (D203.0),boues,[],[]
2,D204.0,Quelle est la valeur de l'indicateur D204.0,D204.0,"[1, 13]","[2.2. Type of assain, 2.2. Facture of assa]"
3,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau (D204.0),prix,"[0, 14, 22]","[m3 for water (D204.0, L'objectif est d'aller vers, 0]"
4,D204.0,Quel est le prix du service au m3 pour l'assinissement de l'eau ?,prix,"[0, 14, 22]","[L2224 - 5 du code general, en EUR, EUR]"
5,D204.0,Quelle est la valeur de D204.0,D204.0,"[1, 13]","[Not yet asked, Not yet asked]"


Clean step by step

In [125]:
answer_manager.build_answer_df()
answer_manager.answer_df.head(6)

Unnamed: 0,indicator,concat_answer_list
0,D203.0,"[D203.0, 1.10.1. Quantites de boue]"
1,D204.0,"[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


In [127]:
# Add artifically an answer with "je ne trouve pas" for testing 
(answer_manager.answer_df
 .loc[0, "concat_answer_list"]
 .append("blabla Je ne trOuve pas la réponse!")
)
answer_manager.answer_df.head(6)

Unnamed: 0,indicator,concat_answer_list
0,D203.0,"[D203.0, 1.10.1. Quantites de boue, blabla Je ne trOuve pas la réponse!, blabla Je ne trOuve pas la réponse!]"
1,D204.0,"[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


In [128]:
answer_manager.clean_answers()

answer_manager.answer_df.head(6)

Unnamed: 0,indicator,clean_answer_list,concat_answer_list
0,D203.0,[je ne trouve pas],"[D203.0, 1.10.1. Quantites de boue, blabla Je ne trOuve pas la réponse!, blabla Je ne trOuve pas la réponse!]"
1,D204.0,"[204.0, 0.0, 2224.0, 5.0, Not yet asked]","[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


In [129]:
answer_manager.exclude_oob_answers()

answer_manager.answer_df.head(6)

Unnamed: 0,indicator,filtered_answer_list,clean_answer_list,concat_answer_list
0,D203.0,[je ne trouve pas],[je ne trouve pas],"[D203.0, 1.10.1. Quantites de boue, blabla Je ne trOuve pas la réponse!, blabla Je ne trOuve pas la réponse!]"
1,D204.0,"[0.0, 5.0, Not yet asked]","[204.0, 0.0, 2224.0, 5.0, Not yet asked]","[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


In [130]:
answer_manager.select_one_answer_per_indic()
answer_manager.answer_df.head(6)

Unnamed: 0,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,D203.0,je ne trouve pas,[je ne trouve pas],[je ne trouve pas],"[D203.0, 1.10.1. Quantites de boue, blabla Je ne trOuve pas la réponse!, blabla Je ne trOuve pas la réponse!]"
1,D204.0,5.0,"[0.0, 5.0, Not yet asked]","[204.0, 0.0, 2224.0, 5.0, Not yet asked]","[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


Alternatively ... in one line

In [104]:
answer_manager.apply_full_cleaning_pipeline()
answer_manager.answer_df.head(6)

Unnamed: 0,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,D203.0,167.046,"[167.046, 173.486, 203.0, 100.0]","[167.046, 173.486, 203.0, 100.0]","[D203.0, 1.10.1. Quantites de boue, 2.1., D203.0, 167.046 t 173.486, D203.0, Total des ventes d'eau, d203.0, 100 %]"
1,D204.0,5.0,"[0.0, 5.0, Not yet asked]","[204.0, 0.0, 2224.0, 5.0, Not yet asked]","[2.2. Type of assain, 2.2. Facture of assa, m3 for water (D204.0, L'objectif est d'aller vers, 0, L2224 - 5 du code general, en EUR, EUR, Not yet asked, Not yet asked]"
2,P201.1,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
3,P202.2B,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
4,P204.3,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"
5,P205.3,Not yet asked,[Not yet asked],[Not yet asked],"[Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked, Not yet asked]"


### Save results

In [149]:
year = "2021"      # year of the report
competence = "AC"
pdf_name = f"RPQS_SIDEALF_{competence}_{year}.pdf"

df = answer_manager.answer_df.copy()
df.insert(0, "pdf_name", pdf_name)
df.insert(0, "competence", competence)
df.insert(0, "year", year)

df.head()

Unnamed: 0,year,competence,pdf_name,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,2021,AC,RPQS_SIDEALF_AC_2021.pdf,D203.0,2021.0,"[2021.0, 167.046, 173.486, 2021.0, 100.0]","[2021.0, 167.046, 173.486, 2021.0, 100.0]","[2021, 1.10. Quantités de boues, Total of dry matter evacuated, D203.0, 167.046 t 173.486, D203.0, Total des ventes d’eau (, 2021, 100 %]"
1,2021,AC,RPQS_SIDEALF_AC_2021.pdf,D204.0,20.0,"[20.0, 20.0, 0.0, 0.0]","[20.0, 20.0, 0.0, 1000.0, 0.0, 1000.0]","[D204.0, 2.2. Facture of assa, m3 for water assistance (D20, m3 for water assistance (D20, 0 for 1000 abonnés, m3 for water?, m3, 0 for 1000 abonnés]"
2,2021,AC,RPQS_SIDEALF_AC_2021.pdf,P201.1,,[],[],[]
3,2021,AC,RPQS_SIDEALF_AC_2021.pdf,P202.2B,,[],[],[]
4,2021,AC,RPQS_SIDEALF_AC_2021.pdf,P204.3,,[],[],[]


In [160]:
data_dir = get_data_dir()
output_path = "/data/output/answers/"
output_dir = Path(data_dir+output_path)
output_dir.mkdir(parents=True, exist_ok=True)

output_file = pdf_name.split(".")[0]+"_answers.csv"
output_path = output_dir / output_file

fs = FileSystem()
fs.write_df_to_csv(df, output_path)


In [159]:
from unidecode import unidecode

mylist = ['blba', "#éà", "€100"]
mylist = [unidecode(x) for x in mylist]
mylist

['blba', '#ea', 'EUR100']