# Notebook to test the module `pipeline.py`


### Import modules

In [1]:
import sys
import pandas as pd

sys.path.append("../../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 

from narval.pipeline import Pipeline, merge_question_answer_dicts
from narval.utils import FileSystem, get_data_dir

  machar = _get_machar(dtype)
  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Do not truncate long cell contents when DataFrames are displayed
pd.set_option("display.max_colwidth", None)

### Choose the input parameters

In [2]:
# --- Input files (all located in `data/input`) ---
# CSV listing the PDFs to be read and questioned
rpqs_eval_list_file = "rpqs_eval_list_1.csv"
# CSV describing the indicators
indicator_file = "indicateurs_v3.csv"
# CSV describing the questions
question_file = "question_keyword_v3.csv"

# --- Model settings ---
# Model type. NOTE: Meta-Llama-3-8B-Instruct is reported to give the best results to date;
# a different model (google/flan-t5-base) is selected here.
model_name = "google/flan-t5-base"
# Prompt version. NOTE: "Llame_prompt_v7" (presumably "Llama_prompt_v7" - TODO confirm the exact
# name) is reported to give the best results to date.
prompt_version = "T5_prompt_v1"

# --- Output and post-processing ---
# Subfolder where answers will be saved (in `data/output/benchmark_*/answers/`)
benchmark_version = "test_pipeline"
# Whether hallucinations must be removed in the cleaning step
remove_hallucinations = True

### Import the dataframe containing the list of PDFs to be read and questioned

In [3]:
# Instantiate the File System (local file system or S3 bucket)
fs = FileSystem()
# Get the directory containing the folder `data`
data_dir = get_data_dir()
# Import the dataframe containing the list of PDFs to be read and questioned
eval_df = fs.read_csv_to_df(data_dir + "/data/input/" + rpqs_eval_list_file, sep=";", 
                            usecols=["pdf_name", "collectivity", "year", "competence"])
# Show the first rows of this dataframe
eval_df.head()

Unnamed: 0,pdf_name,collectivity,year,competence
0,RPQS_Allain_AC_2021,Allain,2021,assainissement collectif
1,RPQS_Alloue_AC_2021,Alloue,2021,assainissement collectif
2,RPQS_Brillac_AC_2021,Brillac,2021,assainissement collectif
3,RPQS_Rully_AC_2021,Rully,2021,assainissement collectif
4,RPQS_Jons_AC_2022,Jons,2022,assainissement collectif


### Choose a PDF

In [5]:
# Name (without extension) of the PDF to process; it must appear in the `pdf_name` column of `eval_df`
pdf_name = "RPQS_Alloue_AC_2021"

pdf_file = pdf_name + ".pdf"

# Look the PDF up once instead of re-running the same query for every field
pdf_rows = eval_df.query("pdf_name == @pdf_name")
if pdf_rows.empty:
    # Fail with an explicit message rather than an opaque IndexError raised by `.values[0]`
    raise ValueError(f"PDF name '{pdf_name}' was not found in the `pdf_name` column of `eval_df`")

# Metadata of the selected PDF (the first matching row is used, as before)
collectivity = pdf_rows["collectivity"].values[0]
year = pdf_rows["year"].values[0]
competence = pdf_rows["competence"].values[0]

### Instantiate the pipeline

In [6]:
# Gather the parameters chosen above and hand them to the pipeline as keyword arguments
pipeline_kwargs = {
    "question_file": question_file,
    "indicator_file": indicator_file,
    "benchmark_version": benchmark_version,
    "model_name": model_name,
    "prompt_version": prompt_version,
}
pipeline = Pipeline(**pipeline_kwargs)

### Log in to HuggingFace Hub if not yet done 
Needed for Llama3 models. Make sure you have first created a Hugging Face token on your HuggingFace profile and saved it in the environment variable `HF_TOKEN`.

In [53]:
# This step needs to be run only once.
# There is no need to run it if you have already logged in to the HuggingFace Hub previously.
# Set `log_in_to_hf_hub = True` if the pipeline below raises an `AttributeError` inviting you
# to log in to the HuggingFace Hub.
log_in_to_hf_hub = False

if log_in_to_hf_hub:
    # Imported here because they are only needed for this optional step
    import os
    from huggingface_hub import login

    # The token is read from the environment so that it never appears in the notebook
    hf_token = os.environ["HF_TOKEN"]
    login(token=hf_token)

### Run the pipeline, step by step (for testing)

Extract text

In [7]:
# Run the extraction step on the selected PDF, then unpack its three results
print(f"Extract text (and tables) from pdf {pdf_file} ...")
extraction_result = pipeline.extract_text_from_pdf(pdf_file)
pdf_pages, pdf_tables, toc_indices = extraction_result
print("Done")

Extract text (and tables) from pdf RPQS_Alloue_AC_2021.pdf ...
Done


Get the segmentation df

In [37]:
# Build the segmentation dataframe from the outputs of the extraction step
print("Get the segmentation dataframe  ...")
segmentation_inputs = (pdf_pages, pdf_tables, competence, toc_indices)
segmentation_df = pipeline.get_segmentation_df(*segmentation_inputs)
print("Done")

Get the segmentation dataframe  ...
Done


Define a sub-segmentation_df to save time

In [38]:
# Keep only the first rows to save time.
# `.loc` slices by index label and includes the end label.
last_kept_label = 24
segmentation_df = segmentation_df.loc[:last_kept_label]
segmentation_df

Unnamed: 0,indicator,question,keyword_regex,relevant_pages,table_relevant_pages
0,D201.0,Quel le nombre d'habitants desservis par le réseau d'assainissement collectif (D201.0),\bhabitants?\b,"[2, 4, 5, 11]",
1,D201.0,Quelle est la valeur de l'indicateur D201.0,\bD201.0s?\b,[11],"[[11, 0]]"
2,D202.0,Quel est le nombre d'autorisations de déversement d'effluents d'établissements industriels (D202.0),\bautorisations?\b,[11],
3,D202.0,Quelle est la valeur de l'indicateur D202.0,\bD202.0s?\b,[11],"[[11, 0]]"
4,D203.0,Quelle est la quantité de boues évacuées (D203.0),\bboues?\b,"[4, 9, 11]",
5,D203.0,Quelle est la valeur de l'indicateur D203.0,\bD203.0s?\b,"[4, 11]","[[11, 0]]"
6,D204.0,Quel est le prix au m3 du service d'assainissement de l'eau,\bprixs?\b,"[0, 11]",
7,D204.0,Quel est le prix au m3 du service d'assainissement de l'eau (D204.0),\bprixs?\b,"[0, 11]",
8,D204.0,Quelle est la valeur de l'indicateur D204.0,\bD204.0s?\b,[11],"[[11, 0]]"
9,P201.1,Quel est le pourcentage d'abonnés desservis par le réseau (P201.1),\babonnés?\b,"[2, 3, 4, 5, 7, 11]",


In [101]:
# Inspect one extracted table: first item of entry 20 of `pdf_tables`
# (presumably a [page, table] position - TODO confirm)
tables_at_20 = pdf_tables[20]
df = tables_at_20[0]
df

Unnamed: 0,Thème,Type,Code,Libellé,Valeur 2021
0,Abonnés,Descriptif,D201.0,Estimation du nombre d'habitants desservis par...,
1,,,,,364
3,Réseau,Descriptif,D202.0,Nombre d’autorisations de déversement d’efflue...,
4,,,,,1
6,Boue,Descriptif,D203.0,Quantité de boues issues des ouvrages d’épurat...,
7,,,,,390
9,Abonnés,Descriptif,D204.0,Prix TTC du service au m³ pour 120 m³ (€),196
10,Abonnés,Performance,P201.1,Taux de desserte par des réseaux de collecte d...,Non
11,,,,,estimable
12,Réseau,Performance,P202.2B,Indice de connaissance et de gestion patrimoni...,


Extract indicator values from summary tables

In [43]:
print("Extract indicator values from summary tables ...")
# Indicator values read from the extracted tables
indicator_value_dict = pipeline.extract_indicators_from_tables(pdf_tables, segmentation_df, year)
# Known indicators derived from those values
known_indicator_list = pipeline.get_known_indicator_list(indicator_value_dict)
# Default question/answer dictionary built from the known indicators
default_question_answer_dict = pipeline.get_default_question_answer_dict(
    segmentation_df, known_indicator_list
)
print("Done")

Extract indicator values from summary tables ...
Done


In [40]:
# Display the list of known indicators computed in the previous step
known_indicator_list

['D201.0',
 'D202.0',
 'D203.0',
 'D204.0',
 'P201.1',
 'P202.2B',
 'P203.3',
 'P204.3',
 'P205.3',
 'P207.0']

Ask questions to the LLM

In [41]:
# The LLM is queried only when the summary tables did not provide every indicator
all_indicators_from_tables = pipeline.are_all_indicators_extracted_from_tables(
    known_indicator_list, competence
)

if not all_indicators_from_tables:
    print("Some indicators could not be extracted from summary tables")
    print(f"Loading QA model {pipeline.model_name} ...")
    pipeline.load_qa_model()
    print("QA model loaded.")
    print(f"Asking questions using model {pipeline.model_name} ...")
    llm_question_answer_dict = pipeline.ask_questions(
        pdf_pages,
        segmentation_df,
        known_indicator_list,
        competence,
        year,
        collectivity,
        max_new_tokens=10,
    )
    # Combine the LLM answers with the default answers
    question_answer_dict = merge_question_answer_dicts(
        llm_question_answer_dict, default_question_answer_dict
    )
    print("Done")
else:
    print("All indicators are extracted from summary tables")
    question_answer_dict = default_question_answer_dict

Some indicators could not be extracted from summary tables
Loading QA model google/flan-t5-base ...
QA model loaded.
Asking questions using model google/flan-t5-base ...


100%|██████████| 2/2 [05:50<00:00, 175.12s/it]

Done





In [56]:

print("Cleaning answers ...")
# The page texts are passed to the cleaning step only when hallucination removal is requested
if remove_hallucinations:
    textpages = pdf_pages
else:
    textpages = None
cleaned_answers = pipeline.clean_answers_from_dict(
    segmentation_df,
    question_answer_dict,
    indicator_value_dict,
    year,
    textpages=textpages,
)
answer_df, detailed_answer_df = cleaned_answers
print("Done")





Cleaning answers ...
Done


In [144]:
# Save both answer dataframes for the selected PDF
print(f"Saving answers for {pdf_file} ...")
save_inputs = (answer_df, detailed_answer_df, pdf_file, competence, year)
pipeline.save_answers(*save_inputs)
print("Done")

Saving answers for RPQS_Grolejac_AC_2021.pdf ...
Done


In [145]:
# Display all rows of `answer_df`, restricted to the columns from position 4 onwards
answer_df.iloc[:, 4:]

Unnamed: 0,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,D201.0,364.0,[364.0],[364.0],"[Indicator value extracted from tables, 364, I..."
1,D202.0,1.0,[1.0],[1.0],"[Indicator value extracted from tables, 1, Ind..."
2,D203.0,3.9,[3.9],[3.9],"[Indicator value extracted from tables, 3,90, ..."
3,D204.0,1.96,[1.96],[1.96],"[Indicator value extracted from tables, Indica..."
4,P201.1,je ne trouve pas,[],[],"[Indicator value extracted from tables, Non es..."
5,P202.2B,je ne trouve pas,[],[],"[Indicator value extracted from tables, Indica..."
6,P203.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100, I..."
7,P204.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100, I..."
8,P205.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100, I..."
9,P206.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100, I..."
