# Notebook to run the full pipeline

This notebook takes as input a collection of PDFs and returns for each of them the extracted indicator values as a `csv` file.  
The results are stored in the folder `data/output/choose-the-subfolder-name/answers/`

### Import modules

In [None]:
import sys

sys.path.append("../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 

from codecarbon import EmissionsTracker
from time import time
from narval.pipeline import Pipeline
from narval.utils import FileSystem, get_data_dir

  machar = _get_machar(dtype)
  from .autonotebook import tqdm as notebook_tqdm


### Choose the input parameters

In [None]:
# Run configuration: every value below is read by a later cell of this notebook.

# Name of the subfolder where answers will be saved (in `data/output/benchmark_*/answers/`)
benchmark_version = "benchmark_123" 
# Name of the indicator file in `data/input`
indicator_file = "indicateurs_v6.csv"
# Name of the question file in `data/input`
question_file = "question_keyword_v7.csv"
# Text and table extraction parameters
extract_tables = True
text_extraction_method = "PyPDF2"
table_extraction_method = "PDFPlumber"
# ... table_answer_filter determines whether (True) or not (False) the QA model will be called
# ... for an indicator whose value has been extracted from tables but does not contain any digits (e.g. "Non renseigné")
table_answer_filter = False   
# NOTE(review): the meaning of only_table_search_in_rad is not documented in this notebook;
# it is passed unchanged to `Pipeline` — TODO confirm its semantics in `narval.pipeline`
only_table_search_in_rad = True
# Model type (Meta-Llama-3-8B-Instruct gives the best results to date)
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Prompt version (the Llama v7 prompts give the best results to date)
# NOTE(review): presumably (system prompt name, user prompt name) — confirm against `Pipeline`
prompt_version = ("Llama_prompt_system_v7", "Llama_prompt_user_v7")
# Choose whether or not hallucinations must be removed in the cleaning step
remove_hallucinations = True
# Name of the file in `data/input` containing the list of PDFs to be read and questioned
rpqs_eval_list_file = "rpqs_eval_list_1+2.csv"

### Import the dataframe containing the list of PDFs to be read and questioned

In [3]:
# Storage backend handle (local file system or S3 bucket)
fs = FileSystem()
# Root directory that holds the `data` folder
data_dir = get_data_dir()
# Load the list of PDFs to be read and questioned, restricted to the columns listed below
eval_list_path = f"{data_dir}/data/input/{rpqs_eval_list_file}"
eval_columns = ["pdf_name", "collectivity", "year", "competence"]
eval_df = fs.read_csv_to_df(eval_list_path, sep=";", usecols=eval_columns)
# Preview the first rows of the loaded dataframe
eval_df.head()

Unnamed: 0,pdf_name,collectivity,year,competence
0,RPQS_Allain_AC_2021,Allain,2021,assainissement collectif
1,RPQS_Alloue_AC_2021,Alloue,2021,assainissement collectif
2,RPQS_Brillac_AC_2021,Brillac,2021,assainissement collectif
3,RPQS_Rully_AC_2021,Rully,2021,assainissement collectif
4,RPQS_Jons_AC_2022,Jons,2022,assainissement collectif


### Instantiate the pipeline

In [None]:
# Collect the parameters chosen in the configuration cell, then build the pipeline from them
pipeline_settings = {
    "question_file": question_file,
    "indicator_file": indicator_file,
    "extract_tables": extract_tables,
    "text_extraction_method": text_extraction_method,
    "table_extraction_method": table_extraction_method,
    "table_answer_filter": table_answer_filter,
    "only_table_search_in_rad": only_table_search_in_rad,
    "benchmark_version": benchmark_version,
    "model_name": model_name,
    "prompt_version": prompt_version,
}
pipeline = Pipeline(**pipeline_settings)

### Log in to HuggingFace Hub if not yet done 
This step is needed for Llama 3 models. Make sure you have first created a Hugging Face access token on your Hugging Face profile and saved it in the environment variable `HF_TOKEN`.

In [None]:
# Optional one-off login to the HuggingFace Hub.
# - The login needs to be performed only once.
# - There is no need to enable it if you have already logged in to HuggingFace Hub previously.
# - Enable it if the pipeline below generates an `AttributeError` inviting you to log in to the HuggingFace Hub.
#
# Set the flag to True to perform the login. It is off by default so that
# "Run All" neither contacts the Hub nor requires the `HF_TOKEN` environment variable.
hf_login_required = False

if hf_login_required:
    # Imported here so that the notebook does not depend on them unless the login is requested
    import os
    from huggingface_hub import login

    # Raises KeyError if the token has not been saved as an environment variable
    hf_token = os.environ["HF_TOKEN"]
    login(token=hf_token)

### Instantiate the CodeCarbon tracker

In [None]:
# CodeCarbon tracker: do not write the results to a file, and only log errors
tracker = EmissionsTracker(save_to_file=False, log_level="error")

### Run the pipeline

In [None]:
# Run the full pipeline on every PDF listed in `eval_df`, while measuring the
# wall-clock time and the carbon footprint of the whole run.

# Start the CodeCarbon tracker
tracker.start()

t0 = time()
try:
    for _, row in eval_df.iterrows():
        # Visual separator between two PDFs in the cell output
        print("\n" + "#" * 20)
        pdf_file = row["pdf_name"] + ".pdf"
        collectivity = row["collectivity"]
        year = row["year"]
        competence = row["competence"]

        # Executes the full pipeline
        pipeline.run(
            pdf_file=pdf_file,
            competence=competence,
            year=year,
            collectivity=collectivity,
            remove_hallucinations=remove_hallucinations,
            max_new_tokens=10,
        )

finally:
    # Stop the CodeCarbon tracker even if the pipeline failed on one of the PDFs
    emissions = tracker.stop()

t1 = time()
print("\n" + "#" * 20)
print(f"Computation time = {round(t1 - t0, 1)} s")
# `tracker.stop()` may return None when CodeCarbon could not produce a measurement;
# multiplying None would raise a TypeError after an otherwise successful run.
if emissions is None:
    print("Carbon footprint : not available (CodeCarbon returned no measurement)")
else:
    # Emissions are reported in kg CO2eq; convert to grams for display
    print(f"Carbon footprint : {round(emissions * 1_000, 1)} gCO2eq")


### Show the detailed CodeCarbon data

In [None]:
import json

# Serialise the tracker's final emissions record to JSON and parse it back,
# so the detailed CodeCarbon data is displayed as a plain Python object
emissions_json = tracker.final_emissions_data.toJSON()
emissions_report = json.loads(emissions_json)
emissions_report