# Run the cleaning step only
This notebook:  
- takes as input all the detailed answer files (`*_detailed_answers.csv`) stored in `data/output/benchmark_{input_num}/answers`
- cleans the answers
- saves the cleaned answers in `data/output/benchmark_{output_num}/answers`

This notebook is useful when a modification has been made in the cleaning steps of the pipeline but there is no need to re-run the full lengthy pipeline calling the LLM.

In [9]:
import sys
import os

sys.path.append("../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 


from time import time
from narval.pipeline import Pipeline
from narval.utils import FileSystem, get_data_dir

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Define the input parameters

In [10]:
# --- Benchmark folders -------------------------------------------------------
# Each *_benchmark_version value is a folder name under "data/output/",
# e.g. "data/output/benchmark_*/answers/".

# Folder where the raw detailed answers are stored (read side)
input_benchmark_version = "benchmark_30"

# Folder where the cleaned answers will be saved (write side)
output_benchmark_version = "benchmark_31"

# --- Cleaning options --------------------------------------------------------
# Indicator file used in the cleaning step
indicator_file = "indicateurs_v5.csv"

# Whether hallucinations must be removed during the cleaning step
remove_hallucinations = True

### Instantiate the pipeline

In [11]:
# The pipeline is created with the *output* benchmark version; the input
# benchmark version is passed later, directly to the cleaning step.
# NOTE(review): presumably benchmark_version decides where results are written
# (the cell output below reports files saved under the output folder) — confirm in Pipeline.
pipeline = Pipeline(indicator_file=indicator_file, benchmark_version=output_benchmark_version)

### Run only the cleaning step 

In [12]:
# Get the list of detailed_answers files to be cleaned.
# File names are expected to look like "<pdf_name>_detailed_answers.csv", where
# <pdf_name> ends with "_<competence code>_<year>" (e.g. "RAD_Cabasse_AC_2022").
detailed_answers_suffix = "_detailed_answers.csv"
# Competence code found in the file name -> competence label passed to the pipeline
competence_labels = {"AC": "assainissement collectif"}

data_dir = get_data_dir()
# The final "" keeps a trailing path separator, as the previous string-built path had.
directory = os.path.join(data_dir, "data", "output", input_benchmark_version, "answers", "")
# os.listdir returns entries in arbitrary order: sort so that every run
# processes (and logs) the files in the same order.
file_list = sorted(
    file for file in os.listdir(directory) if file.endswith(detailed_answers_suffix)
)
if not file_list:
    # Make an empty or wrong input folder visible instead of silently doing nothing
    print(f"No '*{detailed_answers_suffix}' file found in {directory}")

# Loop on files
for file in file_list:
    # Strip the known suffix instead of splitting on "." so that a PDF name
    # containing a dot is not truncated.
    pdf_name = file[: -len(detailed_answers_suffix)]
    pdf_file = pdf_name + ".pdf"

    # The last two "_"-separated tokens of the PDF name are the competence code and the year
    name_parts = pdf_name.split("_")
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot extract competence and year from the file name '{file}'"
        )
    year = name_parts[-1]  # kept as a string, exactly as found in the file name
    competence = name_parts[-2]
    if competence not in competence_labels:
        raise ValueError(f"{competence} cannot be recognized as a competence")
    competence = competence_labels[competence]

    # Executes the cleaning pipeline
    pipeline.run_cleaning_step(
        input_benchmark_version,
        pdf_file=pdf_file,
        competence=competence,
        year=year,
        remove_hallucinations=remove_hallucinations,
    )


Answers are saved in data/output/benchmark_31/answers/RAD_Cabasse_AC_2022_answers.csv
The answer 'En consultant l'extrait 4.2,' could not be cleaned and has been ignored
Answers are saved in data/output/benchmark_31/answers/RPQS_Abainville_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Ahun_cp23150_rpqsid_674494_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Allain_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Alloue_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Amagne_cp08300_rpqsid_651153_AC_2022_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Artaix_cp71110_rpqsid_303861_AC_2019_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Aubignosc_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Autun_AC_2021_answers.csv
Answers are saved in data/output/benchmark_31/answers/RPQS_Brillac_AC_20