# A notebook to test `answermanager.py` after reading detailed_answers files

In [1]:
import pandas as pd
import os
import sys
import ast
from pathlib import Path
from time import time 
from tqdm import tqdm

sys.path.append("../../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 

from narval.utils import get_data_dir, FileSystem
from narval.pdfreader import PDFReader
from narval.answermanager import AnswerManager


  machar = _get_machar(dtype)


## Test the cleaning function `clean_answer`

In [2]:
import re 

def clean_answer(answer):
    # For the test answer "je crois  2.20. que 252.3 va D123 alors textee# P245.3 ok! 17. B415.3 €128."
    # this should return [252.3, 17.0, 128.0]
    # Replace spaces between digits with no space (eg 58 425.25 -> 58425.25)
    # Removes spaces for millions eg "1 253 457" -> "1253457"
    answer = re.sub(
        r"(?<!\d)\b(\d{1,3}) (\d{3}) (\d{3})\b(?!\d)", r"\1\2\3", answer
    )
    # Remove spaces for eg "abc 12 345.7 xyz" -> "12345.7" but not "123 45"
    answer = re.sub(r"(?<!\d)\b(\d{1,}) (\d{3,})([.,]?\d{0,3})\b(?!\d)", r"\1\2\3", answer)
    # Remove spaces in fractions (eg "57 / 120" -> 57/120)
    answer = re.sub(r"\s?/\s?", r"/", answer)
    # Keep only the number of points for indicators in points (P202.2B, P255.3, ...)
    # Eg "70 points sur un total de 120" -> 70
    matches = re.findall(
        r"\D*(\d{1,3})(?: points)? sur\D* (?:100|120)\D*", answer
    )
    if len(matches) != 0:
        answer = " ".join(list(matches))
    # Remove all "." that are followed by a space and a capital letter
    answer = re.sub(r"\.(?=\s[A-Z])", "", answer)
    # Remove "." at the end of a sentence
    answer = re.sub(r"\.$", "", answer)
    # Split the string by spaces
    item_list = answer.split(" ")
    # Keep only items that contain at least one digit
    item_list = [item for item in item_list if re.search(r"\d", item)]
    # Keep only numerators from fractions over 100, 120 or 1000 (eg "57/120" -> 57 or "57.0/100" -> 57)
    # Relevant for indicators counted as points over 100 or 120
    item_list = [
        re.sub(r"^(\d+)([.,]0)?/(100|120)$", r"\1", item) for item in item_list
    ]
    # Same for indicators expressed as unitless numbers over 100 (eg km) or 1000 (eg abonnés)
    # eg 70.5u/100km -> 70.5
    item_list = [
        re.sub(r"^(\d+)([.,]\d*)?(\D{0,2})/(100|1000)\D*$", r"\1\2", item) for item in item_list
    ]
    # For other fractions, separate the numerator and denominator 'eg "22.4/75.2" -> ["22.4", "75.2"]
    item_list = [
        re.sub(r"(^\d+[.,]?\d+)/(\d+[.,]?\d+)$", r"\1##&\2", item)
        for item in item_list
    ]
    item_list = [item.split("##&") for item in item_list]
    # Flatten the list
    item_list = sum(item_list, [])
    # Remove punctuation characters except "," and "."
    item_list = [
        re.sub(r'[!"#$%&\'()*+:;<=>?@\[\\\]^_`{|}~]', "", item)
        for item in item_list
    ]
    # Remove items that have more than one period
    item_list = [item for item in item_list if item.count(".") <= 1]
    item_list = [item for item in item_list if item.count(",") <= 1]
    # Remove items that start with "D" or "P" (corresponding to indicator codes)
    item_list = [item for item in item_list if not item.startswith(("D", "P"))]
    # Remove special items m2 and m3
    item_list = [re.sub(r"(\w*)m[23](?:/\w+)?\b", r"\1", item) for item in item_list]
    # Extract digits and decimal point
    item_list = [re.sub(r"[^\d.,]", "", item) for item in item_list]
    # Replace "," by "."
    item_list = [re.sub(r"(?<=\d),(?=\d|$)", ".", item) for item in item_list]
    # Remove empty strings
    item_list = [
        item for item in item_list if item.strip() not in ["", ",", "."]
    ]
    # Convert to float
    item_list = [float(item) for item in item_list]

    return item_list


In [None]:
test = "je crois  2.20. que 252.3 va D123 alors textee# P245.3 ok! 17. B415.3 €128."
result = clean_answer(test)
print([float(x) for x in result] == [252.3, 17, 415.3, 128])

test = "je crois  2.20. Que 252.3 va D123 alors textee# P245.3 ok! 17. B415.3 €128."
result = clean_answer(test)
print([float(x) for x in result] == [2.2, 252.3, 17, 415.3, 128])

test = "Selon les extraits, l'endettement du service pour l'assainissement de l'eau en 2021 est de 59 478,41 EUR (encours de la dette au 31 decembre 2021)."
result = clean_answer(test)
print([float(x) for x in result] == [2021, 59478.41, 31, 2021])

test = "Selon les extraits, la valeur de la solidarite aux usagers pour l'assainissement de l'eau en 2021 est de 0,00 EUR/m3"
result = clean_answer(test)
print([float(x) for x in result] == [2021, 0])

test = """Selon les extraits, l'indicateur D204.0 est le "Prix TTC (EUR) du service au m3 pour 120 m3" en 2021, et sa valeur est de 1,73 EUR et 17 m3/km/j"""
result = clean_answer(test)
print([float(x) for x in result] == [120, 2021, 1.73, 17])

test = "Je ne trouve pas. L'indicateur demande est l'Indice de connaissance et de gestion patrimoniale des reseaux (P202.2B), mais il n'y a pas de valeur pour l'annee "
result = clean_answer(test)
print([float(x) for x in result] == [])

test = "Selon les extraits, la valeur de l'indicateur P201.1 en 2021 est de 100,00%."
result = clean_answer(test)
print([float(x) for x in result] == [2021, 100])

test = 'blabla 1 800,00 0,00 D204.0 Prix T'
result = clean_answer(test)
print(result)
print([float(x) for x in result] == [])

test = "La réponse est   70/100 ou 18,0/120 and 22.4 /63.4 ou 17 / 85.8 ou 70.5/ 100 ou 80.0/120"
result = clean_answer(test)
print([float(x) for x in result] == [70, 18, 22.4, 63.4, 17, 85.8, 70.5, 80])

test = "La réponse est 70 points sur un total de 120 points"
result = clean_answer(test)
print([float(x) for x in result] == [70])

test = "5 points sur 100"
result = clean_answer(test)
print([float(x) for x in result] == [5])

test = "J'hésite entre 5 sur 100, 70 points sur 120 et peut-être même 85 points sur un total de 100"
result = clean_answer(test)
print([float(x) for x in result] == [5, 70, 85])

test = "La réponse est 5 points sur 100"
result = clean_answer(test)
print([float(x) for x in result] == [5])

test = "Je dirais 17m3"
result = clean_answer(test)
print([float(x) for x in result] == [17])

test = "En EUR/m3, le prix au "
result = clean_answer(test)
print([float(x) for x in result] == [])

test = '0,00u/1000abonnés et 0,00u/100km'
result = clean_answer(test)
print([float(x) for x in result] == [0, 0])

test = '2 032 2 175'
result = clean_answer(test)
print([float(x) for x in result] == [2032, 2175])

test = '48 526.50'
result = clean_answer(test)
print([float(x) for x in result] == [48526.5])


True
True
True
True
True
True
True
[1800.0, 0.0]
False
True
True
True
True
True
True
True
True
True
True
True


## Test the cleaning pipeline 

Choose the file

In [2]:
benchmark_version = "test_pipeline" 
year = "2021"
collectivity = "Allain"
pdf_file = f"RPQS_{collectivity}_AC_{year}.pdf"

In [3]:
fs = FileSystem()
data_dir = get_data_dir()
answer_path = "/data/output/" + benchmark_version + "/answers/"
detailed_answer_file = pdf_file.split(".")[0]+"_detailed_answers.csv"

Initialize the Answer Manager

In [4]:
indic_bound_file_path = "../../data/input/indicateurs_v3.csv"
answer_manager = AnswerManager(indic_bound_file_path)

Import the detailed_answer dataframe

In [14]:
answer_manager.get_detailed_answer_df_from_file(data_dir+answer_path+detailed_answer_file)

answer_manager.detailed_answer_df.head()

ok


Unnamed: 0,indicator,question,keyword_regex,relevant_pages,table_relevant_pages,answer_list_from_language_model,answer_list_from_tables,answer_list
0,D201.0,Quel le nombre d'habitants desservis par le ré...,\bhabitants?\b,"[4, 5, 8, 21]",,[Indicator value extracted from tables],,[Indicator value extracted from tables]
1,D201.0,Quelle est la valeur de l'indicateur D201.0,\bD201.0s?\b,"[4, 21]","[[21, 0]]",[Indicator value extracted from tables],[476],"[476, Indicator value extracted from tables]"
2,D202.0,Quel est le nombre d'autorisations de déversem...,\bautorisations?\b,"[6, 8, 21]",,[Indicator value extracted from tables],,[Indicator value extracted from tables]
3,D202.0,Quelle est la valeur de l'indicateur D202.0,\bD202.0s?\b,[21],"[[21, 0]]",[Indicator value extracted from tables],[0],"[0, Indicator value extracted from tables]"
4,D203.0,Quelle est la quantité de boues évacuées (D203.0),\bboues?\b,"[3, 9, 13, 17, 18, 21]",,[Indicator value extracted from tables],,[Indicator value extracted from tables]


Apply the cleaning pipeline

In [75]:
pd.options.display.max_columns = None
pd.set_option("max_colwidth", None)
pd.options.display.max_rows = None

In [12]:
#answer_manager.build_answer_df()
#answer_manager.clean_answers()
#answer_manager.remove_forbidden_numbers(forbidden_number_list=[2021])
#answer_manager.exclude_oob_answers()
#answer_manager.select_one_answer_per_indic()

# Extract PDF pages needed for removing hallucinations
file_path = "../../data/input/pdfs/"
pdf_reader = PDFReader(file_path + pdf_file)
pdf_pages = pdf_reader.textpages

# Without removing hallucinations
answer_manager.apply_full_cleaning_pipeline(forbidden_number_list=[float(year)])
answer_manager.answer_df


Unnamed: 0,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,D201.0,476.0,[476.0],[476.0],"[Indicator value extracted from tables, 476, I..."
1,D202.0,0.0,[0.0],[0.0],"[Indicator value extracted from tables, 0, Ind..."
2,D203.0,je ne trouve pas,[],[],"[Indicator value extracted from tables, ____, ..."
3,D204.0,1.05,[1.05],[1.05],"[Indicator value extracted from tables, Indica..."
4,P201.1,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
5,P202.2B,85.0,[85.0],[85.0],"[Indicator value extracted from tables, Indica..."
6,P203.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
7,P204.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
8,P205.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
9,P206.3,je ne trouve pas,[],[],"[Indicator value extracted from tables, ____%,..."


In [13]:
# By removing hallucinations
answer_manager.apply_full_cleaning_pipeline(textpages=pdf_pages, forbidden_number_list=[float(year)])
answer_manager.answer_df

Unnamed: 0,indicator,final_answer,filtered_answer_list,clean_answer_list,concat_answer_list
0,D201.0,476.0,[476.0],[476.0],"[Indicator value extracted from tables, 476, I..."
1,D202.0,0.0,[0.0],[0.0],"[Indicator value extracted from tables, 0, Ind..."
2,D203.0,je ne trouve pas,[],[],"[Indicator value extracted from tables, ____, ..."
3,D204.0,1.05,[1.05],[1.05],"[Indicator value extracted from tables, Indica..."
4,P201.1,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
5,P202.2B,85.0,[85.0],[85.0],"[Indicator value extracted from tables, Indica..."
6,P203.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
7,P204.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
8,P205.3,100.0,[100.0],[100.0],"[Indicator value extracted from tables, 100%, ..."
9,P206.3,je ne trouve pas,[],[],"[Indicator value extracted from tables, ____%,..."


## Test the hallucination detector

In [11]:
import re 
from unidecode import unidecode


def is_hallucination(answer, textpage):
    assert isinstance(answer, float)

    # 0 and 100 answers corresponding to "aucune autorisation" and "conforme"
    # should not be considered as hallucinations
    if answer == 0 or answer == 100:
        return False
    # Removes spaces for millions eg "1 253 457" -> "1253457"
    textpage = re.sub(
        r"(?<!\d)\s*(\d{1,3}) (\d{3}) (\d{3})\b(?!\d)", r"\1\2\3", textpage
    )
    # Remove spaces for eg "12 345" -> "12345" but not "123 45"
    textpage = re.sub(r"(?<!\d)\s*(\d{1,2}) (\d{3,})\b(?!\d)", r"\1\2", textpage)
    # Remove spaces for eg "abc 1 024 2 148 xyz" -> "1024 2148"
    textpage = re.sub(
        r"(?<!\d)\s*(\d{1,2}) (\d{3,}) +(\d{1,2}) (\d{3,})\b(?!\d)",
        r"\1\2 \3\4",
        textpage,
    )
    # Format textpage
    textpage = unidecode(textpage)
    # Remove special items m2 or m3
    textpage = re.sub(r"(\w*)m[23](?:/\w+)?\b", r"\1", textpage)
    # Remove dates
    textpage = re.sub(r"\b\d{2}[/-]\d{2}[/-]\d{4}\b", "", textpage)
    # Generate possible patterns for the answer
    patterns = []
    patterns.append(str(answer))
    if "." in str(answer):
        patterns.append(str(answer).replace(".", ","))
        # Remove trailing zeros eg 12.0 -> 12
        patterns.append(str(answer).rstrip("0").rstrip("."))
    
    patterns = list(set(patterns))
    # Build the regex
    patterns = [
        r"(?<!\d|,|\.)" + re.escape(p) + r"(?=([.,]0{1,2})\b|[^.,\d]|[.]?$|[.]?\s)"
        if "." not in p and "," not in p
        else r"(?<!\d|,|\.)" + re.escape(p) + r"(?=[^.,123456789]|[.]?$|[.]?\s)"
        for p in patterns 
    ]
    regex = "|".join(patterns)


    return not bool(re.search(regex, textpage))

In [12]:
answer = 123.0
textpage = "Je crois que €123.4 est"
print(is_hallucination(answer, textpage)==True)

answer = 123.0
textpage = "Je crois que €123 est"
print(is_hallucination(answer, textpage)==False)

answer = 72.45
textpage = "La réponse est 72.45."
print(is_hallucination(answer, textpage)==False)

answer = 72.45
textpage = "La réponse est 72,45."
print(is_hallucination(answer, textpage)==False)

answer = 17.0
textpage = "La réponse est 17%"
print(is_hallucination(answer, textpage)==False)

answer = 1.501
textpage = "Prix au m3 (total /120 m3)   1,5010 € 1,5010 € /  Il"
print(is_hallucination(answer, textpage)==False)

answer = 96.8
textpage = "𝑁𝑜𝑚𝑏𝑟𝑒\t𝑑!𝑎𝑏𝑜𝑛𝑛é𝑠\t𝑑𝑒𝑠𝑠𝑒𝑟𝑣𝑖𝑠𝑁𝑜𝑚𝑏𝑟𝑒\t𝑑!𝑎𝑏𝑜𝑛𝑛é𝑠\t𝑝𝑜𝑡𝑒𝑛𝑡𝑖𝑒𝑙𝑠´\t100=\t𝟗𝟔,𝟖\t%\t(97,4\t%\ten\t2021)  "
print(is_hallucination(answer, textpage)==False)

answer = 85.0
textpage = "programme détaillé assorti d’un estimatif \nchiffré portant sur au moins trois ans  10 0 \n  TOTAL  120 85 "
print(is_hallucination(answer, textpage)==False)

answer = 1955230.0
textpage = "blabla 1 955 230 truc"
print(is_hallucination(answer, textpage)==False)

answer = 20512.0
textpage = "blabla 20 512"
print(is_hallucination(answer, textpage)==False)

answer = 253.3
textpage = "Boues évacuées (Tonnes de MS) 310 539,3 467,1 347,6 370,1 220,9 362 253,3\nTaux"
print(is_hallucination(answer, textpage)==False)

answer = 0.0   # 0 is never considered as an hallucination
textpage = "Boues évacuées (Tonnes de MS) 310 539,3 467,1 347,6 370,1 220,9 362 253,3\nTaux"
print(is_hallucination(answer, textpage)==False)

answer = 2032.0
textpage = "2 032 2 075"
print(is_hallucination(answer, textpage)==False)

answer = 4383.0
textpage = "4 383"
print(is_hallucination(answer, textpage)==False)

answer = 10.0
textpage = "100"
print(is_hallucination(answer, textpage)==True)

answer = 3.0
textpage = "bgfdgd €3.38 est"
print(is_hallucination(answer, textpage)==True)

answer = 10.0
textpage = "\nDate  DCO  \n03/10/2019 423 \n \n \nRésultats d’analyses ponctuelles"
print(is_hallucination(answer, textpage)==True)

answer = 3.0
textpage = " eaux uséesP255.3 90 / 120 90 / 120Taux de d"
print(is_hallucination(answer, textpage)==True)

answer = 3.0
textpage = "2€/m3 and 12m3"
print(is_hallucination(answer, textpage)==True)

answer = 253.3
textpage = "\nD203.0 253,3"
print(is_hallucination(answer, textpage)==False)

True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
False
