In [122]:
import os
import re
import fitz
import tomli


# Directory scanned for the question .toml files
TOML_DIRECTORY = "../../questions/cleaned/"
# Directory scanned for the source .pdf documents
PDF_DIRECTORY = "../../pdf_data"
# Directory the extracted .txt files are written to and later read back from
TXT_DIRECTORY = "../../txt_data/"

First, extract the text of every PDF into a plain-text file. This step might report some format errors, but that is OK.

In [None]:
def normalize_text(input_text):
    """Clean the raw text of one PDF page into a single-line string.

    Processing order:
      1. Trim the text and drop soft hyphens / zero-width characters
         (together with any whitespace that directly follows them).
      2. Discard a first and/or last line that consists solely of a
         1-3 digit number (treated as a page number).
      3. Join words that were hyphenated across a line break.
      4. Collapse every whitespace run into one space.
      5. Remove spaces in front of a full stop that is followed by whitespace.

    Args:
        input_text: raw page text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    page_number = re.compile(r"\s*\d{1,3}\s*")

    # Invisible characters are removed before the line split
    cleaned = re.sub(
        r"[\u00AD\u200B\u200C\u200D\u200E\u200F]\s*", "", input_text.strip()
    )

    rows = cleaned.splitlines()
    # A leading line holding only a short number is treated as a page number
    if rows and page_number.fullmatch(rows[0]):
        del rows[0]
    # Same for a trailing line (re-checked because the list may now be empty)
    if rows and page_number.fullmatch(rows[-1]):
        rows.pop()

    # Applied strictly in this order: de-hyphenate, squeeze whitespace,
    # then tidy the space in front of a full stop.
    substitutions = (
        (r"-\s*\n\s*", ""),   # "infor-\nmation" -> "information"
        (r"\s+", " "),
        (r" +\.\s", ". "),
    )
    cleaned = "\n".join(rows)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def chunk_pdf_by_tokens(pdf_path):
    """Extract the normalized text of every non-empty page of a PDF.

    Note: despite its name, this function does no token-based chunking;
    it produces exactly one entry per non-empty page.

    Args:
        pdf_path: path of the PDF, passed to ``fitz.open``.

    Returns:
        A list of ``(page_number, page_text)`` tuples. ``page_number`` is
        1-based; ``page_text`` is the output of ``normalize_text`` with one
        trailing space appended, so page texts can be concatenated directly.
        Pages whose extracted text is empty or whitespace-only are skipped.

    Raises:
        Whatever ``fitz.open``, ``page.get_text`` or ``normalize_text`` raise.
        The document is closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        text_and_pagenumber = []  # List [(page_number, page_text)]
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text(sort=True)
            if text.strip():  # Skip empty pages
                text_and_pagenumber.append((page_number, normalize_text(text) + " "))
    finally:
        # Release the file handle even if extraction fails part-way through
        doc.close()
    return text_and_pagenumber

# Convert every PDF in PDF_DIRECTORY into a plain-text file in TXT_DIRECTORY.
# The output directory is created once up front instead of once per PDF.
os.makedirs(TXT_DIRECTORY, exist_ok=True)

# Sorted so the processing order does not depend on the file system
for filename in sorted(os.listdir(PDF_DIRECTORY)):
    if not filename.endswith(".pdf"):
        continue

    # splitext drops the extension without relying on its length
    filename_s = os.path.splitext(filename)[0]
    pdf_path = os.path.join(PDF_DIRECTORY, filename)
    chunks = chunk_pdf_by_tokens(pdf_path)

    file_path_txt = os.path.join(TXT_DIRECTORY, f"{filename_s}.txt")
    with open(file_path_txt, "w", encoding="utf-8") as f:
        # Only the page text is written; the page numbers are dropped
        f.writelines(page_text for _page_number, page_text in chunks)

Then load the questions from the TOML files into a dictionary.

In [140]:
def get_questions(toml_dir):
    """Collect the questions of all ``.toml`` files in a directory.

    Each file is parsed with ``tomli`` and the entries of its top-level
    ``questions`` list (if present) are gathered. Entries without a truthy
    ``id`` are ignored; when an id occurs more than once, the entry read
    last replaces the earlier one.

    Args:
        toml_dir: directory to scan (not recursive).

    Returns:
        A dict mapping question id -> question entry.
    """
    questions_by_id = {}
    for entry in os.listdir(toml_dir):
        if not entry.endswith(".toml"):
            continue

        # tomli only accepts file objects opened in binary mode
        with open(os.path.join(toml_dir, entry), "rb") as handle:
            parsed = tomli.load(handle)

        for item in parsed.get("questions", []):
            key = item.get("id")
            if not key:
                continue
            questions_by_id[key] = item
    return questions_by_id

In [143]:

# Load every question found in the TOML directory, keyed by question id
question_dict = get_questions(TOML_DIRECTORY)


Now check whether each answer can be found in the text files. This is basically a crude substring match, but it's a starting point.

It also checks that the answer is found in the right file, i.e. the text file generated from the PDF that the question refers to.

In [144]:
def check_answers_in_txt(question_dict, txt_directory):
    """Print a report of answers that are not found in their expected text file.

    For each question, the first entry of ``question['files']`` names the
    source document; the ``.txt`` file with the same base name inside
    ``txt_directory`` is searched for the answer using a case-insensitive
    substring match. Whitespace runs in the answer are collapsed to single
    spaces before matching, because the text files produced by
    ``normalize_text`` contain single spaces only.

    Args:
        question_dict: mapping of question id -> question dict. Every
            question must provide ``'answer'`` (str) and ``'files'``, a
            non-empty list whose first item has ``'file'`` and
            ``'page_numbers'``.
        txt_directory: directory containing the extracted ``.txt`` files.

    Returns:
        None. All findings are printed.

    Raises:
        KeyError, IndexError: if a question lacks the fields listed above.
    """
    # Lower-cased file contents keyed by path (None = file does not exist),
    # so a text file shared by many questions is read only once.
    content_cache = {}

    for question_id, question in question_dict.items():
        answer = question['answer'].lower()
        # Collapse whitespace runs (including newlines) to single spaces
        needle = " ".join(answer.split())
        source = question['files'][0]
        filename_toml = source['file']
        multiple_pages = len(source['page_numbers']) > 1
        hyphenated_ans = "-" in answer

        # Build full path to the expected .txt file
        expected_txt_file = os.path.splitext(filename_toml)[0] + '.txt'
        expected_txt_path = os.path.join(txt_directory, expected_txt_file)

        if expected_txt_path not in content_cache:
            if os.path.exists(expected_txt_path):
                with open(expected_txt_path, 'r', encoding='utf-8') as f:
                    content_cache[expected_txt_path] = f.read().lower()
            else:
                content_cache[expected_txt_path] = None
        content = content_cache[expected_txt_path]

        if content is None:
            print(f"Expected file '{expected_txt_file[0:-4]}.pdf' for question {question_id} does not exist in the pdf directory.")
            continue

        if needle in content:
            continue  # Answer found in correct file

        print(f"\nAnswer for question {question_id} not found.\nCheck Filename: '{expected_txt_file[0:-4]}.pdf'\nor Answer: '{answer}'.")
        if multiple_pages:
            print(f"Note: This answer is spread across multiple pages {source['page_numbers']}, so that could cause a miss in detection. But still check it!")
        if hyphenated_ans:
            print("Note: This answer contains a hyphen '-', which might affect detection. Please check manually!")

# Load the questions (repeats the load done in the earlier cell) and print
# every answer that cannot be found in its expected text file
question_dict = get_questions(TOML_DIRECTORY)
check_answers_in_txt(question_dict, TXT_DIRECTORY)


Answer for question RN023 not found.
Check Filename: '2023_24_UbU6_20231211104325_Publicering.pdf'
or Answer: 'att bland annat föreslå hur statens roll bör se ut när det gäller läromedel i svensk skola.'.

Answer for question RN026 not found.
Check Filename: '2023_24_UbU6_20231211104325_Publicering.pdf'
or Answer: 'förskoleklassen, grundskolan, anpassade grundskolan, specialskolan,sameskolan, gymnasieskolan och anpassade gymnasieskolan.'.

Answer for question RN027 not found.
Check Filename: '2023_24_UbU6_20231211104325_Publicering.pdf'
or Answer: 'att stärka elevers kunskapsutveckling.'.

Answer for question RN028 not found.
Check Filename: '2023_24_UbU6_20231211104325_Publicering.pdf'
or Answer: 'stärka tillgången till läromedel inom den kommunala vuxenutbildningen.'.

Answer for question RN029 not found.
Check Filename: '2023_24_UbU6_20231211104325_Publicering.pdf'
or Answer: 'att regeringen tillsätter en utredning om det finns tryckta läromedel som följer kursplanerna.'.

Answer f