In [None]:
import os
import re
import fitz
import tomli


# Directory scanned for the question .toml files.
TOML_DIRECTORY = "../../questions/cleaned/"
# Directory scanned for the source .pdf files.
PDF_DIRECTORY = "../../pdf_data"
# Directory that receives (and is later read for) the extracted .txt files.
TXT_DIRECTORY = "../../txt_data/"

First, convert all PDFs to text files. This might report some format errors, but that's OK.

In [None]:
def normalize_text(input_text):
    """Flatten raw page text into a single whitespace-normalized line.

    The input is stripped, then three regex substitutions are applied in order:

    1. A hyphen (optionally followed by one space) right before a newline is
       removed together with the newline, re-joining words split across lines.
    2. Every run of whitespace, newlines included, becomes one space.
    3. Spaces in front of a period that is itself followed by whitespace are
       dropped, so "word . Next" becomes "word. Next".

    Args:
        input_text: Raw text to normalize.

    Returns:
        The normalized string.
    """
    # Order matters: hyphenated line breaks must be repaired before the
    # newlines are collapsed into spaces by the second rule.
    substitutions = (
        (r"- ?\n", ""),
        (r"\s+", " "),
        (r" +\.\s", ". "),
    )

    result = input_text.strip()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def chunk_pdf_by_tokens(pdf_path):
    """Extract the normalized text of every non-empty page of a PDF.

    Despite the name, no token-based chunking happens here: the result holds
    one entry per page.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        A list of ``(page_number, page_text)`` tuples in document order.
        ``page_number`` is 1-based. ``page_text`` is the output of
        ``normalize_text`` followed by one trailing space, so that pages can
        be concatenated without gluing words together. Pages whose extracted
        text is empty or whitespace-only are omitted.
    """
    doc = fitz.open(pdf_path)
    try:
        text_and_pagenumber = []  # List [(page_number, page_text)]
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text(sort=True)
            if not text.strip():  # Skip empty pages
                continue
            text_and_pagenumber.append((page_number, normalize_text(text) + " "))
    finally:
        # Release the document even if extraction or normalization fails.
        doc.close()
    return text_and_pagenumber

# Create the output directory once, before the loop. Doing it up front also
# guarantees the directory exists when PDF_DIRECTORY contains no PDFs, so a
# later os.listdir(TXT_DIRECTORY) does not fail with FileNotFoundError.
os.makedirs(TXT_DIRECTORY, exist_ok=True)

# Write one .txt file per PDF, holding the concatenated text of all its pages.
for filename in os.listdir(PDF_DIRECTORY):
    if not filename.endswith(".pdf"):
        continue

    filename_s = filename[:-4]  # Remove '.pdf'
    pdf_path = os.path.join(PDF_DIRECTORY, filename)
    chunks = chunk_pdf_by_tokens(pdf_path)

    file_path_txt = os.path.join(TXT_DIRECTORY, f"{filename_s}.txt")
    with open(file_path_txt, "w", encoding="utf-8") as f:
        # Each chunk is (page_number, page_text); only the text is stored.
        f.writelines(page_text for _page_number, page_text in chunks)

Then load the questions from the TOML files into a dictionary.

In [None]:
def get_questions(toml_dir):
    """Collect the questions from every ``.toml`` file in a directory.

    Each file is parsed with ``tomli`` and its ``questions`` entry (treated as
    empty when absent) is scanned. Questions whose ``id`` is missing or falsy
    are skipped. When the same id occurs more than once, the entry read last
    replaces the earlier one.

    Args:
        toml_dir: Directory to scan; only names ending in ``.toml`` are read.

    Returns:
        A dict mapping question id to the parsed question entry.
    """
    questions_by_id = {}
    for entry in os.listdir(toml_dir):
        if not entry.endswith(".toml"):
            continue

        # tomli only accepts a binary file object.
        with open(os.path.join(toml_dir, entry), "rb") as handle:
            parsed = tomli.load(handle)

        for item in parsed.get("questions", []):
            identifier = item.get("id")
            if not identifier:
                continue
            questions_by_id[identifier] = item
    return questions_by_id

In [None]:
# Map each question id to its parsed question entry from the TOML files.
question_dict = get_questions(TOML_DIRECTORY)

Check that the file name referenced by each question in the TOML files exists in your PDF directory:

In [None]:


# Report every question whose first referenced PDF is absent from PDF_DIRECTORY.
# Only the first entry of each question's 'files' list is checked.

# Read the directory listing once instead of once per question.
available_pdfs = set(os.listdir(PDF_DIRECTORY))

missing_count = 0
for question_id, question in question_dict.items():
    pdf_name = question['files'][0]['file']
    if pdf_name not in available_pdfs:
        missing_count += 1
        print(f"PDF file for question {question_id} does not exist. Please check if {pdf_name} is the right file")

# A for/else would run its else branch on every loop without a `break`, i.e.
# even after missing files were reported, so the success message is gated on
# an explicit counter instead.
if missing_count == 0:
    print("All PDF files for questions exist in the directory.")


Now check whether each answer is found in the text files. This is basically a crude substring match, but it's a starting point.

This is of course not exact.

For example, an answer that spans a page break with a footer in between may be reported as missing, even though it is there.

Hyphenated linebreaks are not accounted for and so on.

So treat this as a first step for double-checking.

In [None]:
def check_answers_in_txt(question_dict, txt_directory):
    """Print a notice for every question whose answer appears in no text file.

    All ``.txt`` files in ``txt_directory`` are read into memory. Each
    question's ``answer`` is then searched as a plain substring in every file's
    content, not only in the file that belongs to the question. Questions
    whose answer is found somewhere produce no output.

    Args:
        question_dict: Mapping of question id to a question entry providing
            ``answer`` and ``files[0]['file']``.
        txt_directory: Directory containing the extracted ``.txt`` files.

    Returns:
        None. Results are reported through ``print`` only.
    """
    # Read every .txt file of the directory up front.
    documents = []
    for entry in os.listdir(txt_directory):
        if not entry.endswith('.txt'):
            continue
        with open(os.path.join(txt_directory, entry), 'r', encoding='utf-8') as handle:
            documents.append(handle.read())

    for question_id, question in question_dict.items():
        answer = question['answer']

        # Stop at the first document that contains the answer verbatim.
        found = False
        for document in documents:
            if answer in document:
                found = True
                break

        if not found:
            print(f"\nAnswer for question {question_id} does not exist. Please check file {question['files'][0]['file']} if '{answer}' is the right answer")

# Usage: reload the questions from the TOML files, then print a notice for
# every question whose answer is not found in any extracted text file.
question_dict = get_questions(TOML_DIRECTORY)
check_answers_in_txt(question_dict, TXT_DIRECTORY)