# Meta Data Extraction
## Commissioning Body and Regional Information from the document body and Regex


In [1]:
from pathlib import Path
import json
import os

# Set paths
ROOT = Path().resolve().parent
PDFS = ROOT / 'data/pdfs'
REFERENCE_DATA = ROOT / 'data/reference'

In [2]:
# Load reference dictionaries
with open(REFERENCE_DATA / 'commissioning_bodies.json','r') as f:
    cb_refs = json.load(f)

# Extract commissioning bodies as list
cb_list = cb_refs['commissioning_body']

In [3]:
import fitz
import re
import pandas as pd

commissioning_bodies = {}

# Check if commissioning body appears in a portion of text
def cb_in_text(text: str, cb_list: list):
    for body in cb_list:
        if body in text:
            return body
    return None

# Check for the phrase 'commissioned by' in the body text
def check_commissioned_by_bool(text: str):
    pattern = r'commissioned by'
    try:
        findmatch = re.findall(pattern, text)
        if findmatch:
            return True
    except:
        return False
    return False


for pdf in os.listdir(PDFS):
    pdf_path = PDFS / pdf
# for pdf in range(1):  # For testing with a single PDF
#     pdf_path = PDFS / '68541cfed3fa824998561bf4.pdf'
    if not pdf_path.suffix == ".pdf":
        continue
    doc = fitz.open(pdf_path)

    # Flag to indicate if commissioning body is found
    cb_found = False

    for page in range(doc.page_count):
        # Load the page
        current_page = doc.load_page(page)

        # Get the list of text blocks
        blocks= current_page.get_text("blocks")

        # Flag to indicate if the phrase "commissioned by" is found
        cb_regex_found = False

        # Search through each block for cb phrase
        for block in blocks:
            text = block[4] if len(block) > 4 else ""
            if not text:
                continue
            if check_commissioned_by_bool(text):
                cb = cb_in_text(text, cb_list)
                if cb:
                    commissioning_bodies[pdf_path.name] = {'commissioning_body': cb}
                    cb_regex_found = True
                    break # Stops scanning more blocks
        if cb_regex_found:
            break # Stops scanning more pages

    doc.close()

print(len(commissioning_bodies))
# Save results to JSON file
cb_df = pd.DataFrame.from_dict(commissioning_bodies, orient='index').reset_index().rename(columns={'index': 'filename'})
cb_df.to_csv(ROOT / 'function_validation/commissioning_body/commissioning_bodies_from_doc_body.csv', index=False)


207
