In [1]:
import pandas as pd
import numpy as np
import os
import fitz  # PyMuPDF
from os.path import join
from ast import literal_eval
from dla.src.dla_pipeline_support_functions import reset_directory, list_files_with_extensions, get_filename_without_extension, find_files_recursively
import json
import shutil

pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("display.width", 999)

In [2]:
APP_DIRECTORY = "/mnt/HDD_1/w210/clean_data_is_all_you_need/app"

DATA_DIRECTORY = join(APP_DIRECTORY, "data")
S1_INPUT_PDFS_DIR = join(DATA_DIRECTORY, "s1_input_pdfs")
S2_DLA_INPUTS_DIR = join(DATA_DIRECTORY, "s2_dla_inputs")
S3_OUTPUTS_DIR = join(DATA_DIRECTORY, "s3_outputs")
S4_JSON_TEXT_OUTPUTS_DIR = join(DATA_DIRECTORY, "s4_json_text_output")
PAGE_MASK_DIR = join(S3_OUTPUTS_DIR, "page_masks")

reset_directory(S4_JSON_TEXT_OUTPUTS_DIR,erase_contents=True, verbose=True)

for p in [S1_INPUT_PDFS_DIR,S2_DLA_INPUTS_DIR,S3_OUTPUTS_DIR,S4_JSON_TEXT_OUTPUTS_DIR, PAGE_MASK_DIR]:
    assert os.path.exists(p), f"Directory {p}, does not exist"

Directory: /mnt/HDD_1/w210/clean_data_is_all_you_need/app/data/s4_json_text_output, was found.
All contents in "/mnt/HDD_1/w210/clean_data_is_all_you_need/app/data/s4_json_text_output" have been deleted successfully.


In [3]:
# Debug, to ensure that the the contents on the individual csvs match the registry one
def valadiate_results_csvs(page_mask_directory):
    try:
        agg_csvs = pd.DataFrame()

        for file in list_files_with_extensions(page_mask_directory, ["csv"]):
            file_name = get_filename_without_extension(file)

            if file_name == "mask_registry":
                mask_registry = pd.read_csv(file)

            else:
                agg_csvs = pd.concat([agg_csvs, pd.read_csv(file)], axis=0)

        agg_csvs.drop("Unnamed: 0", axis=1, inplace=True)
        mask_registry.drop("Unnamed: 0", axis=1, inplace=True)

        agg_csvs.sort_values(["document", "page_no", "mask_id"], inplace=True)
        mask_registry.sort_values(["document", "page_no", "mask_id"], inplace=True)

        # Check that they are the same
        assert len(mask_registry) == len(
            agg_csvs
        ), "The aggregated content of all the individual page csvs and the mask_registry, do not have the same length"
        assert np.array_equal(
            agg_csvs, mask_registry
        ), "The aggregated content of all the individual page csvs and the mask_registry, do not match"

    except Exception as e:
        raise Exception(f"Match error: {e}")


def load_mask_registry(page_mask_directory, validate_csvs=False):
    valadiate_results_csvs(page_mask_directory)
    mask_registry = pd.read_csv(join(page_mask_directory, "mask_registry.csv"))

    mask_registry.sort_values(["document", "page_no", "mask_id"], inplace=True)

    mask_registry = mask_registry[
        [
            "document",
            "page_no",
            "mask_id",
            "category",
            "category_lbl",
            "score",
            "x0",
            "x1",
            "y0",
            "y1",
            "xcf",
            "ycf",
            "column",
            "mask_shape",
            "is_primary",
            "mask_img_file_names",
            "mask_file_names",
        ]
    ]

    mask_registry["mask_shape"] = mask_registry["mask_shape"].apply(
        lambda var: literal_eval(var)
    )

    return mask_registry

In [4]:
def extract_text_from_scaled_pdf(pdf_path, page_number, coords, new_dimensions):
    """Extract text from specified coordinates in a scaled PDF."""
    doc = fitz.open(pdf_path)
    scaled_doc = fitz.open()  # Create a new empty PDF for scaled pages

    # Scale the specific page
    page = doc.load_page(page_number)
    new_page = scaled_doc.new_page(width=int(new_dimensions[1]), height=int(new_dimensions[0]))
    new_page.show_pdf_page(new_page.rect, doc, page.number)

    # Extract text from the scaled page
    scaled_page = scaled_doc.load_page(0)  # As we have only one page in scaled_doc
    extracted_text = scaled_page.get_text("text", clip=fitz.Rect(coords))
    #extracted_text = ' '.join(extracted_text.split())

    # Clean up
    doc.close()
    scaled_doc.close()
    return extracted_text

def process_pdfs_local():
    
    # Look for results
    mask_registry = load_mask_registry(PAGE_MASK_DIR, validate_csvs=True)
    pdf_list = np.unique(mask_registry['document'].values)
    model_support_images = list_files_with_extensions(join(S3_OUTPUTS_DIR, "model_outputs"), ['jpg'])  

    for pdf_file in pdf_list:
        pdf_name = get_filename_without_extension(pdf_file)

        pdf_file_path = join(S1_INPUT_PDFS_DIR, pdf_file)      
        assert os.path.exists(pdf_file_path), f"PDF File {pdf_file_path}, Not Found"

        doc_mask_registry = mask_registry.query(f"document=='{pdf_file}' & is_primary==True")
        assert len(doc_mask_registry) != 0, f"No results found for {pdf_file}"

        doc_mask_registry.sort_values(by=["mask_id","page_no"], inplace=True)     

        # SETUP OUTPUT DIR
        doc_output_dir = join(S4_JSON_TEXT_OUTPUTS_DIR,pdf_name)
        reset_directory(doc_output_dir, erase_contents=True)   
        
        concatenated_text = ""
        json_structure = {"paper_id": pdf_file, "title": "", "paper_text": []}

        for i,row in doc_mask_registry.iterrows():
            coords = (row['x0'], row['y0'], row['x1'], row['y1'])
            page_number = row['page_no'] - 1
            category = row['category_lbl']
            numbers = row['mask_shape']
            
            ## TEXT
            extracted_text = extract_text_from_scaled_pdf(pdf_file_path, page_number, coords, numbers)

            lines = extracted_text.split('\n')
            processed_lines = [line[:-1] if line.endswith('-') else line for line in lines]
            single_line_text = ''.join(processed_lines).strip()
            #single_line_text = textwrap.fill(single_line_text, width=80)

            if category == 'title':
                single_line_text = "\n## " + single_line_text + " ##\n"

            concatenated_text += "\n" + single_line_text

            ## JSON
            decoded_text = json.dumps(single_line_text, ensure_ascii=False)

            section_dict = {
                "section_name": "",  # Set based on your data/logic
                "section_text": decoded_text,
                "section_annotation": category,
                "section_page": page_number + 1,
                "section_column": 0,  # Set based on your data/logic
                "section_location": [coords]
            }

            json_structure["paper_text"].append(section_dict)

            ## IMAGES
            mask_img_f_name = row['mask_img_file_names']
            shutil.copy2(join(PAGE_MASK_DIR, mask_img_f_name), doc_output_dir)


        # Save TEXT FILE
        output_file_path = join(doc_output_dir, pdf_name+".txt")
        with open(output_file_path, 'w') as file:
            file.write(concatenated_text)        

        # Save JSON FILE
        json_output = json.dumps(json_structure, indent=4)
        json_output_file_path = join(doc_output_dir, pdf_name+".json")
        with open(json_output_file_path, 'w') as json_file:
           json_file.write(json_output)

        # COPY SUPPORTING FILES
        doc_mask_registry_path = join(doc_output_dir, f"{pdf_name}_mask_registry.csv")
        doc_mask_registry.to_csv(doc_mask_registry_path, index=False)

        # Useful Images
        for im_path in model_support_images:
            im_name = get_filename_without_extension(im_path)
            if im_name.startswith(pdf_name) and im_name.endswith("_base_dla_result"):
                shutil.copy2(im_path, doc_output_dir)


process_pdfs_local()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doc_mask_registry.sort_values(by=["mask_id","page_no"], inplace=True)
