In [1]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.7 PyMuPDFb-1.24.6


In [12]:
import pandas as pd
import fitz  # PyMuPDF
import re
import os
import math
from tkinter import Tk, filedialog

def get_pdf_schema(pdf_path:str):
  '''
  Read the custom-column schema referenced by the PDF catalog's /BSIAnnotColumns key.

  Args:
    pdf_path: Path of the PDF file to inspect.

  Returns:
    A list holding the text of every "/Name (...)" entry found in the object
    that /BSIAnnotColumns points to, in order of appearance, or None when the
    catalog entry is missing ("null") or empty.

  Raises:
    ValueError: if the key's value does not start with an integer object number.
  '''
  doc = fitz.open(pdf_path)
  try:
    cat = doc.pdf_catalog()
    # xref_get_key returns a (type, value) pair; only the value is inspected here.
    # For an indirect reference the value is tokenised as e.g. ['6', '0', 'R'].
    bsi_definition = doc.xref_get_key(cat, "BSIAnnotColumns")
    bsi_xref_str = bsi_definition[1].split()
    if not bsi_xref_str or bsi_xref_str[0] == 'null':
      return None
    bsi_xref = int(bsi_xref_str[0])
    column_definitions = doc.xref_object(bsi_xref)
  finally:
    # Release the file handle on every path, including the early return above.
    doc.close()

  # Each column dictionary is expected to carry its name on its own line: "/Name (<text>)".
  pattern = r'/Name \((.*?)\)\n'
  return re.findall(pattern, column_definitions)

def _pdf_literal_string(text) -> str:
  '''Escape backslashes and parentheses so text is safe inside a PDF literal string "( ... )".'''
  return str(text).replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")


def _bsi_column_entries(df, first_display_order: int) -> str:
  '''Build one column-definition dictionary per DataFrame column, numbered from first_display_order.'''
  entries = []
  for display_order, column in enumerate(df.columns, start=first_display_order):
    is_number = pd.api.types.is_numeric_dtype(df[column])
    entry = (
      "\n      <<"
      f"\n          /Subtype {'/Number' if is_number else '/Text'}"
      f"\n          /Name ({_pdf_literal_string(column)})"
      f"\n          /DisplayOrder {display_order}"
    )
    if is_number:
      # Numeric columns are written with a fixed precision of 10.
      entry += "\n          /Precision 10"
    entry += ">>"
    entries.append(entry)
  return "".join(entries)


def combine_BSIAnnotColumns(df:pd.DataFrame, pdf_path:str):
  '''
  Append the columns of `df` to the PDF's /BSIAnnotColumns schema and return
  a DataFrame whose columns are the existing schema names followed by df's own.

  When the PDF already has a schema (get_pdf_schema returns a list):
    * the array object referenced by the catalog's /BSIAnnotColumns key is read,
    * one dictionary per df column is inserted just before its closing "]",
      with /DisplayOrder continuing after the largest value already present,
    * the modified document is saved next to the original as
      "<name>_updated_object<ext>".
  When the PDF has no schema, the would-be array is only printed; nothing is
  written to disk yet and `df` itself is returned.

  Args:
    df: DataFrame whose columns are to be added. Numeric columns become
      /Number entries, all others /Text.
    pdf_path: Path of the PDF file to extend.

  Returns:
    The combined DataFrame (schema columns first, filled with missing values),
    or `df` unchanged when the PDF has no schema.

  Raises:
    ValueError: if /BSIAnnotColumns cannot be resolved to an object number, or
      the referenced object has no closing "]".
  '''
  xref_values = get_pdf_schema(pdf_path) # get the schema from the pdf path

  if xref_values is None:
    # No schema in the PDF: start from an empty array, display order starts at 1.
    modified_string = "[" + _bsi_column_entries(df, 1) + "]"
    print(modified_string)
    # TODO: the new array object (and the catalog key pointing to it) still has
    # to be inserted into the document; nothing is saved on this path.
    return df

  # Add the schema's columns (empty) and order them before the csv columns.
  # NOTE(review): if df already has a column named like a schema column, the
  # duplicate labels will presumably make reindex raise - confirm with real data.
  df_additions = pd.DataFrame(columns=xref_values)
  df_final = pd.concat([df, df_additions], axis=1)
  df_final = df_final.reindex(columns=xref_values + df.columns.tolist())

  doc = fitz.open(pdf_path)
  try:
    # Resolve the schema object through the catalog, the same way get_pdf_schema does,
    # instead of text-scanning every object in the file.
    key_type, key_value = doc.xref_get_key(doc.pdf_catalog(), "BSIAnnotColumns")
    if key_type != "xref":
      raise ValueError("Could not find /BSIAnnotColumns in the PDF.")
    bsi_annot_columns_obj_num = int(key_value.split()[0])

    # Read the /BSIAnnotColumns object
    bsi_annot_columns_obj = doc.xref_object(bsi_annot_columns_obj_num, compressed=False)

    # Continue numbering after the highest existing /DisplayOrder (0 if there is none).
    existing_orders = [int(value) for value in re.findall(r"/DisplayOrder\s+(-?\d+)", bsi_annot_columns_obj)]
    start_display_order = max(existing_orders, default=0)

    # Insert the new dictionaries right before the array's closing bracket.
    insert_at = bsi_annot_columns_obj.rfind("]")
    if insert_at == -1:
      raise ValueError("The /BSIAnnotColumns object is not a PDF array.")
    modified_string = (
      bsi_annot_columns_obj[:insert_at]
      + _bsi_column_entries(df, start_display_order + 1)
      + bsi_annot_columns_obj[insert_at:]
    )
    print(modified_string)

    doc.update_object(bsi_annot_columns_obj_num, modified_string)

    # Only the file extension is touched, so ".pdf" elsewhere in the path is left alone.
    root, ext = os.path.splitext(pdf_path)
    doc.save(root + "_updated_object" + ext)
  finally:
    doc.close()

  return df_final




In [13]:
import pandas as pd

# Metadata exported as CSV; its first column is used as the row index
csv_path = '/content/Grace manor-mid rise (1).pdf Metadata (2).csv'
df = pd.read_csv(csv_path, index_col=0)

# PDF whose /BSIAnnotColumns schema receives the CSV's columns
pdf_path = '/content/Simple Dimension.pdf'

combined = combine_BSIAnnotColumns(df, pdf_path)
combined

['6', '0', 'R']
[ <<
    /Subtype /Text
    /Name (Description)
    /DisplayOrder 0
    /Multiline false
  >> <<
    /Subtype /Number
    /Name (WeightFt)
    /DisplayOrder 17
    /Format /Normal
    /Precision 2
  >> <<
    /Subtype /Formula
    /Name (WeightTotal)
    /DisplayOrder 5
    /Format /Normal
    /Precision 2
    /Expression ([Length] * [WeightData] * [Quantity])
  >> <<
    /Subtype /Choice
    /Name (Size)
    /DisplayOrder 3
    /Format /Normal
    /Precision 2
    /Items 10 0 R
    /DefaultValues [ ]
    /AllowCustom false
  >> <<
    /Subtype /Formula
    /Name (WeightData)
    /DisplayOrder 4
    /Format /Normal
    /Precision 2
    /Totals false
    /Expression ([Size])
  >> <<
    /Subtype /Number
    /Name (Quantity)
    /DisplayOrder 2
    /DefaultValue (1)
    /Format /Normal
    /Precision 0
  >> <<
    /Subtype /Number
    /Name (Cost)
    /DisplayOrder 1
    /Format /Normal
    /Precision 2
  >> <<
    /Subtype /Number
    /Name (Copes)
    /DisplayOrder -1
 

Unnamed: 0,Description,WeightFt,WeightTotal,Size,WeightData,Quantity,Cost,Copes,Moment Connections,Additional Shop Hours,...,y2,NM,ScaleFactor,author,modified_date,xref,Type,AISC_Manual_Label,W,PB
0,,,,,,,,,,,...,18.390301,fitz-A1,96,W10X15,,1475,W,W10X15,15.0,35.0
1,,,,,,,,,,,...,23.270001,fitz-A3,96,W4X13,,1479,W,W4X13,13.0,23.6
2,,,,,,,,,,,...,36.543335,fitz-A5,96,W10X15,,1483,W,W10X15,15.0,35.0
3,,,,,,,,,,,...,30.689999,fitz-A7,96,W10X15,,1487,W,W10X15,15.0,35.0
4,,,,,,,,,,,...,13.247562,fitz-A9,96,W10X30,,1491,W,W10X30,30.0,43.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,,,,,,,,,,,...,11.066667,fitz-A655,96,"('W10X54',)",,3507,,,,
1056,,,,,,,,,,,...,11.120002,fitz-A656,96,"('W10X54',)",,3509,,,,
1057,,,,,,,,,,,...,22.830000,fitz-A657,96,"('W10X54',)",,3511,,,,
1058,,,,,,,,,,,...,22.926666,fitz-A658,96,"('W10X54',)",,3513,,,,


In [14]:
# Sanity check: re-open the saved copy and print its /BSIAnnotColumns object

pdf_path = '/content/Simple Dimension_updated_object.pdf'
doc = fitz.open(pdf_path)

# Walk the xref table until an object mentions /BSIAnnotColumns, then keep the
# object number that follows the key
xreflen = doc.xref_length()
bsi_annot_columns_obj_num = None

for xref in range(1, xreflen):
    obj = doc.xref_object(xref, compressed=False)
    if "/BSIAnnotColumns" not in obj:
        continue
    bsi_annot_columns_obj_num = int(obj.split("/BSIAnnotColumns")[1].split(" ")[1])
    break

if bsi_annot_columns_obj_num is None:
    raise ValueError("Could not find /BSIAnnotColumns in the PDF.")

# Fetch the schema object and work out the largest /DisplayOrder it contains
bsi_annot_columns_obj = doc.xref_object(bsi_annot_columns_obj_num, compressed=False)
start_display_order = max(
    int(chunk.split("/DisplayOrder")[1].split()[0])
    for chunk in bsi_annot_columns_obj.split("<<")
    if "/DisplayOrder" in chunk
)

print(bsi_annot_columns_obj)
print(start_display_order)

[ <<
    /Subtype /Text
    /Name (Description)
    /DisplayOrder 0
    /Multiline false
  >> <<
    /Subtype /Number
    /Name (WeightFt)
    /DisplayOrder 17
    /Format /Normal
    /Precision 2
  >> <<
    /Subtype /Formula
    /Name (WeightTotal)
    /DisplayOrder 5
    /Format /Normal
    /Precision 2
    /Expression ([Length] * [WeightData] * [Quantity])
  >> <<
    /Subtype /Choice
    /Name (Size)
    /DisplayOrder 3
    /Format /Normal
    /Precision 2
    /Items 10 0 R
    /DefaultValues [ ]
    /AllowCustom false
  >> <<
    /Subtype /Formula
    /Name (WeightData)
    /DisplayOrder 4
    /Format /Normal
    /Precision 2
    /Totals false
    /Expression ([Size])
  >> <<
    /Subtype /Number
    /Name (Quantity)
    /DisplayOrder 2
    /DefaultValue (1)
    /Format /Normal
    /Precision 0
  >> <<
    /Subtype /Number
    /Name (Cost)
    /DisplayOrder 1
    /Format /Normal
    /Precision 2
  >> <<
    /Subtype /Number
    /Name (Copes)
    /DisplayOrder -1
    /Deleted true