## DON'T FORGET TO CHANGE OUTPUT FILE NAMES!

In [5]:
import json
import pandas as pd
import numpy as np

file_name ='test-me.json'
file_path = f'../{file_name}'

output_cells_file_name = 'T_preprocessed_cells.csv'
output_tables_file_name = 'T_preprocessed_tables.csv'

# json_to_process = ['../1-100.json', '../101-300.json', '../750-999.json']

with open(file_path) as file:
    data = json.load(file)

In [6]:
print(len(data))

59667


In [7]:
## ORIGINAL VERSION STEP 1

# Find all TABLE blocks
table_ids = [item['Id'] for item in data if item.get('BlockType') == 'TABLE']

# Initialize an empty list to hold cell records
cell_records = []

# Function to find items by ID
def find_item_by_id(item_id):
    return next((item for item in data if item.get('Id') == item_id), None)

# Iterate over table IDs to process each table's cells
for table_id in table_ids:
    table_block = find_item_by_id(table_id)
    if 'Relationships' in table_block:
        for relationship in table_block['Relationships']:
            cell_type = relationship['Type']  # This could be CHILD, MERGED_CELL, or TABLE_TITLE
            for cell_id in relationship['Ids']:
                cell_block = find_item_by_id(cell_id)
                # Determine if EntityTypes exists and get the first EntityType if available
                entity_type = cell_block.get('EntityTypes', [None])[0] if 'EntityTypes' in cell_block else None
                # Create a record for this cell
                cell_record = {
                    'cell_id': cell_id,
                    'cell_type': cell_type,
                    'row_index': cell_block.get('RowIndex'),
                    'column_index': cell_block.get('ColumnIndex'),
                    'row_span': cell_block.get('RowSpan'),
                    'column_span': cell_block.get('ColumnSpan'),
                    'table_id': table_id,
                    'table_width': table_block['Geometry']['BoundingBox']['Width'],
                    'table_height': table_block['Geometry']['BoundingBox']['Height'],
                    'table_left': table_block['Geometry']['BoundingBox']['Left'],
                    'table_top': table_block['Geometry']['BoundingBox']['Top'],
                    'table_page': table_block['Page'],
                    'table_type': table_block['EntityTypes'][0] if 'EntityTypes' in table_block else None,
                    'entity_type': entity_type
                }
                cell_records.append(cell_record)

# Convert the list of cell records into a DataFrame
cells_df = pd.DataFrame(cell_records)

# Display the first few rows of the DataFrame to verify its structure
cells_df.head()


Unnamed: 0,cell_id,cell_type,row_index,column_index,row_span,column_span,table_id,table_width,table_height,table_left,table_top,table_page,table_type,entity_type
0,ac008238-4da0-4f5c-b8b3-957b99940980,CHILD,1.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,
1,a407d92d-a3c2-4ff1-983a-4da842168874,CHILD,1.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,
2,3650936d-c9f9-424a-8f6c-3841b459e2a0,CHILD,2.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,
3,ad34b937-d2e1-4e6d-a582-536154f069db,CHILD,2.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,
4,488e61d9-432a-4932-b0fd-3136ee8a01b0,CHILD,3.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,


In [8]:
## ORIGINAL VERSION STEP 2

# Ensure cells_df has the 'cell_children' column initialized to hold lists
if 'cell_children' not in cells_df.columns:
    cells_df['cell_children'] = [[] for _ in range(len(cells_df))]

for index, row in cells_df.iterrows():
    cell_block = find_item_by_id(row['cell_id'])  # Make sure this matches the exact column name
    cell_block_children = []

    # Iterating through all relationships (if multiple)
    for relationship in cell_block.get('Relationships', []):
        if relationship['Type'] == 'CHILD':
            cell_block_children.extend(relationship['Ids'])

    # Directly updating the DataFrame with the list
    cells_df.at[index, 'cell_children'] = cell_block_children

cells_df.head()

Unnamed: 0,cell_id,cell_type,row_index,column_index,row_span,column_span,table_id,table_width,table_height,table_left,table_top,table_page,table_type,entity_type,cell_children
0,ac008238-4da0-4f5c-b8b3-957b99940980,CHILD,1.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[40543536-299b-47a1-a0cb-0a649f19ff27]
1,a407d92d-a3c2-4ff1-983a-4da842168874,CHILD,1.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[725c935d-ad62-41cc-bd73-be42745e4f5f]
2,3650936d-c9f9-424a-8f6c-3841b459e2a0,CHILD,2.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,"[bc217098-02e5-480b-8900-e1b79aaf0731, 873d277..."
3,ad34b937-d2e1-4e6d-a582-536154f069db,CHILD,2.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[aa471032-bb0a-4ae7-b728-d86388e882c6]
4,488e61d9-432a-4932-b0fd-3136ee8a01b0,CHILD,3.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[59dcd74b-57e2-46b8-805b-edae49f9097b]


In [9]:
## ORIGINAL VERSION STEP 3

# Initialize 'cell_words' column with empty lists
cells_df['cell_words'] = [[] for _ in range(len(cells_df))]

# Iterate through each row to process 'cell_children'
for index, row in cells_df.iterrows():
    # Initialize a list to hold the text of each child
    words_list = []


    # Iterate through each 'cell_children' ID
    for child_id in row['cell_children']:
        # Use 'find_item_by_id' to get the word block
        word_block = find_item_by_id(child_id)
        
        # Extract the 'Text' property if it exists
        if word_block and 'Text' in word_block:
            words_list.append(word_block['Text'])
        elif word_block and 'SelectionStatus' in word_block:
            words_list.append(word_block['SelectionStatus'])

    
    # Assign the list of words to 'cell_words' for the row
    cells_df.at[index, 'cell_words'] = words_list
    cells_df.at[index, 'cell_content'] = ' '.join([word if word is not None else 'empty' for word in words_list])


def modify_row(row):
    if row['cell_children'] == []:
        row['cell_words'] = [None]
    return row

# Apply the custom function along the DataFrame's rows
# cells_df = cells_df.apply(modify_row, axis=1)

cells_df.head()


Unnamed: 0,cell_id,cell_type,row_index,column_index,row_span,column_span,table_id,table_width,table_height,table_left,table_top,table_page,table_type,entity_type,cell_children,cell_words,cell_content
0,ac008238-4da0-4f5c-b8b3-957b99940980,CHILD,1.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[40543536-299b-47a1-a0cb-0a649f19ff27],[Date:],Date:
1,a407d92d-a3c2-4ff1-983a-4da842168874,CHILD,1.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[725c935d-ad62-41cc-bd73-be42745e4f5f],[7/31/16],7/31/16
2,3650936d-c9f9-424a-8f6c-3841b459e2a0,CHILD,2.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,"[bc217098-02e5-480b-8900-e1b79aaf0731, 873d277...","[Work, Order:]",Work Order:
3,ad34b937-d2e1-4e6d-a582-536154f069db,CHILD,2.0,2.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[aa471032-bb0a-4ae7-b728-d86388e882c6],[12359],12359
4,488e61d9-432a-4932-b0fd-3136ee8a01b0,CHILD,3.0,1.0,1.0,1.0,66f86551-5831-4bba-860c-95bc0acef324,0.175104,0.069369,0.714693,0.069639,1,SEMI_STRUCTURED_TABLE,,[59dcd74b-57e2-46b8-805b-edae49f9097b],[Temperature:],Temperature:


In [124]:
# cells_df.to_csv('interim-wihtout-editing.csv', index=False)

In [10]:
## OPTIONAL STEP 4

cells_df['row_index'] = cells_df['row_index'].fillna(0).astype(int)
cells_df['column_index'] = cells_df['column_index'].fillna(0).astype(int)
cells_df['row_span'] = cells_df['row_span'].fillna(1).astype(int)  # Assuming a default span of 1 if missing
cells_df['column_span'] = cells_df['column_span'].fillna(1).astype(int)

# Sort the DataFrame as required
cells_df.sort_values(by=['table_id', 'column_index', 'row_index'], inplace=True)

# Isolate merged cells
merged_cells = cells_df[cells_df['cell_type'] == 'MERGED_CELL']

for cell in merged_cells.itertuples():
    # Retrieve the necessary details
    row_index = cell.row_index
    column_index = cell.column_index
    row_span = cell.row_span
    column_span = cell.column_span

    # Calculate the affected range of rows and columns
    affected_rows = range(row_index, row_index + row_span)
    affected_columns = range(column_index, column_index + column_span)

    # Find the cells that are affected
    affected_cells = cells_df[
        (cells_df['table_id'] == cell.table_id) &
        (cells_df['cell_type'] == 'CHILD') &  # Targeting only child cells
        (cells_df['row_index'].isin(affected_rows)) &
        (cells_df['column_index'].isin(affected_columns))
    ]

    # Filter out empty strings and concatenate the rest; ensure it's stripped to remove leading/trailing spaces
    aggregated_text_content = " ".join(filter(None, affected_cells['cell_content'].astype(str))).strip()

    # Check if the aggregated content is not empty before updating
    if aggregated_text_content:
        # Update the affected cells with the aggregated text content
        cells_df.loc[affected_cells.index, 'cell_content'] = aggregated_text_content
        cells_df.loc[affected_cells.index, 'merged_parent_cell_id'] = cell.cell_id
        cells_df.loc[affected_cells.index, 'has_merged_parent'] = 1

cells_df['has_merged_parent'] = cells_df['has_merged_parent'].fillna(0)
cells_df['merged_parent_cell_id'] = cells_df['merged_parent_cell_id'].fillna(-1)

In [126]:
# cells_df.to_csv('interim-output-5.csv', index=False)

In [11]:
# ORIGINAL VERSION STEP 5

# Sort the DataFrame as required
cells_df.sort_values(by=['table_id', 'column_index', 'row_index'], inplace=True)

# Initialize empty lists to store the 'words' in each direction for each cell
words_above_list = []
words_left_list = []
words_below_list = []
words_right_list = []

# Filter the DataFrame to only include rows where cell_type is 'CHILD'
child_cells_df = cells_df[cells_df['cell_type'] == 'CHILD']

# Iterate over rows of child_cells_df instead of the entire cells_df
for index, row in child_cells_df.iterrows():
    # Adjust masks to include only CHILD cells for comparison
    above_mask = (cells_df['table_id'] == row['table_id']) & \
                 (cells_df['column_index'] == row['column_index']) & \
                 (cells_df['row_index'] < row['row_index']) & \
                 (cells_df['cell_type'] == 'CHILD')
    above_cells = cells_df.loc[above_mask]

    left_mask = (cells_df['table_id'] == row['table_id']) & \
                (cells_df['row_index'] == row['row_index']) & \
                (cells_df['column_index'] < row['column_index']) & \
                (cells_df['cell_type'] == 'CHILD')
    left_cells = cells_df.loc[left_mask]

    below_mask = (cells_df['table_id'] == row['table_id']) & \
                 (cells_df['column_index'] == row['column_index']) & \
                 (cells_df['row_index'] > row['row_index']) & \
                 (cells_df['cell_type'] == 'CHILD')
    below_cells = cells_df.loc[below_mask]

    right_mask = (cells_df['table_id'] == row['table_id']) & \
                 (cells_df['row_index'] == row['row_index']) & \
                 (cells_df['column_index'] > row['column_index']) & \
                 (cells_df['cell_type'] == 'CHILD')
    right_cells = cells_df.loc[right_mask]
    
    # Process each direction's cells to aggregate words, replacing None with 'empty' and joining words within a cell
    words_above = [' '.join([word if word is not None else 'empty' for word in cell_words]) if cell_words else 'empty' for cell_words in above_cells['cell_words']]
    words_left = [' '.join([word if word is not None else 'empty' for word in cell_words]) if cell_words else 'empty' for cell_words in left_cells['cell_words']]
    words_below = [' '.join([word if word is not None else 'empty' for word in cell_words]) if cell_words else 'empty' for cell_words in below_cells['cell_words']]
    words_right = [' '.join([word if word is not None else 'empty' for word in cell_words]) if cell_words else 'empty' for cell_words in right_cells['cell_words']]

    # Append the list to the respective direction list
    words_above_list.append(words_above)
    words_left_list.append(words_left)
    words_below_list.append(words_below)
    words_right_list.append(words_right)

# Since we're iterating over child_cells_df, we need to merge the results back into the original DataFrame
# Create a temporary DataFrame with the results
temp_df = pd.DataFrame({
    'index': child_cells_df.index,
    'words_above': words_above_list,
    'words_left': words_left_list,
    'words_below': words_below_list,
    'words_right': words_right_list
})

# Merge the temporary DataFrame back into the original DataFrame based on the index
cells_df = pd.merge(cells_df, temp_df, how='left', left_index=True, right_on='index')

# Drop the 'index' column as it's no longer needed
cells_df.drop(columns=['index'], inplace=True)

cells_df.head()


Unnamed: 0,cell_id,cell_type,row_index,column_index,row_span,column_span,table_id,table_width,table_height,table_left,...,entity_type,cell_children,cell_words,cell_content,merged_parent_cell_id,has_merged_parent,words_above,words_left,words_below,words_right
,dc135c6c-215c-477d-966f-ce5ddc42ac11,TABLE_TITLE,0,0,1,1,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,...,,"[aeba045e-658d-4abd-bd84-ee05f33af666, 919f6c8...","[Operational, Tests]",Operational Tests,-1,0.0,,,,
0.0,ac135cef-4f1f-4e34-a756-7e908911b6fc,CHILD,1,1,1,1,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,...,TABLE_TITLE,"[aeba045e-658d-4abd-bd84-ee05f33af666, 919f6c8...","[Operational, Tests]",Operational Tests,bf4932c4-02ee-46f2-9a45-c63f6c30ec96,1.0,[],[],"[Manual Open:, Electrically Open:, Manually Ch...","[empty, empty, empty]"
,bf4932c4-02ee-46f2-9a45-c63f6c30ec96,MERGED_CELL,1,1,1,4,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,...,TABLE_TITLE,"[ac135cef-4f1f-4e34-a756-7e908911b6fc, 27e3ce5...",[],,-1,0.0,,,,
1.0,9d401b93-a9c6-40ff-b209-c2a267037a25,CHILD,2,1,1,1,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,...,,"[29d39722-081a-4dff-b011-194a7e94457b, a9cd7af...","[Manual, Open:]",Manual Open:,-1,0.0,[Operational Tests],[],"[Electrically Open:, Manually Charge:, Trip wi...","[OK, Manual Close:, OK]"
2.0,bb51bfeb-9198-448a-a07d-1e8b5ee09a64,CHILD,3,1,1,1,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,...,,"[20ccf29b-1f35-4b48-a2b0-9a0ad5fbd962, a429dad...","[Electrically, Open:]",Electrically Open:,-1,0.0,"[Operational Tests, Manual Open:]",[],"[Manually Charge:, Trip with Protective Devices:]","[N/A, Electrically Close:, N/A]"


In [12]:
cells_df.to_csv(f'../{output_cells_file_name}', index=False)

In [13]:
# Function to aggregate cell contents into a list of lists, one per row
def aggregate_contents(group):
    # Sort the group by row and column index to ensure the correct order
    sorted_group = group.sort_values(by=['row_index', 'column_index'])
    # Aggregate contents by row
    contents_by_row = sorted_group.groupby('row_index')['cell_words'].apply(list).tolist()
    return contents_by_row

def aggregate_child_entities(group):
    # Filter the group to only include CHILD cells
    child_cells = group[group['cell_type'] == 'CHILD']
    # Replace NaN or empty entity_type values with 'normal'
    child_cells['entity_type'] = child_cells['entity_type'].replace({np.nan: 'normal', '': 'normal'})
    # Sort the group by row and column index to ensure the correct order
    sorted_group = child_cells.sort_values(by=['row_index', 'column_index'])
    # Aggregate entity types by row
    entities_by_row = sorted_group.groupby('row_index')['entity_type'].apply(list).tolist()
    return entities_by_row

# Aggregate information for each table
tables_df = cells_df.groupby('table_id').apply(lambda g: pd.Series({
    'table_width': g['table_width'].max(),
    'table_height': g['table_height'].max(),
    'table_left': g['table_left'].max(),
    'table_top': g['table_top'].max(),
    'table_page': g['table_page'].max(),
    'cell_count': g['cell_words'].count(),
    'row_count': g['row_index'].nunique(),
    'column_count': g['column_index'].nunique(),
    'content': aggregate_contents(g),
    'entities': aggregate_child_entities(g),
    # Add counts for different cell types
    'child_count': g[g['cell_type'] == 'CHILD']['cell_type'].count(),
    'merged_cell_count': g[g['cell_type'] == 'MERGED_CELL']['cell_type'].count(),
    'table_title_count': g[g['cell_type'] == 'TABLE_TITLE']['cell_type'].count(),
    'table_footer_count': g[g['cell_type'] == 'TABLE_FOOTER']['cell_type'].count(),
    'table_type': g['table_type'].max()
})).reset_index()

tables_df.head()

Unnamed: 0,table_id,table_width,table_height,table_left,table_top,table_page,cell_count,row_count,column_count,content,entities,child_count,merged_cell_count,table_title_count,table_footer_count,table_type
0,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,0.253852,21,22,6,5,"[[[Operational, Tests]], [[Operational, Tests]...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",20,1,1,0,SEMI_STRUCTURED_TABLE
1,008cfa48-340e-48f2-b50f-72501391bb7b,0.845629,0.298292,0.08772,0.462612,42,81,16,6,"[[[Trip, Unit, Settings]], [[Trip, Unit, Setti...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",75,5,1,0,STRUCTURED_TABLE
2,00f834b4-1cfa-4d45-9865-f2875e7ec764,0.84497,0.288748,0.084845,0.513307,93,90,17,6,"[[[Trip, Tests], [Comments:, The, circuit, bre...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",80,8,1,1,STRUCTURED_TABLE
3,00fc6115-6976-4954-8b76-8f0e92b6aecb,0.172638,0.068643,0.710909,0.070024,98,8,4,2,"[[[Date:], [7/30/16]], [[Work, Order:], [12362...","[[normal, normal], [normal, normal], [normal, ...",8,0,0,0,SEMI_STRUCTURED_TABLE
4,013618d2-8c4b-42ea-ad34-4005c6a8bb82,0.846589,0.145947,0.086444,0.098644,55,34,9,5,"[[[Visual, Inspection]], [[Visual, Inspection]...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",32,1,1,0,SEMI_STRUCTURED_TABLE


In [14]:
layout_title_ids = [item['Id'] for item in data if item.get('BlockType') == 'LAYOUT_TITLE']

layout_titles = []

for layout_title in layout_title_ids:
    layout_title_block = find_item_by_id(layout_title)
    layout_title_cell = {
        'layout_title_id': layout_title,
        'layout_title_text': [],
        'layout_title_page': layout_title_block['Page'],
        'layout_title_confidence': layout_title_block['Confidence'],
    }
    if 'Relationships' in layout_title_block:
        for relationship in layout_title_block['Relationships']:
            for child_id in relationship['Ids']:
                child_block = find_item_by_id(child_id)
                if child_block['BlockType'] == 'LINE':
                    title_text = child_block['Text']
                    layout_title_cell['layout_title_text'].append(title_text)
    layout_titles.append(layout_title_cell)


layout_titles_df = pd.DataFrame(layout_titles)

# Calculate the maximum layout_title_confidence for each layout_title_page
layout_titles_df['max_confidence_per_page'] = layout_titles_df.groupby('layout_title_page')['layout_title_confidence'].transform('max')

# Determine if the current row has the maximum layout_title_confidence within its page
layout_titles_df['is_max_confidence'] = layout_titles_df['layout_title_confidence'] == layout_titles_df['max_confidence_per_page']
layout_titles_df.drop(columns=['max_confidence_per_page'], inplace=True)

layout_titles_df.head()

Unnamed: 0,layout_title_id,layout_title_text,layout_title_page,layout_title_confidence,is_max_confidence
0,dd2340f1-81d0-40f5-b55d-dbb0475ac08f,[Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT],1,80.029297,True
1,b0090016-2b4a-4428-909a-067c763bd9d8,[Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT (...,2,83.447266,True
2,bbc0454e-e4b0-4eff-89ff-cbbcab5c64b5,[Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT],3,80.175781,True
3,9ba57125-8fd5-4597-bef8-2193cdade0b4,[Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT],4,80.175781,True
4,c32a2fa3-91d6-4ba6-b179-02c0707b7b18,[Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT (...,5,85.9375,True


In [15]:
# Filter layout_titles_df
max_confidence_titles_df = layout_titles_df[layout_titles_df['is_max_confidence'] == True].copy()

# Flatten the layout_title_text list into a single string
max_confidence_titles_df['layout_title_text'] = max_confidence_titles_df['layout_title_text'].apply(' '.join)

# Merge with tables_df
tables_df = pd.merge(tables_df, max_confidence_titles_df[['layout_title_page', 'layout_title_text']],
                     left_on='table_page', right_on='layout_title_page', how='left')

# Rename and clean up
tables_df.drop(columns=['layout_title_page'], inplace=True, errors='ignore')


In [16]:
tables_df.head()

Unnamed: 0,table_id,table_width,table_height,table_left,table_top,table_page,cell_count,row_count,column_count,content,entities,child_count,merged_cell_count,table_title_count,table_footer_count,table_type,layout_title_text
0,00491b53-15ea-46a8-9dcc-c718cf35b9d7,0.846375,0.092622,0.086091,0.253852,21,22,6,5,"[[[Operational, Tests]], [[Operational, Tests]...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",20,1,1,0,SEMI_STRUCTURED_TABLE,Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT (C...
1,008cfa48-340e-48f2-b50f-72501391bb7b,0.845629,0.298292,0.08772,0.462612,42,81,16,6,"[[[Trip, Unit, Settings]], [[Trip, Unit, Setti...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",75,5,1,0,STRUCTURED_TABLE,Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT
2,00f834b4-1cfa-4d45-9865-f2875e7ec764,0.84497,0.288748,0.084845,0.513307,93,90,17,6,"[[[Trip, Tests], [Comments:, The, circuit, bre...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",80,8,1,1,STRUCTURED_TABLE,Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT (C...
3,00fc6115-6976-4954-8b76-8f0e92b6aecb,0.172638,0.068643,0.710909,0.070024,98,8,4,2,"[[[Date:], [7/30/16]], [[Work, Order:], [12362...","[[normal, normal], [normal, normal], [normal, ...",8,0,0,0,SEMI_STRUCTURED_TABLE,Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT
4,013618d2-8c4b-42ea-ad34-4005c6a8bb82,0.846589,0.145947,0.086444,0.098644,55,34,9,5,"[[[Visual, Inspection]], [[Visual, Inspection]...","[[TABLE_TITLE, TABLE_TITLE, TABLE_TITLE, TABLE...",32,1,1,0,SEMI_STRUCTURED_TABLE,Low VOLTAGE AIR CIRCUIT BREAKER TEST REPORT (C...


In [17]:
tables_df.to_csv(f'../{output_tables_file_name}', index=False)