In [1]:
import os
import re
import shutil
import csv
from PIL import Image

In [2]:
# File and Directory Paths
# Directory strings end with "/" because the rest of the notebook builds
# paths by plain string concatenation.
source_dir = "source_files/"
processed_metadata_file = f"{source_dir}papers_processed.csv"

table_dir = "extracted_tables/"
table_metadata_file = f"{table_dir}tables.csv"
image_table_dir = f"{table_dir}table_images/"
code_table_dir = f"{table_dir}table_code/"

In [None]:
# Table image filtering by max height
def filter_table_image(max_height):
    """Collect the ids of table images that are taller than ``max_height``.

    Every entry of ``image_table_dir`` is opened with PIL. Entries that cannot
    be opened or measured are reported on stdout and skipped (they are NOT
    added to the result).

    Args:
        max_height: Largest height, as reported by ``Image.size``, that an
            image may have without being filtered.

    Returns:
        List of file names with a trailing ``.png`` removed, one per image
        whose height exceeds ``max_height``.
    """
    filtered_tables = []
    for table_image in os.listdir(image_table_dir):
        try:
            # The context manager releases the file handle even when reading
            # the size raises.
            with Image.open(image_table_dir + table_image) as img:
                _, height = img.size
        except Exception as e:
            print(f"Error for file {table_image}: {e}")
            continue

        if height > max_height:
            # Strip only a trailing ".png" so that id + ".png" is the file
            # name again (a mid-name ".png" must stay untouched).
            if table_image.endswith(".png"):
                table_id = table_image[:-len(".png")]
            else:
                table_id = table_image
            filtered_tables.append(table_id)

    print(f"{len(filtered_tables)} tables have been filtered.")
    return filtered_tables

In [None]:
# Table code filtering by min rows and min columns
def filter_table_code(min_rows, min_cols):
    """Collect the ids of LaTeX tables that are too small or cannot be parsed.

    Every entry of ``code_table_dir`` is measured with
    ``count_table_dimension``. A table is filtered when it has fewer than
    ``min_rows`` rows or fewer than ``min_cols`` columns, and also when
    measuring it raises (the error is printed to stdout).

    Args:
        min_rows: Minimum number of rows a table must have to be kept.
        min_cols: Minimum number of columns a table must have to be kept.

    Returns:
        List of file names with a trailing ``.txt`` removed.
    """
    filtered_tables = []
    for table_code in os.listdir(code_table_dir):
        # Strip only a trailing ".txt" so that id + ".txt" is the file name
        # again (a mid-name ".txt" must stay untouched).
        if table_code.endswith(".txt"):
            table_id = table_code[:-len(".txt")]
        else:
            table_id = table_code

        try:
            rows, cols = count_table_dimension(code_table_dir + table_code)
            too_small = rows < min_rows or cols < min_cols
        except Exception as e:
            # Unparseable tables are deliberately treated as filtered.
            print(f"Error for file {table_code}: {e}")
            too_small = True

        if too_small:
            filtered_tables.append(table_id)

    print(f"{len(filtered_tables)} tables have been filtered.")
    return filtered_tables

# Determines the number of rows and columns of a latex table based on its code
def count_table_dimension(code_path):
    r"""Count rows and columns of the first ``tabular`` environment in a file.

    The text between ``\begin{tabular}`` and ``\end{tabular}`` is stripped of
    commands with a braced argument, comments and rule commands, then split
    on the LaTeX row separator. The column count is taken from the second
    row (index 1) as the number of ``&`` plus one.

    Args:
        code_path: Path of a UTF-8 text file containing LaTeX code.

    Returns:
        Tuple ``(number_rows, number_columns)``. ``number_columns`` is 0 (and
        a message is printed) when fewer than two rows are found.

    Raises:
        ValueError: If the file contains no ``tabular`` environment.
    """
    with open(code_path, "r", encoding="utf8") as f:
        latex_code = f.read()

    match = re.search(r'\\begin\{tabular\}.*?\\end\{tabular\}', latex_code, re.DOTALL)
    if match is None:
        raise ValueError("No tabular environment found in the code.")

    # Drop the first line (the one holding \begin{tabular}). A tabular written
    # on a single line has no such separate line, so keep it whole instead of
    # failing with an IndexError.
    tabular = match.group(0)
    _, newline, remainder = tabular.partition("\n")
    table_content = remainder if newline else tabular

    # Remove latex formatting
    table_content = re.sub(r'\\[a-zA-Z]+\{.*?\}', '', table_content)  # Remove commands like \textbf{}
    # Remove comments. A "%" preceded by an odd number of backslashes is an
    # escaped percent sign (\%) and must stay; an even run of backslashes
    # (e.g. the row break "\\") directly before "%" is kept via group 1.
    table_content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', table_content)
    table_content = re.sub(r"\\(top|mid|bottom)rule", "", table_content)
    table_content = re.sub(r"\\hline", "", table_content)
    table_content = re.sub(r'\n', ' ', table_content)  # Replace newlines with spaces for simpler processing

    # Count rows and columns
    rows = re.split(r'\\\\', table_content)
    rows = [row.strip() for row in rows if row.strip()]
    number_rows = len(rows)

    number_columns = 0
    if len(rows) > 1:
        number_columns = rows[1].count('&') + 1
    else:
        print(f"No rows found for {code_path}!")

    return number_rows, number_columns

In [None]:
# Removes a subset of tables from the dataset (including table_image, table_code)
def remove_from_dataset(table_subset):
    """Delete the given tables from disk and from the table metadata file.

    For every id in ``table_subset`` the files ``<id>.png`` in
    ``image_table_dir`` and ``<id>.txt`` in ``code_table_dir`` are removed if
    present. Afterwards ``table_metadata_file`` is rewritten without the rows
    whose first column is one of the removed ids. The rewrite goes through
    ``tmp.csv`` in ``table_dir``, which then replaces the metadata file.

    Args:
        table_subset: Iterable of table ids (file names without extension).
    """
    # A set removes duplicates (a repeated id would otherwise try to delete
    # the same file twice) and makes the per-row lookup below constant time.
    removed = set(table_subset)

    table_code_files = set(os.listdir(code_table_dir))
    table_image_files = set(os.listdir(image_table_dir))

    for table in removed:
        table_png = table + ".png"
        table_code = table + ".txt"

        if table_png in table_image_files:
            os.remove(image_table_dir + table_png)
        if table_code in table_code_files:
            os.remove(code_table_dir + table_code)

    # Remove filtered tables from metadata file
    tmp_file = table_dir + "tmp.csv"
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as input_file:
        with open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
            csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            # Only writes rows of not deleted tables
            for row in csv_reader:
                # Blank lines are parsed as empty rows; pass them through
                # instead of failing on row[0].
                if row and row[0] in removed:
                    continue
                csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_file, table_metadata_file)

In [None]:
# Maximum height a table image may have; taller images are filtered out
MAX_HEIGHT = 1700

# Collect the ids of all table images taller than MAX_HEIGHT
filtered_tables = filter_table_image(MAX_HEIGHT)

# Remove them from the dataset (deletes files and rewrites the table metadata)
remove_from_dataset(filtered_tables)

In [None]:
# Minimum number of rows and columns a LaTeX table must have; smaller tables are filtered out
MIN_ROWS = 4
MIN_COLUMNS = 4

# Collect the ids of all tables that are too small or could not be parsed
filtered_tables = filter_table_code(MIN_ROWS, MIN_COLUMNS)

# Remove them from the dataset (deletes files and rewrites the table metadata)
remove_from_dataset(filtered_tables)

In [None]:
# Deletes no longer used tex tables from source files
def update_processed_metadata():
    """Bring the paper metadata and the source folders in line with the table metadata.

    Reads the table ids (first column) from ``table_metadata_file``, then
    walks ``processed_metadata_file`` row by row, using column 0 as paper id,
    column 3 as table count and column 5 as "to be processed" flag:

    * Column 3 is overwritten with the number of table ids that start with
      the paper id.
    * If no table is left and the flag is ``"True"``, the flag is set to
      ``"False"`` and the paper folder below ``source_dir`` is deleted.
    * If fewer tables are left than recorded and the flag is ``"True"``,
      every ``FR_TAB_*.tex`` file in the paper folder whose stem is not one
      of the remaining table ids is deleted.

    File-system errors are printed and do not stop the run. The updated rows
    are written to ``tmp.csv`` in ``source_dir``, which then replaces
    ``processed_metadata_file``.
    """
    table_list = []
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as metadata_file:
        csv_reader = csv.reader(metadata_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in csv_reader:
            # Blank lines are parsed as empty rows and carry no table id.
            if row:
                table_list.append(row[0])
    print(len(table_list))

    tmp_file = source_dir + "tmp.csv"
    with open(processed_metadata_file, "r", newline='', encoding='utf-8') as input_file:
        with open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
            csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            for row in csv_reader:
                if not row:
                    # Keep blank lines as they are instead of failing on row[0].
                    csv_writer.writerow(row)
                    continue

                paper_id = row[0]
                num_tables = int(row[3])
                to_be_processed = row[5]

                # NOTE(review): plain prefix match - a paper id that is a
                # prefix of another paper id would also count that paper's
                # tables. Confirm that ids cannot collide this way.
                actual_tables = {x for x in table_list if x.startswith(paper_id)}
                actual_table_number = len([x for x in table_list if x.startswith(paper_id)])
                row[3] = actual_table_number

                if actual_table_number == 0 and to_be_processed == "True":
                    row[5] = "False"
                    try:
                        shutil.rmtree(source_dir + paper_id)
                        print(f"Deleting {paper_id}")
                    except Exception as e:
                        print(f"Paper {row[0]} could not be deleted.")
                        print(f"Error Type: {type(e).__name__}")
                        error_message = str(e)[:100]
                        print(f"Error Message: {error_message}")
                elif actual_table_number < num_tables and to_be_processed == "True":
                    # A missing or unreadable paper folder must not abort the
                    # whole run (the temporary csv would be left behind and
                    # the metadata never updated); report it like the other
                    # file-system errors and carry on.
                    try:
                        dir_entries = os.listdir(source_dir + paper_id)
                    except OSError as e:
                        print(f"Paper {paper_id} could not be listed.")
                        print(f"Error Type: {type(e).__name__}")
                        error_message = str(e)[:100]
                        print(f"Error Message: {error_message}")
                        dir_entries = []

                    tables_in_source_dir = [x for x in dir_entries if x.startswith("FR_TAB_") and x.endswith(".tex")]
                    for table in tables_in_source_dir:
                        # NOTE(review): the stem of a "FR_TAB_*.tex" file is
                        # compared with the ids from the table metadata -
                        # confirm both use the same naming, otherwise every
                        # such file is deleted here.
                        table_id = table.replace(".tex", "")
                        if table_id not in actual_tables:
                            try:
                                os.remove(source_dir + paper_id + "/" + table)
                            except Exception as e:
                                print(f"Error Type: {type(e).__name__}")
                                error_message = str(e)[:100]
                                print(f"Error Message: {error_message}")

                csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_file, processed_metadata_file)
                
# Run the update: deletes files below source_dir and rewrites processed_metadata_file
update_processed_metadata()

In [5]:
# Removes table images that were filtered before
def remove_table_images():
    """Delete table images that are no longer listed in the table metadata.

    Reads the table ids (first column) from ``table_metadata_file`` and
    removes every ``<id>.png`` in ``image_table_dir`` whose id is not among
    them. Entries that do not end in ``.png`` are left untouched. Deletion
    errors are printed and do not stop the run; the number of deleted images
    is printed at the end.
    """
    # Get table list from metadata file
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as metadata_file:
        csv_reader = csv.reader(metadata_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        # Blank lines are parsed as empty rows and carry no table id.
        known_tables = {row[0] for row in csv_reader if row}

    counter_delete = 0

    # Delete table image if it is not part of the metadata file
    for image_file in os.listdir(image_table_dir):
        # Images are stored as "<table id>.png". Stripping ".tex" (as done
        # before) never changed the name, so no image ever matched the
        # metadata and every image was deleted.
        if not image_file.endswith(".png"):
            continue
        table_id = image_file[:-len(".png")]
        if table_id in known_tables:
            continue

        try:
            os.remove(image_table_dir + image_file)
            counter_delete += 1
        except Exception as e:
            print(f"Error Type: {type(e).__name__}")
            error_message = str(e)[:100]
            print(f"Error Message: {error_message}")

    print(f"{counter_delete} table images have been deleted.")

In [6]:
# Run the clean-up: deletes files in image_table_dir that are not listed in the table metadata
remove_table_images()

['2307.16446_TAB_1', '2309.07322_TAB_1', '2309.07322_TAB_2', '2309.14857_TAB_1', '2309.14857_TAB_2', '2309.14857_TAB_5', '2309.14857_TAB_6', '2309.14857_TAB_7', '2309.14857_TAB_8', '2309.14857_TAB_9', '2309.14857_TAB_10', '2310.00503_TAB_1', '2310.00503_TAB_2', '2310.00503_TAB_3', '2310.00503_TAB_4', '2310.00503_TAB_5', '2310.00503_TAB_6', '2312.10237_TAB_2', '2401.13246_TAB_1', '2401.13246_TAB_4', '2401.13246_TAB_5', '2402.06778_TAB_1', '2402.06778_TAB_2', '2402.06778_TAB_3', '2402.06778_TAB_4', '2402.06778_TAB_5', '2402.06778_TAB_6', '2402.06778_TAB_8', '2402.06778_TAB_9', '2402.06778_TAB_10', '2402.07309_TAB_1', '2402.07309_TAB_5', '2402.11719_TAB_1', '2402.11719_TAB_2', '2402.11719_TAB_3', '2402.11719_TAB_6', '2403.01432_TAB_11', '2403.01432_TAB_12', '2403.01432_TAB_13', '2403.01432_TAB_15', '2403.01432_TAB_16', '2403.01432_TAB_17', '2403.01432_TAB_19', '2403.01432_TAB_20', '2403.01432_TAB_21', '2403.01432_TAB_22', '2403.01432_TAB_23', '2403.14484_TAB_1', '2403.14484_TAB_2', '2403.

0