In [None]:
import os
import re
import shutil
import csv
from PIL import Image

In [None]:
# File and Directory Paths
# Directory paths keep their trailing slash: the functions below build file
# paths by plain string concatenation (directory + file name).
source_dir = "source_files/"
processed_metadata_file = source_dir + "papers_processed.csv"

table_dir = "extracted_tables/"
table_metadata_file = table_dir + "tables.csv"
code_table_dir = table_dir + "table_code/"
# NOTE(review): image_table_dir is used by filter_table_image and
# remove_from_dataset but was never defined in this notebook, so a fresh
# kernel failed with NameError. The folder name is assumed by analogy with
# "table_code/" -- confirm it matches the real dataset layout.
image_table_dir = table_dir + "table_image/"

In [None]:
# Table image filtering by max height
def filter_table_image(max_height):
    """Collect the names of all table images that are taller than ``max_height``.

    Every entry of ``image_table_dir`` is opened as an image and its height is
    compared against ``max_height``. Entries that cannot be opened are reported
    on stdout and skipped; they are NOT added to the result.

    Args:
        max_height: Largest height an image may have without being filtered.

    Returns:
        List of file names with their extension removed, one per image whose
        height exceeds ``max_height``.
    """
    filtered_tables = []
    for table_image in os.listdir(image_table_dir):
        try:
            # The context manager releases the file handle even when reading
            # the size fails (the manual close() was skipped on errors).
            with Image.open(os.path.join(image_table_dir, table_image)) as img:
                _, height = img.size
        except Exception as e:
            print(f"Error for file {table_image}: {e}")
            continue

        if height > max_height:
            # splitext strips only the real extension, whatever it is;
            # replace(".png", "") also removed ".png" from the middle of a
            # name and left other extensions (e.g. ".jpg") attached.
            filtered_tables.append(os.path.splitext(table_image)[0])

    print(f"{len(filtered_tables)} tables have been filtered.")
    return filtered_tables

In [None]:
# Table code filtering by min rows and min columns
def filter_table_code(min_rows, min_cols):
    """Return the names of code tables that are too small or cannot be parsed.

    Each file in ``code_table_dir`` is measured with ``count_table_dimension``.
    A table is filtered when it has fewer than ``min_rows`` rows or fewer than
    ``min_cols`` columns. A file whose measurement raises is reported on
    stdout and filtered as well.

    Args:
        min_rows: Minimum number of rows a table needs to be kept.
        min_cols: Minimum number of columns a table needs to be kept.

    Returns:
        List of file names with ".txt" removed, one per filtered table.
    """
    rejected = []
    for file_name in os.listdir(code_table_dir):
        table_name = file_name.replace(".txt", "")
        try:
            n_rows, n_cols = count_table_dimension(code_table_dir + file_name)
            too_small = n_rows < min_rows or n_cols < min_cols
        except Exception as e:
            print(f"Error for file {file_name}: {e}")
            # Tables that fail to parse are dropped together with the small ones
            too_small = True

        if too_small:
            rejected.append(table_name)

    print(f"{len(rejected)} tables have been filtered.")
    return rejected

# Determines the number of rows and columns of a latex table based on its code
def count_table_dimension(code_path):
    """Count the rows and columns of the first ``tabular`` environment in a file.

    Rows are the non-empty chunks between ``\\\\`` row terminators. The column
    count is the number of ``&`` separators plus one, taken from the second
    row when there is one, otherwise from the only row.

    Args:
        code_path: Path to a UTF-8 text file containing LaTeX table code.

    Returns:
        Tuple ``(number_rows, number_columns)``; ``(0, 0)`` when the
        environment contains no rows (a message is printed in that case).

    Raises:
        ValueError: If the file contains no ``\\begin{tabular}`` ...
            ``\\end{tabular}`` pair.
    """
    # "with" closes the file even if read() raises
    with open(code_path, "r", encoding="utf8") as code_file:
        latex_code = code_file.read()

    match = re.search(r'\\begin\{tabular\}.*?\\end\{tabular\}', latex_code, re.DOTALL)
    if match is None:
        raise ValueError("No tabular environment found in the code.")

    # Drop the first line (the "\begin{tabular}{...}" header). A tabular
    # written on a single line has no newline to split on; keep it whole
    # instead of failing with IndexError -- the leftover column spec carries
    # no "&" or "\\" and therefore does not disturb the counts.
    first_line, newline, remainder = match.group(0).partition("\n")
    table_content = remainder if newline else first_line

    # Remove latex formatting
    table_content = re.sub(r'\\[a-zA-Z]+\{.*?\}', '', table_content)  # Remove commands like \textbf{}
    # Remove comments. A "%" preceded by an odd number of backslashes is an
    # escaped percent sign ("95\%"), not a comment; treating it as one used to
    # delete the rest of the line including its "&" and "\\". An even run of
    # backslashes (a row terminator directly before the comment) is kept.
    table_content = re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', table_content)
    table_content = re.sub(r"\\(top|mid|bottom)rule", "", table_content)
    table_content = re.sub(r"\\hline", "", table_content)
    table_content = re.sub(r'\n', ' ', table_content)  # Replace newlines with spaces for simpler processing

    # Count rows and columns
    rows = [row.strip() for row in re.split(r'\\\\', table_content) if row.strip()]
    number_rows = len(rows)

    if number_rows > 1:
        # The second row is used rather than the first one
        # (presumably to skip a header row -- TODO confirm)
        number_columns = rows[1].count('&') + 1
    elif number_rows == 1:
        number_columns = rows[0].count('&') + 1
    else:
        number_columns = 0
        print(f"No rows found for {code_path}!")

    return number_rows, number_columns

In [None]:
# Removes a subset of tables from the dataset (including table_image, table_code)
def remove_from_dataset(table_subset):
    """Delete the given tables' files and drop their rows from the metadata CSV.

    For every table name the matching image file(s) in ``image_table_dir`` and
    the ``<name>.txt`` file in ``code_table_dir`` are deleted if present.
    Afterwards ``table_metadata_file`` is rewritten without the rows whose
    first column is one of the removed names.

    Args:
        table_subset: Iterable of table names (file names without extension).
    """
    # A set removes duplicate names (a repeated name used to trigger a second
    # os.remove on an already deleted file) and gives fast membership tests
    # for the metadata pass below.
    removed = set(table_subset)

    table_code_files = set(os.listdir(code_table_dir))

    # Index image files by their name without extension. An exact match on
    # that name cannot pick up another table's image the way the prefix test
    # startswith(table + ".") could ("a" also matched "a.b.png"), and it
    # supports any image extension.
    images_by_table = {}
    for image_file in os.listdir(image_table_dir):
        stem = os.path.splitext(image_file)[0]
        images_by_table.setdefault(stem, []).append(image_file)

    for table in removed:
        # Delete image and code files
        for image_file in images_by_table.get(table, []):
            os.remove(os.path.join(image_table_dir, image_file))

        table_code = table + ".txt"
        if table_code in table_code_files:
            os.remove(os.path.join(code_table_dir, table_code))

    # Remove filtered tables from metadata file
    tmp_metadata_file = table_dir + "tmp.csv"
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as input_file:
        with open(tmp_metadata_file, "w", newline='', encoding='utf-8') as output_file:
            csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            # Only writes rows of not deleted tables
            for row in csv_reader:
                # A blank line is parsed as []; pass it through instead of
                # crashing on row[0].
                if not row or row[0] not in removed:
                    csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_metadata_file, table_metadata_file)

In [None]:
# Max height a table image may have to not be filtered
# (images strictly taller than this value are filtered)
MAX_HEIGHT = 1700

# Filtering table images
filtered_tables = filter_table_image(MAX_HEIGHT)

# Removing them from the dataset
# (deletes the image/code files on disk and rewrites the metadata CSV)
remove_from_dataset(filtered_tables)

In [None]:
# Min rows and columns a latex table should have to not be filtered
MIN_ROWS = 4
MIN_COLUMNS = 4

# Filtering table code
# (tables whose code cannot be parsed are filtered as well)
filtered_tables = filter_table_code(MIN_ROWS, MIN_COLUMNS)

# Removing them from the dataset
# (deletes the image/code files on disk and rewrites the metadata CSV)
remove_from_dataset(filtered_tables)