In [None]:
from pdf2image import convert_from_path
import csv
import glob
import os
import shutil
import time
import subprocess

In [None]:
# Locations of the dataset artefacts. Every directory name keeps its trailing
# slash because the values are combined by plain string concatenation.
table_dir = "extracted_tables/"
table_metadata_file = f"{table_dir}tables.csv"
output_dir = f"{table_dir}table_images/"
code_table_dir = f"{table_dir}table_code/"

# Upper bound (in seconds) passed as the timeout of a compilation run
MAX_COMPILING_TIME = 90

# Make sure the image directory exists before anything is written into it
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Returns a list of tables for which there does not exist an image file yet
def get_list_of_unprocessed_tables():
    """
    Collect the ids of all tables in the metadata file that have no image yet.

    A table counts as processed when ``output_dir`` contains a file named
    ``<table_id>.png``. The table id is the first column of a metadata row.

    Returns:
        list[str]: ids of the tables without an image, in metadata-file order.
    """
    # Only real .png files mark a table as processed; other directory entries
    # (leftover .aux/.log files, sub-directories) are ignored. A set keeps the
    # membership test in the comprehension below constant-time.
    processed_tables = {
        os.path.splitext(file_name)[0]
        for file_name in os.listdir(output_dir)
        if file_name.endswith(".png")
    }

    # The context manager closes the metadata file even if parsing fails.
    # NOTE(review): remove_from_dataset rewrites this file with quotechar='|'
    # while it is read here with quotechar='"' - confirm which dialect is intended.
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as table_metadata:
        metadata_reader = csv.reader(table_metadata, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)
        # Blank lines are parsed as empty rows, which carry no id to inspect
        return [row[0] for row in metadata_reader if row and row[0] not in processed_tables]


# Removes a subset of tables from the dataset (including table_image, table_code)
def remove_from_dataset(table_subset):
    """
    Delete the given tables from the dataset.

    For every id the ``.tex`` file in ``code_table_dir`` and the ``.png`` file
    in ``output_dir`` are deleted when they exist, and the matching row (id in
    the first column) is dropped from the metadata CSV.

    Args:
        table_subset: iterable of table ids to remove. When it is empty the
            function returns without touching any file.
    """
    # A set gives constant-time lookups while the metadata rows are filtered
    removed_ids = set(table_subset)
    if not removed_ids:
        # Nothing to delete, so the metadata file does not need to be rewritten
        return

    for table in removed_ids:
        # The header comment promises that the code *and* the image are removed
        for file_path in (code_table_dir + table + ".tex", output_dir + table + ".png"):
            if os.path.isfile(file_path):
                os.remove(file_path)

    # Remove filtered tables from metadata file
    # NOTE(review): get_list_of_unprocessed_tables reads the same file with
    # quotechar='"' and QUOTE_ALL - confirm which dialect the file really uses.
    tmp_metadata_file = table_dir + "tmp.csv"
    with open(table_metadata_file, "r", newline='', encoding='utf-8') as input_file, \
            open(tmp_metadata_file, "w", newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # Only writes rows of not deleted tables
        for row in csv_reader:
            # Blank lines are parsed as empty rows; they are copied over as-is
            # instead of failing on row[0]
            if row and row[0] in removed_ids:
                continue
            csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_metadata_file, table_metadata_file)
    
    
# Deletes the auxiliary compilation files of one table from the output directory
def remove_compiling_files(table_id):
    """
    Remove the compilation by-products that belong to ``table_id``.

    Each path ``output_dir + table_id + suffix`` with a suffix of .aux, .pdf,
    .out, .log or .spl is deleted when it is an existing file. Files with any
    other suffix are left alone.

    Args:
        table_id: id of the table whose by-products are cleaned up.
    """
    base_path = output_dir + table_id
    candidates = (base_path + suffix for suffix in (".aux", ".pdf", ".out", ".log", ".spl"))

    # Paths that do not point to an existing file are silently skipped
    for leftover in filter(os.path.isfile, candidates):
        os.remove(leftover)

In [None]:
"""
Converting latex code tables to png tables
unprocessed_tables - list of table ids where table images have to be generated
amount - number of tables that should be processed
"""
def generate_table_images(unprocessed_tables, amount):
    """
    Render LaTeX table sources to PNG images.

    For each selected table id, ``<code_table_dir><id>.tex`` is compiled with
    pdflatex into ``output_dir``, the first page of the resulting PDF is saved
    as ``<output_dir><id>.png`` and its surrounding whitespace is trimmed.
    Tables whose source file is missing are reported and skipped. Tables for
    which compilation or conversion raises are collected and finally handed to
    ``remove_from_dataset``.

    Args:
        unprocessed_tables: list of table ids that still need an image.
        amount: maximum number of tables to process, taken from the start of
            the list. Values larger than the list length process the whole
            list; zero or negative values process nothing.
    """
    faulty_tables = []

    # Slicing (instead of indexing up to `amount`) copes with `amount` being
    # larger than the list; max() keeps a negative amount at "process nothing".
    for table_id in unprocessed_tables[:max(amount, 0)]:
        print(table_id)
        table_file = code_table_dir + table_id + ".tex"

        if not os.path.isfile(table_file):
            print(f"{table_file} was not found.")
            continue

        try:
            # Using pdflatex to generate pdf file. Its console output is not
            # needed, so it is discarded instead of being decoded as text: a
            # byte in the log that is invalid in the locale encoding would
            # otherwise raise and wrongly flag the table as faulty.
            subprocess.run(
                ["pdflatex", "-interaction=nonstopmode", f"-output-directory={output_dir}", table_file],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=MAX_COMPILING_TIME
            )

            # Converting pdf file to an image (only the first page is kept)
            pdf_file = output_dir + table_id + ".pdf"
            png_file = output_dir + table_id + ".png"
            images = convert_from_path(pdf_file)
            images[0].save(png_file, "PNG")

            # Trimming whitespace
            _trim_whitespace(png_file)

        except Exception as e:
            faulty_tables.append(table_id)
            print(f"Error Type: {type(e).__name__}")

        finally:
            # Remove compiling files (such as .aux and .out) on success and failure
            remove_compiling_files(table_id)

    # Removing faulty tables from dataset
    remove_from_dataset(faulty_tables)


def _trim_whitespace(png_file):
    """Trim the border of ``png_file`` in place with ImageMagick (best effort)."""
    try:
        # An argument list avoids the shell, so unusual characters in the path
        # cannot break or alter the command
        subprocess.run(["magick", png_file, "-trim", png_file])
    except OSError as error:
        # A missing or unusable ImageMagick must not mark the table as faulty;
        # the untrimmed image is kept
        print(f"Trimming {png_file} failed: {error}")

In [None]:
# Collect the ids of all tables from the metadata file that do not have an
# image yet, and report how many there are
unprocessed_tables = get_list_of_unprocessed_tables()
print(f"Length of unprocessed tables: {len(unprocessed_tables)}")

In [None]:
# How many of the pending tables are rendered in this run
number_of_tables_to_process = 10

# Render the images for the first `number_of_tables_to_process` pending tables
generate_table_images(
    unprocessed_tables=unprocessed_tables,
    amount=number_of_tables_to_process,
)

In [None]:
# Give every .txt table source in code_table_dir a .tex extension.
# NOTE(review): generate_table_images looks for "<id>.tex" files, so this
# conversion presumably has to run before the generation cell - confirm the
# intended cell order.
for txt_file in os.listdir(code_table_dir):
    base_name, extension = os.path.splitext(txt_file)

    # Only real .txt files are renamed; other entries (existing .tex files,
    # sub-directories) are left alone, and a ".txt" occurring in the middle of
    # a name is no longer rewritten
    if extension != ".txt":
        continue

    os.rename(code_table_dir + txt_file, code_table_dir + base_name + ".tex")

print("All tables converted to tex files.")