In [None]:
import csv
import glob
import os
import re
import shutil
import subprocess
import time

from pdf2image import convert_from_path

In [None]:
# Locations of the downloaded papers and of the extracted-table dataset.
# Directory names keep their trailing "/" because other cells build paths
# from them by plain string concatenation.
source_dir = "source_files/"
table_dir = "extracted_tables/"

# Per-paper metadata CSV, stored next to the paper sources
processed_metadata_file = os.path.join(source_dir, "papers_processed.csv")

# Files and folders that make up the table dataset
table_metadata_file = os.path.join(table_dir, "tables.csv")
output_dir = os.path.join(table_dir, "table_images/")
code_table_dir = os.path.join(table_dir, "table_code/")

# The image folder has to exist before any table image is written into it
os.makedirs(output_dir, exist_ok=True)

In [None]:
def get_list_of_unprocessed_papers(metadata_file=None):
    """Return the ids of all papers that are still flagged for processing.

    Reads the paper metadata CSV and collects the first column of every row
    whose sixth column is the string "True". Rows with fewer than six columns
    (including blank lines) are skipped.

    Args:
        metadata_file: Path of the CSV file to read. Defaults to the
            notebook-level `processed_metadata_file`.

    Returns:
        List of paper ids (strings) in file order.

    Raises:
        OSError: If the metadata file cannot be opened.
    """
    if metadata_file is None:
        metadata_file = processed_metadata_file

    # `with` closes the file even when parsing fails part-way through.
    with open(metadata_file, "r", newline='', encoding='utf-8') as csvfile_processed:
        # Use the same dialect and encoding as write_changes_into_csvfile, which
        # rewrites this very file with '|' as quote character; reading it with a
        # different quote character would split quoted fields containing ';'.
        csv_reader = csv.reader(csvfile_processed, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        return [row[0] for row in csv_reader if len(row) > 5 and row[5] == "True"]

In [None]:
"""
Converts the LaTeX source of each table into a PNG image.
unprocessed_papers - list of paper ids for which table images have to be generated
amount - number of papers that should be processed
"""
def extract_data_from_papers(unprocessed_papers, amount):
    """Render the table images for the first `amount` unprocessed papers.

    For every paper directory found under `source_dir`, each `FR_TAB_*.tex`
    file is compiled with pdflatex, the first page of the resulting PDF is
    saved as PNG, and ImageMagick writes a whitespace-trimmed copy into
    `output_dir`. The paper directory is deleted afterwards. Tables whose
    rendering raised an exception are passed to `remove_from_dataset`, and
    all handled paper ids (including those whose directory was not found)
    are passed to `write_changes_into_csvfile`.

    Args:
        unprocessed_papers: List of paper ids where table images have to be
            generated.
        amount: Number of papers that should be processed. Values outside
            0..len(unprocessed_papers) are clamped to that range.

    Returns:
        None.
    """
    # Clamp so that a too-large value cannot raise IndexError and a negative
    # value cannot turn the final slice into "everything but the last papers".
    amount = max(0, min(amount, len(unprocessed_papers)))

    faulty_tables = []
    for i in range(amount):
        paper_id = unprocessed_papers[i]
        paper_path = source_dir + paper_id
        print(paper_path)

        # An empty or "." id would resolve to the source directory itself,
        # which the cleanup step below would then delete as a whole.
        is_source_root = os.path.normpath(paper_path) == os.path.normpath(source_dir)
        if is_source_root or not os.path.isdir(paper_path):
            print("Paper could not be found on the disk")
            continue

        # Only the .tex sources are tables; pdflatex leaves .aux/.log/.pdf
        # files with the same prefix in this directory.
        table_files = [
            x for x in os.listdir(paper_path)
            if x.startswith('FR_TAB_') and x.endswith('.tex')
        ]
        # Keep this exact line format: the recovery cell parses it with a regex.
        print(str(i) + ", " + paper_id + ": " + str(len(table_files)) + " tables")

        for tex_name in table_files:
            # NOTE(review): assumes dataset ids are the file stem without the
            # "FR_TAB_" prefix (matching "<id>.png" here and "<id>.txt" in
            # remove_from_dataset) - confirm against tables.csv.
            table_id = os.path.splitext(tex_name)[0].replace("FR_TAB_", "")
            try:
                _render_table_image(paper_path, tex_name, table_id)
            except Exception as e:
                faulty_tables.append(table_id)
                print(f"Error Type: {type(e).__name__}")
                error_message = str(e)[:100]
                print(f"Error Message: {error_message}")

        # Remove the paper from disk
        try:
            shutil.rmtree(paper_path)
        except Exception as e:
            print(f"Paper {paper_id} could not be deleted.")
            print(f"Error Type: {type(e).__name__}")
            error_message = str(e)[:100]
            print(f"Error Message: {error_message}")

    # Removing faulty tables from dataset
    remove_from_dataset(faulty_tables)

    # Writing changes into the csv file
    write_changes_into_csvfile(unprocessed_papers[0:amount])


def _run_command(args):
    """Run an external program without a shell; return its exit status, or None if it could not be started."""
    try:
        return subprocess.run(args).returncode
    except OSError as e:
        print(f"Could not start {args[0]}: {type(e).__name__}")
        return None


def _render_table_image(paper_path, tex_name, table_id):
    """Compile one table .tex file to PDF and write its trimmed first page to output_dir as PNG."""
    stem = os.path.splitext(tex_name)[0]

    # Using pdflatex to generate pdf file. The exit status is not checked here;
    # a failed compilation is expected to surface when the PDF is opened below.
    _run_command([
        "pdflatex",
        "-interaction=nonstopmode",
        f"-output-directory={paper_path}",
        os.path.join(paper_path, tex_name),
    ])

    # Converting pdf file to an image (only the first page is kept)
    images = convert_from_path(os.path.join(paper_path, stem + ".pdf"))
    saved_png_path = os.path.join(paper_path, stem + ".png")
    images[0].save(saved_png_path, "PNG")

    # Trimming whitespace. A failure is reported but, as before, does not mark
    # the table as faulty.
    status = _run_command(["magick", saved_png_path, "-trim", output_dir + table_id + ".png"])
    if status != 0:
        print(f"Trimming failed for {saved_png_path} (exit status: {status})")

# Change to_be_processed_value in the csv paper metadata file
def write_changes_into_csvfile(processed_papers, metadata_file=None):
    """Mark the given papers as processed in the paper metadata CSV.

    Copies the CSV row by row into a temporary file next to it, setting the
    sixth column to "False" for every row whose first column is one of
    `processed_papers`, and then replaces the original file with the copy.
    Blank rows and rows with fewer than six columns are copied unchanged.

    Args:
        processed_papers: Iterable of paper ids (strings) to mark.
        metadata_file: Path of the CSV file to update. Defaults to the
            notebook-level `processed_metadata_file`.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be read, written or replaced.
    """
    if metadata_file is None:
        metadata_file = processed_metadata_file

    # A set makes the per-row membership test independent of the list length.
    processed_ids = set(processed_papers)
    tmp_file = metadata_file + ".tmp"

    with open(metadata_file, "r", newline='', encoding='utf-8') as input_file, \
            open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        for row in csv_reader:
            # The length check keeps blank or truncated rows from raising IndexError.
            if len(row) > 5 and row[0] in processed_ids:
                # Written as a string because readers compare against "True".
                row[5] = "False"
            csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_file, metadata_file)
    
# Removes a subset of tables from the dataset (including table_image, table_code)
def remove_from_dataset(table_subset, metadata_file=None, code_dir=None):
    """Delete the given tables' code files and drop their rows from the table metadata CSV.

    For every id in `table_subset` the file `<id>.txt` in the code directory
    is removed if it exists. The metadata CSV is then rewritten through a
    temporary file, keeping only rows whose first column is not one of the
    given ids. Blank rows are kept as they are. Nothing is touched when
    `table_subset` is empty.

    Args:
        table_subset: Iterable of table ids (strings) to remove.
        metadata_file: Path of the table metadata CSV. Defaults to the
            notebook-level `table_metadata_file`.
        code_dir: Directory holding the `<id>.txt` code files. Defaults to the
            notebook-level `code_table_dir`.

    Returns:
        None.

    Raises:
        OSError: If the metadata file cannot be read, written or replaced.
    """
    if metadata_file is None:
        metadata_file = table_metadata_file
    if code_dir is None:
        code_dir = code_table_dir

    # A set makes the per-row membership test independent of the subset size.
    removed_ids = set(table_subset)
    if not removed_ids:
        # Nothing to remove: skip the pointless rewrite of the metadata file.
        return

    for table in removed_ids:
        # Checking the file itself (rather than listing the directory) also
        # copes with a code directory that has not been created.
        code_path = os.path.join(code_dir, table + ".txt")
        if os.path.isfile(code_path):
            os.remove(code_path)

    # Remove filtered tables from metadata file
    tmp_file = metadata_file + ".tmp"
    with open(metadata_file, "r", newline='', encoding='utf-8') as input_file, \
            open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # Only writes rows of not deleted tables; blank rows have no id and are kept
        for row in csv_reader:
            if not row or row[0] not in removed_ids:
                csv_writer.writerow(row)

    # Replace old csv file with new csv file
    os.replace(tmp_file, metadata_file)

In [None]:
# Load the ids of all papers whose metadata row is still flagged "True"
# and report how many are waiting.
unprocessed_papers = get_list_of_unprocessed_papers()
print(f"Length of unprocessed papers: {len(unprocessed_papers)}")

In [None]:
# Set the maximum number of papers that should be processed in this run
# (meaning extracting table images from them)
max_papers_per_run = 381

# Never request more papers than are actually waiting; indexing past the end
# of the list would otherwise abort the run with an IndexError.
number_of_papers_to_process = min(max_papers_per_run, len(unprocessed_papers))

# Process papers
extract_data_from_papers(unprocessed_papers, number_of_papers_to_process)

In [None]:
# Recovery step for an interrupted run: parse the progress lines printed by
# extract_data_from_papers ("<index>, <paper id>: <n> tables") and mark those
# papers as processed in the metadata file.
#
# `output_message` has to hold the copied log text. It is not assigned in the
# visible cells, so fall back to an empty log instead of failing with a
# NameError when the notebook is run top to bottom.
output_message = globals().get("output_message", "")

# Despite its name, `tables` holds the recovered paper ids.
tables = re.findall(r"\d+, ([0-9\.]*): \d+ tables", output_message)
print(len(tables))

# Only rewrite the metadata file when something was actually recovered.
if tables:
    write_changes_into_csvfile(tables)