In [1]:
import os
import re
import csv
import uuid
import glob
import shutil

In [3]:
# Variables
# All paths are relative to the notebook's working directory and are combined
# by plain string concatenation, so every directory constant must end in "/".
source_dir = "source_files/"
source_metadata_file = source_dir + "papers.csv"
processed_metadata_file = source_dir + "papers_processed.csv"

table_output_dir = "extracted_tables/"
table_code_dir = table_output_dir + "table_code/"
table_result_file = table_output_dir + "tables.csv"

figure_output_dir = "extracted_figures/"
figure_result_file = figure_output_dir + "figures.csv"

# Extensions tried, in this order, when \includegraphics names a file without one
possible_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".eps"]

# Create every directory a file is written into. source_dir is included because
# papers_processed.csv lives there; without it the file creation below raises
# FileNotFoundError on a fresh checkout.
for directory in (source_dir, table_output_dir, table_code_dir, figure_output_dir):
    os.makedirs(directory, exist_ok=True)

# Make sure the result files exist (empty) so later cells can open them for
# reading or appending. Existing files are left untouched.
for result_file in (table_result_file, figure_result_file, processed_metadata_file):
    if not os.path.isfile(result_file):
        with open(result_file, "w"):
            pass

In [None]:
# Reset directories and csv files
# WARNING: This deletes all collected tables and figures

# Remove every entry directly inside the table-code and figure output folders
for pattern in (table_code_dir + "*", figure_output_dir + "*"):
    for stale_path in glob.glob(pattern):
        os.remove(stale_path)

# The figure result file lives in the folder that was just emptied: recreate it
with open(figure_result_file, "w"):
    pass

# The remaining csv files are only replaced by empty ones if they already exist
for csv_path in (table_result_file, processed_metadata_file):
    if os.path.isfile(csv_path):
        os.remove(csv_path)
        with open(csv_path, "w"):
            pass

In [None]:
def get_unprocessed_papers():
    """
    Returns a list of papers that were downloaded but not yet processed.

    Reads the source metadata csv (source_metadata_file) and the processed
    metadata csv (processed_metadata_file). A paper counts as processed when the
    first column of its row also appears as a first column in the processed file.

    Returns:
        list[list[str]]: the complete rows of the source csv, in file order,
        whose first column is not listed in the processed csv. Blank lines in
        either file are ignored.

    Raises:
        OSError: if one of the two csv files cannot be opened.
    """
    # Both files are opened in one `with` so they are closed even if parsing fails.
    # The processed file is read as UTF-8 because that is how it is written;
    # the encoding of the source csv is not visible here, so the platform
    # default is kept for it.
    with open(source_metadata_file, "r", newline="") as csvfile_source, \
            open(processed_metadata_file, "r", newline="", encoding="utf-8") as csvfile_processed:
        source_reader = csv.reader(csvfile_source, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)
        processed_reader = csv.reader(csvfile_processed, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)

        # csv.reader yields [] for blank lines; `if row` keeps row[0] from raising.
        # A set makes the membership test below constant-time.
        processed_ids = {row[0] for row in processed_reader if row}

        return [row for row in source_reader if row and row[0] not in processed_ids]

In [None]:
def _read_tex_sources(paper_path):
    """Return (tex_files, complete_tex) for one paper folder.

    tex_files lists every ``.tex`` file found recursively below paper_path whose
    name does not start with ``FR_`` (the prefix this notebook uses for the
    standalone table files it writes itself). complete_tex is the concatenation
    of all of those files that could be decoded as UTF-8.
    """
    tex_files = [os.path.join(root, file)
                 for root, dirs, files in os.walk(paper_path)
                 for file in files if file.endswith(".tex") and not file.startswith("FR_")]

    contents = []
    for file in tex_files:
        try:
            # `with` closes the handle even when decoding fails half-way
            with open(file, "r", encoding="utf8") as f:
                contents.append(f.read())
        except UnicodeDecodeError:
            print("UnicodeDecodeError occurred. File could not be loaded.")
    return tex_files, "".join(contents)


def _find_mentions(label, complete_tex):
    """Return every source line of complete_tex that references label via a ref command."""
    return re.findall(fr".*\\ref\{{{re.escape(label)}}}.*", complete_tex)


def _append_csv_row(csv_path, fields):
    """Append one semicolon-separated record to csv_path.

    Fields containing a semicolon, a double quote or a line break are quoted, so
    such a caption or paragraph can no longer shift the columns of the record.
    Fields without these characters are written exactly as plain text.
    """
    with open(csv_path, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=";", quotechar='"',
                            quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
        writer.writerow(fields)


def _resolve_graphic(paper_path, graphic_path):
    """Locate the file named in an includegraphics command.

    Returns (path_on_disk, extension), or None when neither the path as written
    nor the path with one of possible_extensions appended exists.
    """
    candidate = paper_path + "/" + graphic_path
    if os.path.isfile(candidate):
        return candidate, os.path.splitext(graphic_path)[-1]
    for ext in possible_extensions:
        if os.path.isfile(candidate + ext):
            return candidate + ext, ext
    return None


def extract_data_from_papers(unprocessed_papers, amount):
    """
    Extracting tables and figures from the downloaded papers in the source directory

    For each of the first `amount` rows the paper folder source_dir + row[0] is
    scanned. Every table/figure environment that has a caption, a label and at
    least one line referencing that label is recorded:

    * tables: a record in table_result_file, the raw code as a .txt file in
      table_code_dir and a standalone FR_TAB_*.tex file in the paper folder
    * figures (only environments with exactly one includegraphics): a copy of
      the graphic in figure_output_dir and a record in figure_result_file

    finish_process is called once per paper with the number of collected tables
    and figures, or with -1/-1 when the paper folder does not exist.

    Args:
        unprocessed_papers: list of metadata rows; row[0] is the paper id.
        amount: maximum number of rows to process. Values larger than the list
            length process the whole list, values <= 0 process nothing.
    """
    # Slicing (instead of indexing range(amount)) tolerates amount > len(list);
    # max() keeps a negative amount from slicing from the end.
    for row in unprocessed_papers[:max(amount, 0)]:
        paper_id = row[0]
        paper_path = source_dir + paper_id

        if not os.path.isdir(paper_path):
            print(f"Paper {paper_id} could not be found.")
            finish_process(row, -1, -1)
            continue

        print("\n\n" + paper_id)
        tex_files, complete_tex = _read_tex_sources(paper_path)
        print(tex_files)

        found_tables = re.findall(r"\\begin\{table\*?\}.*?\\end\{table\*?\}", complete_tex, re.DOTALL)
        found_figures = re.findall(r"\\begin\{figure\*?\}.*?\\end\{figure\*?\}", complete_tex, re.DOTALL)
        counter_table = 0
        counter_figure = 0

        # Capture document preamble. If it can't be captured, skip table capturing.
        # NOTE(review): the pattern requires "[" directly after \documentclass, so
        # papers without class options never yield tables - confirm this is intended.
        found_document_header = re.search(r"\\documentclass\[.*?\\begin\{document\}", complete_tex, re.DOTALL)
        if found_document_header is None:
            print("Document header could not be identified.")
            found_tables = []

        # Table capturing
        for table in found_tables:
            label_match = re.search(r"\\label\{(.*?)\}", table)
            # The caption ends at the first "}", so captions with nested braces are truncated
            caption_match = re.search(r"\\caption\{(.*?)}", table)
            if not (label_match and caption_match):
                continue

            # Skipping if no paragraph mention was found
            found_paragraphs = _find_mentions(label_match.group(1), complete_tex)
            if not found_paragraphs:
                continue
            paragraph_mentions = "".join(found_paragraphs)

            found_caption = caption_match.group(1)
            counter_table += 1
            table_id = paper_id + "_TAB_" + str(counter_table)

            # Store meta information in csv file
            _append_csv_row(table_result_file,
                            [table_id, paper_id, "https://arxiv.org/abs/" + paper_id,
                             found_caption, paragraph_mentions])

            # Store as txt file in own data collection
            with open(table_code_dir + table_id + ".txt", "w", encoding="utf-8") as f:
                f.write(table)

            # Store as tex file in folder of paper (backslashes escaped explicitly;
            # "\p" and "\e" are invalid escape sequences in a normal string literal)
            table_tex_code = found_document_header.group(0) + "\n\\pagenumbering{gobble}\n" + table + "\n\\end{document}"
            with open(paper_path + "/FR_TAB_" + table_id + ".tex", "w", encoding="utf-8") as f:
                f.write(table_tex_code)

        # Figure capturing
        for figure in found_figures:
            found_graphics = re.findall(r"\\includegraphics(\[.*?\])*\{(.*?)\}", figure)

            # Skipping multi figures
            if len(found_graphics) != 1:
                continue

            # Caption and label need to be found
            caption_match = re.search(r"\\caption\{(.*?)}", figure)
            label_match = re.search(r"\\label\{(.*?)\}", figure)
            if not (caption_match and label_match):
                continue

            # Skipping if no paragraph mention was found
            found_paragraphs = _find_mentions(label_match.group(1), complete_tex)
            if not found_paragraphs:
                continue
            paragraph_mentions = "".join(found_paragraphs)

            found_caption = caption_match.group(1)

            # Each findall hit is (options, path); skip the figure if the file is missing
            resolved = _resolve_graphic(paper_path, found_graphics[0][1])
            if resolved is None:
                continue
            graphic_path, file_type = resolved

            figure_id = paper_id + "_FIG_" + str(counter_figure + 1)
            try:
                shutil.copy(graphic_path, figure_output_dir + figure_id + file_type)
                _append_csv_row(figure_result_file,
                                [figure_id, paper_id, "https://arxiv.org/abs/" + paper_id,
                                 found_caption, paragraph_mentions])
            except FileNotFoundError as e:
                print(f"File not found: {graphic_path} - {e}")
            except Exception as e:
                print(f"An error occurred: {e}")
            else:
                # Count only figures that were really stored, so a failed copy
                # neither burns an id nor inflates the reported total
                counter_figure += 1

        number_tables = len(found_tables)
        number_figures = len(found_figures)
        print(f"{counter_table}/{number_tables} tables collected.")
        print(f"{counter_figure}/{number_figures} figures collected.")

        finish_process(row, counter_table, counter_figure)
                
# Store meta information in processed_papers csv file and deletes paper from disk if it is no longer needed
def finish_process(row, number_tables, number_figures):
    """Record a paper as processed and delete its folder when no tables were collected.

    Appends the metadata row plus three extra columns (number of tables, number
    of figures, and "True"/"False" for whether tables still await processing)
    to processed_metadata_file. If number_tables <= 0 the folder
    source_dir + row[0] is removed, because no table image extraction is needed.

    Args:
        row: metadata row of the paper; row[0] is the paper id. The list is not
            modified.
        number_tables: number of collected tables (-1 if the paper was not found).
        number_figures: number of collected figures (-1 if the paper was not found).
    """
    # Check if there are tables that still need to be processed (as image file)
    tables_to_be_process = number_tables > 0

    # Write into processed_papers csv file. A new list is built instead of
    # row.extend(...) so the caller's row does not grow on every call.
    # NOTE(review): quotechar '|' differs from the '"' that get_unprocessed_papers
    # reads with; only column 0 is read back there - confirm for other consumers.
    with open(processed_metadata_file, "a", newline='', encoding="utf-8") as csvfile_processed:
        spamwriter = csv.writer(csvfile_processed, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(list(row) + [str(number_tables), str(number_figures), str(tables_to_be_process)])

    if tables_to_be_process:
        return

    # Delete paper if no tables were found (because no table image extraction is needed)
    paper_id = row[0]
    if not paper_id:
        # An empty id would make the path equal to source_dir and wipe every paper
        print("Paper id is empty. Nothing was deleted.")
        return

    paper_path = source_dir + paper_id
    if not os.path.exists(paper_path):
        # Nothing on disk (e.g. the paper was never found): no deletion to attempt
        return

    try:
        shutil.rmtree(paper_path)
    except Exception as e:
        print(f"Paper {row[0]} could not be deleted.")
        print(f"Error Type: {type(e).__name__}")
        error_message = str(e)[:100]
        print(f"Error Message: {error_message}")

In [None]:
# Load the papers that are listed in the source csv but not yet in the
# processed csv, and report how many are waiting
list_unprocessed_papers = get_unprocessed_papers()
unprocessed_count = len(list_unprocessed_papers)
print(f"Number of unprocessed papers: {unprocessed_count}")

In [None]:
# Set how many unprocessed papers should be processed in this run
number_of_papers_to_process = 5

# Re-read the list right before processing: papers handled by an earlier run of
# this cell are already in the processed csv, so re-running the cell continues
# with the next papers instead of repeating the same ones
list_unprocessed_papers = get_unprocessed_papers()

# Never request more papers than are actually available
papers_in_this_run = min(number_of_papers_to_process, len(list_unprocessed_papers))

# Start the processing process, including extraction
extract_data_from_papers(list_unprocessed_papers, papers_in_this_run)