In [1]:
import os
import re
import csv
import uuid
import glob
import shutil

In [2]:
# Variables
source_dir = "source_files/"

table_output_dir = "extracted_tables/"
table_code_dir = table_output_dir + "table_code/"
table_header_dir = table_output_dir + "table_header/"
table_result_file = "tables_regex.csv"

figure_output_dir = "extracted_figures/"
figure_result_file = "figures_regex.csv"

possible_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".eps"]

os.makedirs(table_output_dir, exist_ok=True)
os.makedirs(table_header_dir, exist_ok=True)
os.makedirs(table_code_dir, exist_ok=True)
os.makedirs(figure_output_dir, exist_ok=True)

In [12]:
# Reset directories
files = glob.glob(table_code_dir + "*")
for f in files:
    os.remove(f)
    
files = glob.glob(table_header_dir + "*")
for f in files:
    os.remove(f)
    
files = glob.glob(figure_output_dir + "*")
for f in files:
    os.remove(f)

if os.path.isfile(table_output_dir + table_result_file):
    os.remove(table_output_dir + table_result_file)
    
if os.path.isfile(figure_output_dir + figure_result_file):
    os.remove(figure_output_dir + figure_result_file)

In [13]:
# TODO: Save paper name and authors as metadata
csvfile_table = open(table_output_dir + table_result_file, 'w', newline='', encoding="utf-8")
spamwriter_table = csv.writer(csvfile_table, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)

csvfile_figure = open(figure_output_dir + figure_result_file, 'w', newline='', encoding="utf-8")
spamwriter_figure = csv.writer(csvfile_figure, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)

for paper in os.listdir(source_dir):
    paper_path = source_dir + paper
    if os.path.isdir(paper_path):
        print("\n\n" + paper)
        tex_files = [x for x in os.listdir(paper_path) if x.endswith('.tex')]

        complete_tex = ""
        for file in tex_files:
            try:
                f = open(paper_path + "/" + file, "r", encoding="utf8")
                complete_tex += f.read()
                f.close()
            except UnicodeDecodeError as e:
                print("UnicodeDecodeError occurred. File could not be loaded.")
                continue

        found_document_header = re.search(r"\\documentclass\[.*\\begin\{document\}", complete_tex, re.DOTALL)
        paper_id = str(uuid.uuid4())
        if found_document_header:
            paper_header_file = table_header_dir + paper_id + ".txt"
            f = open(paper_header_file, "w", encoding="utf-8")
            f.write(found_document_header.group(0))
            f.close()
        else:
            print("Document header could not be identified.")
            continue

        found_tables = re.findall(r"\\begin\{table\*?\}.*?\\end\{table\*?\}", complete_tex, re.DOTALL)
        found_figures = re.findall(r"\\begin\{figure\*?\}.*?\\end\{figure\*?\}", complete_tex, re.DOTALL)

        for table in found_tables:
            #r"\\caption\{(.*?)}"
            #caption_match = re.search(r"\\caption\{(([^{}]*(\{[^{}]*\})?[^{}]*)+)\}", table)
            
            """
            label_match = re.search(r"\\label\{(.*?)\}", table)
            if label_match:
                found_label = label_match.group(1)
                print(found_label)
                found_paragraphs = re.findall(fr".*\\ref\{{{re.escape(found_label)}}}.*", complete_tex)
                for paragraph in found_paragraphs:
                    print(paragraph)
            """
                    
            caption_match = re.search(r"\\caption\{(.*?)}", table)        
            if caption_match:
                found_caption = caption_match.group(1)
                table_id = str(uuid.uuid4())

                # Store as txt file in own data collection
                spamwriter_table.writerow([table_id, paper_id, found_caption])
                table_file_path = table_code_dir + table_id + ".txt"
                f = open(table_file_path, "w", encoding="utf-8")
                f.write(table)
                f.close()
                
                # Store as tex file in folder of paper
                table_file_path = paper_path + "/FR_TAB_" + table_id + ".tex"
                print(table_file_path)
                table_tex_code = found_document_header.group(0) + "\n\pagenumbering{gobble}\n" + table + "\n\end{document}"
                f = open(table_file_path, "w", encoding="utf-8")
                f.write(table_tex_code)
                f.close()

        for figure in found_figures:
            found_graphics = re.findall(r"\\includegraphics(\[.*?\])*\{(.*?)\}", figure)
            #caption_match = re.search(r"\\caption\{(([^{}]*(\{[^{}]*\})?[^{}]*)+)\}", figure)
            caption_match = re.search(r"\\caption\{(.*?)}", figure)
            if caption_match:
                found_caption = caption_match.group(1)
                for graphic in found_graphics:
                    figure_id = str(uuid.uuid4())
                    
                    graphic_path = graphic[1]
                    file_type = os.path.splitext(graphic_path)[-1]
                    if os.path.isfile(paper_path+"/"+graphic_path):
                        graphic_path = paper_path+"/"+graphic_path
                    else:
                        for ext in possible_extensions:
                            possible_path = paper_path+"/"+graphic_path + ext
                            if os.path.isfile(possible_path):
                                graphic_path = possible_path
                                break
                                
                    try:
                        shutil.copy(graphic_path, figure_output_dir+figure_id+file_type)
                        spamwriter_figure.writerow([figure_id, paper_id, found_caption])
                    except FileNotFoundError as e:
                        print(f"File not found: {graphic_path} - {e}")
                    except Exception as e:
                        print(f"An error occurred: {e}")
        
        print(f"{len(found_tables)} tables found.")
        print(f"{len(found_figures)} figures found.")
    #break
    
csvfile_table.close()
csvfile_figure.close()



2001.00116v2
source_files/2001.00116v2/FR_TAB_056610b8-263b-4421-a4b1-27ecea184e54.tex
source_files/2001.00116v2/FR_TAB_1bf1148b-6093-4596-b982-0e6e2b2596e9.tex
source_files/2001.00116v2/FR_TAB_f112a945-4ba0-4a0e-9416-4215e5036729.tex
source_files/2001.00116v2/FR_TAB_90edf805-01c5-499a-809f-84ff92f5b1f4.tex
source_files/2001.00116v2/FR_TAB_88d81d22-3da1-4775-b182-13ed8c728112.tex
source_files/2001.00116v2/FR_TAB_aee2dc5b-4ede-406a-bbbf-29a20be0434f.tex
source_files/2001.00116v2/FR_TAB_cfff4985-23ed-4f9b-b634-44a48b0f1aa2.tex
source_files/2001.00116v2/FR_TAB_bd61a4cd-c64b-47bc-b235-35825da47012.tex
8 tables found.
9 figures found.


2001.00117v1
0 tables found.
9 figures found.


2001.00119v2
Document header could not be identified.


2001.00120v1
0 tables found.
0 figures found.


2001.00122v1
source_files/2001.00122v1/FR_TAB_47159bc6-99b5-45c3-9168-b644a21022da.tex
source_files/2001.00122v1/FR_TAB_6320ae15-1303-41ad-b98e-427b71661d77.tex
source_files/2001.00122v1/FR_TAB_3d8b1fd1-8ab

3 tables found.
7 figures found.


2001.00188v3
source_files/2001.00188v3/FR_TAB_67e503f7-8e0c-46e9-88e4-c69f43189f73.tex
1 tables found.
6 figures found.


2001.00190v2
source_files/2001.00190v2/FR_TAB_a8e9eb03-9db1-4a48-b2c4-61f5ebadc2e4.tex
1 tables found.
4 figures found.


2001.00191v1
UnicodeDecodeError occurred. File could not be loaded.
Document header could not be identified.


2001.00193v1
0 tables found.
9 figures found.


2001.00194v3
0 tables found.
3 figures found.


2001.00195v2
source_files/2001.00195v2/FR_TAB_47b98a34-7c4e-481a-9315-9827cfd6a4e3.tex
File not found: #3 - [Errno 2] No such file or directory: '#3'
File not found: #3 - [Errno 2] No such file or directory: '#3'
1 tables found.
6 figures found.


2001.00196v3
0 tables found.
0 figures found.


2001.00197v1
2 tables found.
21 figures found.


2001.00202v3
0 tables found.
0 figures found.


2001.00203v1
source_files/2001.00203v1/FR_TAB_80b6ba1c-0b81-4b46-84de-2bef3a75daa2.tex
source_files/2001.00203v1/FR_TAB_1

In [None]:
csvfile_table.close()
csvfile_figure.close()