In [1]:
import os
import re
import csv
import uuid
import glob
import shutil

In [2]:
# Variables

# Input: every sub-directory of this folder is treated as one paper.
source_dir = "source_files/"

# Table outputs: extracted table code, the document headers, and the result CSV.
table_output_dir = "extracted_tables/"
table_code_dir = f"{table_output_dir}table_code/"
table_header_dir = f"{table_output_dir}table_header/"
table_result_file = "tables_regex.csv"

# Figure outputs: copied graphic files and the result CSV.
figure_output_dir = "extracted_figures/"
figure_result_file = "figures_regex.csv"

# Suffixes tried when a referenced graphic does not exist under its literal name.
possible_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".eps"]

# Make sure every output directory exists before anything is written to it.
for output_dir in (table_output_dir, table_header_dir, table_code_dir, figure_output_dir):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Reset directories
# Remove everything a previous run left in the output folders so that this
# run starts from a clean slate.
for stale_dir in (table_code_dir, table_header_dir, figure_output_dir):
    for f in glob.glob(stale_dir + "*"):
        os.remove(f)

# Remove the result CSVs of a previous run as well, if they are still present.
for stale_csv in (
    table_output_dir + table_result_file,
    figure_output_dir + figure_result_file,
):
    if os.path.isfile(stale_csv):
        os.remove(stale_csv)

In [8]:
# TODO: Save paper name and authors as metadata
# TODO: Collect the paragraphs that \ref a table's \label (not implemented yet).

# Prefix of the stand-alone table documents this cell writes into each paper
# folder. They must be skipped when collecting a paper's LaTeX sources,
# otherwise every re-run would re-extract the tables from its own output.
generated_table_prefix = "FR_TAB_"


def _extract_caption(environment_tex):
    r"""Return the argument of the first ``\caption{...}`` in a LaTeX snippet.

    Braces are matched by nesting depth, so nested groups (``\textbf{...}``)
    and captions spanning several lines are returned in full; a backslash
    escapes the following character (``\{``, ``\}``). If the braces never
    balance, fall back to the text up to the first ``}`` on the same line.

    Returns ``None`` when no caption can be found.
    """
    opening = re.search(r"\\caption\{", environment_tex)
    if opening is None:
        return None

    depth = 1
    pos = opening.end()
    while pos < len(environment_tex):
        char = environment_tex[pos]
        if char == "\\":
            # Skip the escaped character so "\}" does not close the group.
            pos += 2
            continue
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return environment_tex[opening.end():pos]
        pos += 1

    # Unbalanced braces: best-effort single-line match.
    fallback = re.search(r"\\caption\{(.*?)}", environment_tex)
    return fallback.group(1) if fallback else None


def _resolve_graphic(paper_path, graphic_ref):
    r"""Return the path of an ``\includegraphics`` target inside *paper_path*.

    The literal name is tried first, then the name with each entry of
    ``possible_extensions`` appended. If nothing exists, the literal path is
    returned so the caller's copy attempt reports the missing file.
    """
    candidate = paper_path + "/" + graphic_ref
    if os.path.isfile(candidate):
        return candidate
    for ext in possible_extensions:
        if os.path.isfile(candidate + ext):
            return candidate + ext
    return candidate


# The `with` block guarantees both CSV files are flushed and closed even if
# processing a paper raises.
with open(table_output_dir + table_result_file, 'w', newline='', encoding="utf-8") as csvfile_table, \
     open(figure_output_dir + figure_result_file, 'w', newline='', encoding="utf-8") as csvfile_figure:
    spamwriter_table = csv.writer(csvfile_table, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)
    spamwriter_figure = csv.writer(csvfile_figure, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)

    # Sorted for a reproducible processing order across runs and platforms.
    for paper in sorted(os.listdir(source_dir)):
        paper_path = source_dir + paper
        if not os.path.isdir(paper_path):
            continue

        print("\n\n" + paper)
        tex_files = sorted(
            x for x in os.listdir(paper_path)
            if x.endswith('.tex') and not x.startswith(generated_table_prefix)
        )

        # Concatenate all LaTeX sources of the paper into one string.
        complete_tex = ""
        for file in tex_files:
            try:
                with open(paper_path + "/" + file, "r", encoding="utf8") as f:
                    complete_tex += f.read()
            except UnicodeDecodeError:
                print("UnicodeDecodeError occurred. File could not be loaded.")
                continue

        # Preamble = first \documentclass up to the first \begin{document};
        # the class options in [...] are optional in LaTeX.
        found_document_header = re.search(
            r"\\documentclass.*?\\begin\{document\}", complete_tex, re.DOTALL
        )
        paper_id = str(uuid.uuid4())
        if not found_document_header:
            print("Document header could not be identified.")
            continue

        header_tex = found_document_header.group(0)
        with open(table_header_dir + paper_id + ".txt", "w", encoding="utf-8") as f:
            f.write(header_tex)

        found_tables = re.findall(r"\\begin\{table\*?\}.*?\\end\{table\*?\}", complete_tex, re.DOTALL)
        found_figures = re.findall(r"\\begin\{figure\*?\}.*?\\end\{figure\*?\}", complete_tex, re.DOTALL)

        for table in found_tables:
            found_caption = _extract_caption(table)
            if found_caption is None:
                # Tables without a caption are not recorded.
                continue
            table_id = str(uuid.uuid4())

            # Store as txt file in own data collection
            spamwriter_table.writerow([table_id, paper_id, found_caption])
            with open(table_code_dir + table_id + ".txt", "w", encoding="utf-8") as f:
                f.write(table)

            # Store as tex file in folder of paper: the paper's preamble plus
            # this single table, as a document of its own.
            table_file_path = paper_path + "/" + generated_table_prefix + table_id + ".tex"
            print(table_file_path)
            table_tex_code = header_tex + "\n\\pagenumbering{gobble}\n" + table + "\n\\end{document}"
            with open(table_file_path, "w", encoding="utf-8") as f:
                f.write(table_tex_code)

        for figure in found_figures:
            # Each hit is (optional [..] arguments, file reference).
            found_graphics = re.findall(r"\\includegraphics(\[.*?\])*\{(.*?)\}", figure)
            found_caption = _extract_caption(figure)
            if found_caption is None:
                continue

            for graphic in found_graphics:
                figure_id = str(uuid.uuid4())

                graphic_path = _resolve_graphic(paper_path, graphic[1].strip())
                # Take the suffix from the resolved file so graphics referenced
                # without an extension keep their real type in the output.
                file_type = os.path.splitext(graphic_path)[-1]

                try:
                    shutil.copy(graphic_path, figure_output_dir + figure_id + file_type)
                    spamwriter_figure.writerow([figure_id, paper_id, found_caption])
                except FileNotFoundError as e:
                    print(f"File not found: {graphic_path} - {e}")
                except Exception as e:
                    # Best effort: one bad graphic must not abort the whole run.
                    print(f"An error occurred: {e}")

        print(f"{len(found_tables)} tables found.")
        print(f"{len(found_figures)} figures found.")



2001.00116v2
source_files/2001.00116v2/FR_TAB_96299615-c9bb-493b-9e27-b936a8e07dc4.tex
source_files/2001.00116v2/FR_TAB_b5118576-551e-4d0a-8c4f-a0bf13abbccf.tex
source_files/2001.00116v2/FR_TAB_fe0042d3-7cd9-451e-b3a7-db4fc830343a.tex
source_files/2001.00116v2/FR_TAB_58224619-edf5-4a4f-aa8a-8855481985d9.tex
source_files/2001.00116v2/FR_TAB_b9314ff7-75db-466a-9044-a5d099b45283.tex
source_files/2001.00116v2/FR_TAB_af591c7c-75eb-46b4-b5ae-436869d25097.tex
source_files/2001.00116v2/FR_TAB_6461ab3d-0b39-4ada-bd45-f156cd983176.tex
source_files/2001.00116v2/FR_TAB_90d0f3e9-eeff-482b-8b71-d906326e050a.tex
8 tables found.
9 figures found.


2001.00117v1
0 tables found.
9 figures found.


2001.00119v2
Document header could not be identified.


2001.00120v1
0 tables found.
0 figures found.


2001.00122v1
source_files/2001.00122v1/FR_TAB_b888d5f8-c994-4718-a418-dace50a9058f.tex
source_files/2001.00122v1/FR_TAB_0bcc6f00-dc24-4484-80f1-54639ead8362.tex
source_files/2001.00122v1/FR_TAB_90eca622-16c

source_files/2001.00133v2/FR_TAB_fb774b01-edfa-4766-8e2e-c564b925609e.tex
source_files/2001.00133v2/FR_TAB_f0135353-11d8-4927-9103-69bbce911b7e.tex
source_files/2001.00133v2/FR_TAB_7e8f9c1e-c86c-4407-84a5-ec61fe188dbe.tex
source_files/2001.00133v2/FR_TAB_ef2e3c4e-c88d-4e90-97da-cad0dc749cd3.tex
source_files/2001.00133v2/FR_TAB_2fa331df-79a3-4411-84ab-9b36dfb66030.tex
source_files/2001.00133v2/FR_TAB_99def575-605e-46c2-b2d5-280d8b0abd99.tex
source_files/2001.00133v2/FR_TAB_df18facd-2d7b-4a79-bb76-9024695c3806.tex
source_files/2001.00133v2/FR_TAB_b3667437-3f90-45fb-9a87-6a5afa099f40.tex
source_files/2001.00133v2/FR_TAB_b548f1a9-2a70-4da1-972d-11941ce8ccfa.tex
source_files/2001.00133v2/FR_TAB_62aee04f-c73a-4424-9617-5f7e2a4ede41.tex
source_files/2001.00133v2/FR_TAB_60d2beaa-28fa-457c-afc4-f8953c786e75.tex
source_files/2001.00133v2/FR_TAB_9ce2f646-c8dd-4060-a83e-a2a8c4d38913.tex
source_files/2001.00133v2/FR_TAB_71649b94-48dd-454c-a6dc-35eae4c24d96.tex
source_files/2001.00133v2/FR_TAB_0bbdd

source_files/2001.00133v2/FR_TAB_30678a46-185a-426e-873a-031fd033fbc5.tex
source_files/2001.00133v2/FR_TAB_4858c2da-9481-43a1-ba36-1c7ac4de7049.tex
source_files/2001.00133v2/FR_TAB_3bd92c7e-5376-4841-8b61-29797599495f.tex
source_files/2001.00133v2/FR_TAB_76a83df0-6df0-499a-9637-1428d1197f18.tex
source_files/2001.00133v2/FR_TAB_6ec7c6cd-fa96-46bd-9dee-0d859a75b6a4.tex
source_files/2001.00133v2/FR_TAB_385e6e0f-7557-4d8a-8e6a-ac9b0d872e08.tex
source_files/2001.00133v2/FR_TAB_3a17f10d-8e7d-4888-b83d-132f0cfa0dc6.tex
source_files/2001.00133v2/FR_TAB_fa1f0ce6-1f89-45b4-9be5-605be2977f6d.tex
source_files/2001.00133v2/FR_TAB_dcaec44e-d806-4aee-86dd-d0f1d65c4ed9.tex
source_files/2001.00133v2/FR_TAB_eff9d218-1680-4c7e-8ba9-50575e1e4dc5.tex
source_files/2001.00133v2/FR_TAB_9bab96a6-d5d3-4044-baec-3ce9d7a79a9f.tex
source_files/2001.00133v2/FR_TAB_b0ac98ac-8bd0-45c8-8252-0b6660004703.tex
source_files/2001.00133v2/FR_TAB_f11b11ae-0a86-49b6-a46b-b7e193d5cb00.tex
source_files/2001.00133v2/FR_TAB_5ced7

source_files/2001.00137v2/FR_TAB_d4abb927-665c-4298-acdb-71cca55affc3.tex
source_files/2001.00137v2/FR_TAB_34de67b2-6bfa-459c-92cb-6e384103c27f.tex
source_files/2001.00137v2/FR_TAB_a1b1d9f3-9b3c-4a85-928c-7fcad12aba74.tex
source_files/2001.00137v2/FR_TAB_0ffa4298-2ddb-40e2-b905-720a772ba32b.tex
source_files/2001.00137v2/FR_TAB_d7a0706a-e9ee-44c5-b458-b0a7665d8996.tex
source_files/2001.00137v2/FR_TAB_4f361251-548b-42f5-9633-4c51bc863c64.tex
source_files/2001.00137v2/FR_TAB_87ba9c2f-804b-4761-988f-f83d419a3b99.tex
source_files/2001.00137v2/FR_TAB_8b952e3d-9257-4d78-af29-81c2e1306ac8.tex
source_files/2001.00137v2/FR_TAB_b426a044-0a4e-4516-b2f8-f62852c8d1b9.tex
source_files/2001.00137v2/FR_TAB_4bc0b569-915d-461a-8783-dc3cd7befbc1.tex
source_files/2001.00137v2/FR_TAB_c254e673-57f5-4c14-96cb-966555d1c68b.tex
source_files/2001.00137v2/FR_TAB_db58751f-e0e2-4876-a389-7d13dfc34a01.tex
source_files/2001.00137v2/FR_TAB_312d481b-48e4-4bf8-abd3-3f7fb68d3a64.tex
source_files/2001.00137v2/FR_TAB_3d629

4 tables found.
11 figures found.


2001.00179v3
source_files/2001.00179v3/FR_TAB_130aca22-2a19-455b-8f1d-e2d3ea7b7f38.tex
source_files/2001.00179v3/FR_TAB_6e7c79d9-1eff-4556-88a7-48882bffd2aa.tex
source_files/2001.00179v3/FR_TAB_cf156c44-ae29-4d35-b46f-f70943ec902b.tex
source_files/2001.00179v3/FR_TAB_df8f87c2-e0c2-454b-8702-dd3932b0f774.tex
source_files/2001.00179v3/FR_TAB_574b0ac6-bf98-4be8-b105-571f053e4488.tex
source_files/2001.00179v3/FR_TAB_7c1b5d67-f24e-4b22-809e-e75778fca9c7.tex
source_files/2001.00179v3/FR_TAB_e6b1fc72-6df6-414a-8ed6-4f2b5cbba6b2.tex
source_files/2001.00179v3/FR_TAB_e7aa2335-dca3-49af-be96-5e10f5c330ca.tex
source_files/2001.00179v3/FR_TAB_1897f71f-874a-4fff-b5e4-a6fda46d0147.tex
source_files/2001.00179v3/FR_TAB_c4a1b0b3-d428-4362-ba41-81563c577245.tex
source_files/2001.00179v3/FR_TAB_b3f77e91-feb6-48ce-a0a6-fb3658393ecb.tex
source_files/2001.00179v3/FR_TAB_d0ac0672-1e9b-430d-b10c-d014bd114443.tex
12 tables found.
3 figures found.


2001.00181v2
Document heade

In [None]:
# Close both result CSV handles; calling close() on an already closed file is a no-op.
for result_handle in (csvfile_table, csvfile_figure):
    result_handle.close()