In [1]:
import os
import csv

In [6]:
# File and directory paths.
# Every directory value keeps its trailing "/" because file names are
# appended to it directly (no os.path.join).

# Source papers: directory plus the original and processed metadata CSVs.
source_dir = "source_files/"
source_metadata_file = f"{source_dir}papers.csv"
processed_metadata_file = f"{source_dir}papers_processed.csv"

# Extracted tables: output directory, its two sub-directories and metadata CSV.
table_output_dir = "extracted_tables/"
table_code_dir = f"{table_output_dir}table_code/"
table_images_dir = f"{table_output_dir}table_image/"
table_metadata_file = f"{table_output_dir}tables.csv"

# Extracted figures: output directory and metadata CSV.
figure_output_dir = "extracted_figures/"
figure_metadata_file = f"{figure_output_dir}figures.csv"

# Raise the csv module's maximum field size. The call returns the previous
# limit, and because it is the last expression of the cell that previous
# value is what the notebook displays as the cell output.
csv.field_size_limit(260000)

131072

In [50]:
# Check paper metadata files for errors
def _find_rows_with_unexpected_length(csv_path, expected_length):
    """Return every row of ``csv_path`` whose field count differs from ``expected_length``.

    The file is parsed as a ';'-delimited CSV with '"' as quote character.
    Each offending row is printed as it is found and collected in the
    returned list (in file order).

    Args:
        csv_path: Path of the CSV file to check.
        expected_length: Number of fields a well-formed row must have.

    Returns:
        List of the parsed rows (lists of strings) with an unexpected length.
    """
    bad_rows = []
    # newline='' is what the csv module documentation requires for files
    # handed to csv.reader, and matches how the other cells open these files.
    with open(csv_path, "r", newline='', encoding="utf-8") as paper_metadata:
        # NOTE(review): this check parses with quotechar '"', whereas the cells
        # that rewrite these files use quotechar '|' - confirm which dialect
        # the files are actually written in.
        reader = csv.reader(paper_metadata, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL)
        for row in reader:
            if len(row) != expected_length:
                bad_rows.append(row)
                print(f"Unexpected row length for: {row}")
    return bad_rows


# Original metadata file: a well-formed row has 3 fields.
paper_original_row_length = 3
original_corrupted_papers = _find_rows_with_unexpected_length(
    source_metadata_file, paper_original_row_length
)
print(f"Check for original paper metadata file complete. {len(original_corrupted_papers)} rows were corrupted.\n\n")

# Processed metadata file: a well-formed row has 5 fields.
paper_processed_row_length = 5
processed_corrupted_papers = _find_rows_with_unexpected_length(
    processed_metadata_file, paper_processed_row_length
)
print(f"Check for processed paper metadata file complete. {len(processed_corrupted_papers)} rows were corrupted.")

Check for original paper metadata file complete. 0 rows were corrupted.


Check for processed paper metadata file complete. 0 rows were corrupted.


In [25]:
# Repairing paper titles that include a semicolon.
# Only rows with exactly 4 fields are handled: fields 1 and 2 are merged back
# into a single field (joined with a comma), giving a 3-field row again.
# The repaired rows are keyed by the row's first field.
repaired_papers = {}
for paper_row in original_corrupted_papers:
    # Any other field count cannot be repaired automatically.
    if len(paper_row) != 4:
        raise ValueError(f"Unexpected row length for {paper_row}.")
    first_field, title_head, title_tail, last_field = paper_row
    new_row = [first_field, title_head + "," + title_tail, last_field]
    repaired_papers[first_field] = new_row

print(f"{len(repaired_papers)} rows were fixed.")

42 rows were fixed.


In [26]:
# Write the repaired rows back into both paper metadata files.
def _rewrite_paper_csv(csv_path, fix_row):
    """Rewrite ``csv_path`` row by row and replace the file with the result.

    Every parsed row is passed through ``fix_row`` and the returned row is
    written to a temporary file in ``source_dir``; once all rows are written
    the temporary file replaces ``csv_path``.

    Args:
        csv_path: Path of the ';'-delimited CSV file (quote character '|').
        fix_row: Callable taking a parsed row (list of strings) and returning
            the row to write in its place.
    """
    tmp_path = source_dir + "tmp.csv"
    with open(csv_path, "r", newline='', encoding='utf-8') as input_file, \
            open(tmp_path, "w", newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in csv_reader:
            csv_writer.writerow(fix_row(row))
    # Replace old csv file with new csv file (only reached if no row failed).
    os.replace(tmp_path, csv_path)


def _fix_original_row(row):
    """Return the repaired version of ``row`` if one exists, else ``row`` itself."""
    # A blank line is parsed as an empty row; it has no id and is passed through.
    if row and row[0] in repaired_papers:
        return repaired_papers[row[0]]
    return row


def _fix_processed_row(row):
    """Return the repaired row extended by the last two fields of ``row``.

    Rows without a repaired counterpart (including empty rows) are returned
    unchanged. A row that has a repaired counterpart but is not longer than
    ``paper_processed_row_length`` is reported and returned unchanged.
    """
    if not row or row[0] not in repaired_papers:
        return row
    if len(row) > paper_processed_row_length:
        # The repaired row replaces the leading fields; the two trailing
        # fields of the processed row are carried over as they are.
        return repaired_papers[row[0]] + [row[-2], row[-1]]
    print(f"Critical error for {row}. Please fix it manually.")
    return row


# Fix for original source paper file
_rewrite_paper_csv(source_metadata_file, _fix_original_row)
print("Original papers csv file was successfully overwritten.")

# Fix for processed paper file
_rewrite_paper_csv(processed_metadata_file, _fix_processed_row)

print("Processed papers csv file was successfully overwritten.")

Original papers csv file was successfully overwritten.
Processed papers csv file was successfully overwritten.


In [45]:
# Checking if unprocessed papers are still on the disk
for paper_row in processed_corrupted_papers:
    if len(paper_row) != 3:
        # Not a plain 3-field row: only report it.
        print(paper_row)
    else:
        paper_id = paper_row[0]
        if not os.path.isdir(source_dir+paper_id):
            raise ValueError(f"No files found for paper {paper_id}")

# A blank CSV line is parsed as an empty row; such rows have no id and are
# skipped here instead of raising an IndexError.
paper_ids = [row[0] for row in processed_corrupted_papers if row]
# Set copy for fast membership tests while filtering the metadata files.
_paper_id_lookup = set(paper_ids)


def _remove_rows_of_papers(metadata_file, tmp_file, lookup):
    """Drop all rows of ``metadata_file`` whose second field is in ``lookup``.

    The kept rows are written to ``tmp_file``, which then replaces
    ``metadata_file``. Rows with fewer than two fields are kept unchanged.

    Args:
        metadata_file: Path of the ';'-delimited CSV file (quote character '|').
        tmp_file: Path of the temporary file used for the rewrite.
        lookup: Container of the ids to remove; compared against the second
            field of every row (presumably the paper id - confirm against the
            code that writes these files).

    Returns:
        List with the first field of every removed row, in file order.
    """
    removed_ids = []
    with open(metadata_file, "r", newline='', encoding='utf-8') as input_file, \
            open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
        csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in csv_reader:
            if len(row) > 1 and row[1] in lookup:
                removed_ids.append(row[0])
            else:
                csv_writer.writerow(row)
    os.replace(tmp_file, metadata_file)
    return removed_ids


# Remove critical figures to allow for reprocessing
critical_figs = _remove_rows_of_papers(
    figure_metadata_file, figure_output_dir + "tmp.csv", _paper_id_lookup
)

# Remove critical tables to allow for reprocessing
critical_tabs = _remove_rows_of_papers(
    table_metadata_file, table_output_dir + "tmp.csv", _paper_id_lookup
)

print(f"{len(critical_figs)} figures and {len(critical_tabs)} tables have been removed.")

5 figures and 6 were removed.


In [49]:
# Drop the rows of the unprocessed papers from the processed-papers csv file:
# every row whose first field is listed in paper_ids is left out of the copy.
tmp_metadata_file = source_dir + "tmp.csv"
with open(processed_metadata_file, "r", newline='', encoding='utf-8') as src, \
        open(tmp_metadata_file, "w", newline='', encoding='utf-8') as dst:
    kept_rows = (
        record
        for record in csv.reader(src, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        if record[0] not in paper_ids
    )
    csv.writer(dst, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL).writerows(kept_rows)

# Swap the filtered copy in for the old csv file
os.replace(tmp_metadata_file, processed_metadata_file)

print("Processed papers csv file was successfully overwritten.")

Processed papers csv file was successfully overwritten.


In [7]:
# Checking metadata files for double occurrences
def check_for_double(file_path):
    """Print every id that occurs more than once in the first column of a CSV file.

    The file is parsed as a ';'-delimited CSV with '|' as quote character.
    For each row whose first field was already seen in an earlier row, a
    "Double finding of ..." line is printed. A completion message is printed
    once the whole file has been read.

    Args:
        file_path: Path of the CSV file to check.

    Returns:
        None. The findings are only printed.
    """
    with open(file_path, "r", newline='', encoding='utf-8') as csv_file:
        spamreader = csv.reader(csv_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        id_set = set()
        for row in spamreader:
            if not row:
                # A blank line is parsed as an empty row: there is no id to
                # check, and indexing it would raise an IndexError.
                continue
            if row[0] in id_set:
                print(f"Double finding of {row[0]}")
            id_set.add(row[0])
    print(f"Check for {file_path} completed.")
    
# Run the duplicate-id check on each metadata file, one after the other.
for metadata_path in (
    source_metadata_file,
    processed_metadata_file,
    figure_metadata_file,
    table_metadata_file,
):
    check_for_double(metadata_path)

Check for source_files/papers.csv completed.
Check for source_files/papers_processed.csv completed.
Check for extracted_figures/figures.csv completed.
Check for extracted_tables/tables.csv completed.
