In [None]:
import os
import csv

In [None]:
# Where the extracted figure images live, and the CSV describing them.
# The directory string carries its trailing separator because paths are
# built from it by plain string concatenation.
figure_dir = "extracted_figures/"
figure_metadata_file = f"{figure_dir}figures.csv"

# Image file extensions a figure may have been stored with.
possible_extensions = [
    ".pdf",
    ".png",
    ".jpg",
    ".jpeg",
    ".eps",
    ".ps",
]

In [None]:
# Delete figures from dataset if their image size is larger than the defined threshold
def _find_figure_file(figure_id):
    """Return the path of the first existing image file for ``figure_id``, or ``None``.

    Candidates are ``figure_dir + figure_id + extension`` for every entry of
    ``possible_extensions``, tried in list order.
    """
    for extension in possible_extensions:
        candidate = figure_dir + figure_id + extension
        if os.path.isfile(candidate):
            return candidate
    return None


def delete_figures_size(max_size):
    """Drop figures that are too large (or have no image file) from the dataset.

    Reads ``figure_metadata_file`` row by row (``;``-delimited, ``|`` as quote
    character) and uses the first column of each row as the figure id:

    * no image file found      -> a message is printed, the row is dropped
      and counted;
    * file size > ``max_size`` -> the image file is deleted, the row is
      dropped and counted;
    * otherwise                -> the row is kept unchanged;
    * ``OSError`` while reading the size or deleting the file -> the error is
      printed and the row is kept.

    Blank rows are skipped. The surviving rows are written to ``tmp.csv`` inside
    ``figure_dir``, which then replaces ``figure_metadata_file``. If processing
    aborts, the temporary file is removed and the metadata file is left as it was.

    NOTE(review): a header row, if the CSV has one, would be looked up like a
    figure id and dropped as "not found" -- confirm the file has no header.

    Args:
        max_size: Largest allowed image file size, in bytes.

    Returns:
        None. A summary line with the number of dropped figures is printed.
    """
    tmp_file = figure_dir + "tmp.csv"
    counter = 0

    with open(figure_metadata_file, "r", newline='', encoding='utf-8') as input_file:
        try:
            with open(tmp_file, "w", newline='', encoding='utf-8') as output_file:
                csv_reader = csv.reader(input_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csv_writer = csv.writer(output_file, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)

                for row in csv_reader:
                    # csv.reader yields an empty list for a blank line; there is
                    # no figure id to look up, so skip it instead of crashing.
                    if not row:
                        continue

                    figure_id = row[0]
                    figure_file = _find_figure_file(figure_id)

                    if figure_file is None:
                        # Remove from dataset if file was not found
                        print(f"File for figure {figure_id} was not found.")
                        counter += 1
                        continue

                    try:
                        too_large = os.path.getsize(figure_file) > max_size
                        if too_large:
                            os.remove(figure_file)
                    except OSError as e:
                        # No removal when an error occurs: keep the row so the
                        # dataset still references the file that is on disk.
                        print(f"Error occurred for {figure_id}: {e}")
                        csv_writer.writerow(row)
                        continue

                    if too_large:
                        counter += 1
                    else:
                        csv_writer.writerow(row)
        except BaseException:
            # Do not leave a half-written temporary file behind on failure.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
            raise

    # Replace old csv file with new csv file (only after both files are closed)
    os.replace(tmp_file, figure_metadata_file)

    print(f"{counter} figures were deleted from the dataset.")

In [None]:
# Largest figure file we keep: 2 MiB, expressed in bytes
MAX_FILE_SIZE = 2 * 1024 ** 2

# Prune every figure whose file exceeds that limit
delete_figures_size(max_size=MAX_FILE_SIZE)