In [2]:
import os
import random
import shutil

In [None]:
def split_dataset(image_dir, output_dir, train_pct=0.8, val_pct=0.1, test_pct=0.1, seed=None):
    """
    Splits an image dataset into train, validation, and test directories.

    The regular files found directly inside ``image_dir`` are shuffled and
    copied (not moved) into ``train_all``, ``validation_all`` and ``test_all``
    sub-directories of ``output_dir``. Files already present in those
    sub-directories are left in place, so the final counts printed at the end
    include them.

    Args:
        image_dir (str): The directory containing the image files to split.
        output_dir (str): The directory where the train, validation, and test subdirectories will be created.
        train_pct (float): The proportion of images to use for the training set.
        val_pct (float): The proportion of images to use for the validation set.
        test_pct (float): Accepted for interface compatibility only. The test
            set always receives every file left over after the train and
            validation sets have been filled.
        seed (int, optional): When given, the shuffle is performed with a
            private ``random.Random(seed)`` so the split is reproducible.
            When ``None``, the module-level ``random.shuffle`` is used.

    Returns:
        None

    Raises:
        ValueError: If ``train_pct`` or ``val_pct`` is negative, or if together
            they exceed 1.
    """
    # Reject fractions that would produce a negative test-set size.
    if train_pct < 0 or val_pct < 0 or train_pct + val_pct > 1 + 1e-9:
        raise ValueError(
            'train_pct and val_pct must be non-negative and sum to at most 1 '
            '(got train_pct={}, val_pct={})'.format(train_pct, val_pct)
        )

    # Keep only regular files: shutil.copyfile cannot copy a directory, and
    # image_dir may contain sub-directories (e.g. the split folders themselves
    # when output_dir is inside image_dir). Sorting first removes the
    # dependence on the arbitrary order returned by os.listdir.
    files = sorted(
        name for name in os.listdir(image_dir)
        if os.path.isfile(os.path.join(image_dir, name))
    )

    # Shuffle the list of files
    if seed is None:
        random.shuffle(files)
    else:
        random.Random(seed).shuffle(files)

    # Determine the number of files for each split; the test set takes the
    # remainder so that the three counts always add up to num_files.
    num_files = len(files)
    num_train = int(num_files * train_pct)
    num_val = int(num_files * val_pct)
    num_test = num_files - num_train - num_val

    print('Number of files: {}'.format(num_files))
    print('Number of train files: {}'.format(num_train))
    print('Number of validation files: {}'.format(num_val))
    print('Number of test files: {}'.format(num_test))

    print("Sum: {}".format(num_train + num_val + num_test))

    # Create the subdirectories
    train_dir = os.path.join(output_dir, 'train_all')
    val_dir = os.path.join(output_dir, 'validation_all')
    test_dir = os.path.join(output_dir, 'test_all')
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    # Copy the image files to the subdirectories
    for i, file in enumerate(files):
        if i < num_train:
            dst_dir = train_dir
        elif i < num_train + num_val:
            dst_dir = val_dir
        else:
            dst_dir = test_dir

        shutil.copyfile(os.path.join(image_dir, file), os.path.join(dst_dir, file))

        # Progress report every 1000 files (i is zero-based, hence i + 1)
        if i % 1000 == 0:
            print('{} files copied.'.format(i + 1))

    # Print the number of files in each subdirectory
    print('Number of train files: {}'.format(len(os.listdir(train_dir))))
    print('Number of validation files: {}'.format(len(os.listdir(val_dir))))
    print('Number of test files: {}'.format(len(os.listdir(test_dir))))

In [5]:
# image_dir = 'processed_data/all_images/'
# out_dir = 'processed_data/'

# split_dataset(image_dir, out_dir)

Number of files: 103536
Number of train files: 82828
Number of validation files: 10353
Number of test files: 10355
Sum: 103536
1 files copied.
1001 files copied.
2001 files copied.
3001 files copied.
4001 files copied.
5001 files copied.
6001 files copied.
7001 files copied.
8001 files copied.
9001 files copied.
10001 files copied.
11001 files copied.
12001 files copied.
13001 files copied.
14001 files copied.
15001 files copied.
16001 files copied.
17001 files copied.
18001 files copied.
19001 files copied.
20001 files copied.
21001 files copied.
22001 files copied.
23001 files copied.
24001 files copied.
25001 files copied.
26001 files copied.
27001 files copied.
28001 files copied.
29001 files copied.
30001 files copied.
31001 files copied.
32001 files copied.
33001 files copied.
34001 files copied.
35001 files copied.
36001 files copied.
37001 files copied.
38001 files copied.
39001 files copied.
40001 files copied.
41001 files copied.
42001 files copied.
43001 files copied.
44001 

In [4]:
def create_mapping_file(output_file, image_dir, mapping_file):
    """
    Creates a mapping file from image file names to formula strings.

    Each non-blank line of ``mapping_file`` is read as an image name followed
    by a whitespace-separated formula. Lines whose image name exists in
    ``image_dir`` are written to ``output_file`` as ``"<image_name> <formula>"``
    with all whitespace removed from the formula. Both files are handled with
    the windows-1252 encoding.

    Args:
        output_file (str): The file path to save the filtered mapping file to.
        image_dir (str): The directory whose file names select which mapping
            lines are kept.
        mapping_file (str): The file path of the full image-name-to-formula
            mapping to filter.

    Returns:
        None
    """
    # A set gives constant-time membership tests; the lookup runs once per
    # line of the mapping file.
    image_names = set(os.listdir(image_dir))

    lines_written = 0
    with open(mapping_file, 'r', encoding='windows-1252') as src, \
            open(output_file, 'w', encoding='windows-1252') as dst:
        for line in src:
            tokens = line.split()
            # Skip blank lines instead of failing on the missing image name
            if not tokens:
                continue

            image_name = tokens[0]
            # Joining the tokens without a separator removes every space
            # from the formula.
            formula = ''.join(tokens[1:])

            # Keep only the entries whose image is present in image_dir
            if image_name in image_names:
                dst.write('{} {}\n'.format(image_name, formula))
                lines_written += 1

    # Print the number of lines in the output file
    print('Number of lines in {}: {}'.format(output_file, lines_written))

    # Print the number of images in the image directory
    print('Number of images in {}: {}'.format(image_dir, len(image_names)))

# Normalized formula list; it is written by the normalization cell further
# down in this notebook (the one producing all_formulas_norm.txt).
mapping_file = 'processed_data/all_formulas_norm.txt'

# One (output mapping file, image directory) pair per dataset split
split_targets = (
    ('processed_data/train_formulas_all_ns.txt', 'processed_data/train_all'),
    ('processed_data/validation_formulas_all_ns.txt', 'processed_data/validation_all'),
    ('processed_data/test_formulas_all_ns.txt', 'processed_data/test_all'),
)

# Build the per-split mapping files in train, validation, test order
for output_file, image_dir in split_targets:
    create_mapping_file(output_file, image_dir, mapping_file)


Number of lines in processed_data/train_formulas_all_ns.txt: 82828
Number of images in processed_data/train_all: 82828
Number of lines in processed_data/validation_formulas_all_ns.txt: 10353
Number of images in processed_data/validation_all: 10353
Number of lines in processed_data/test_formulas_all_ns.txt: 10355
Number of images in processed_data/test_all: 10355


In [None]:
import re

mapping_file = 'processed_data/all_formulas.txt'
out_file = 'processed_data/all_formulas_norm.txt'


def _normalize_formula(formula):
    r"""Strip \label / \ref / \cite markup and '%' comments from a formula.

    Args:
        formula (str): The formula text of one mapping line, with its tokens
            joined by single spaces.

    Returns:
        str: The formula with the markup removed, leading '%' characters
        dropped, and everything from the first remaining '%' onwards cut off.
    """
    # Remove all the \label{...} and \label { ... } tags
    cleaned = re.sub(r'\\label\s*\{.*?\}', '', formula)
    # Remove a \label that is directly followed by a single word character
    cleaned = re.sub(r'\\label\w', '', cleaned)
    # Remove a \label followed by optional whitespace and a single word character
    cleaned = re.sub(r'\\label\s*\w', '', cleaned)
    # Remove all the \ref{...} tags
    cleaned = re.sub(r'\\ref\{.*?\}', '', cleaned)
    # Remove all the \cite{...} tags
    cleaned = re.sub(r'\\cite\{.*?\}', '', cleaned)

    # Remove leading '%' characters
    cleaned = cleaned.lstrip('%')

    # Remove everything after the first percent character.
    # NOTE(review): this also cuts at an escaped percent sign (\%) - confirm
    # that is intended.
    return cleaned.split('%')[0]


# The handles get their own names so that `out_file` keeps holding the path
# string after this cell has run.
with open(mapping_file, 'r', encoding='windows-1252') as in_file, \
        open(out_file, 'w', encoding='windows-1252') as norm_file:
    for line in in_file:
        # Split the line on whitespace
        line_split = line.split()
        # Skip blank lines: they carry no image name
        if not line_split:
            continue

        # The first item of the line is the image name
        image_name = line_split[0]
        # Everything after the first item is the formula
        formula = ' '.join(line_split[1:])

        # Write the normalized formula to the output file
        norm_file.write('{} {}\n'.format(image_name, _normalize_formula(formula)))