In [64]:
import csv
import os
import shutil

import numpy as np
import pandas as pd
from tqdm import tqdm

# Root directory of the scraped images; the helpers below look for one sub-folder per character index inside it.
dataDir = '/dcs/project/seal-script-project-data/seal-script-new/scraped/'
# CSV listing the characters to process; it is read without a header as two columns (idx, char).
charListPath = './classical_chars.csv'
# Image file extension (without the leading dot) used when counting images in a folder.
filetype = 'png'

In [65]:
def load_data(char_from, char_to):
    ''' Load rows <char_from> to <char_to> (1-based, inclusive) from hsk.csv inside dataDir.

    Prints how many rows were selected and returns them as a numpy array.
    '''
    csv_path = os.path.join(dataDir, 'hsk.csv')
    all_rows = np.genfromtxt(csv_path, delimiter=',', encoding='utf8', dtype=None)

    # Turn the 1-based inclusive range into a 0-based half-open slice
    selected = all_rows[slice(char_from - 1, char_to)]
    row_count = selected.shape[0]
    print(f"Imported {row_count} characters from index {char_from} to {char_to}")

    return selected

def checkMissingFolders(thresh, char_list_path=None, data_dir=None, file_ext=None):
    ''' Get list of folders with less than or equals <thresh> image files. Set <thresh> to 0 for empty folders.

    A character whose folder does not exist at all is treated as having zero images,
    so it is always reported. Only regular files whose name ends with ".<file_ext>"
    are counted as images.

    Parameters:
        thresh: maximum number of images a folder may hold and still be reported.
        char_list_path: CSV of (idx, char) rows; defaults to the module-level charListPath.
        data_dir: directory holding one sub-folder per idx; defaults to the module-level dataDir.
        file_ext: image extension without the dot; defaults to the module-level filetype.

    Returns:
        List of the idx values whose folder holds <= thresh images, in CSV order.
    '''
    # Fall back to the notebook-wide configuration when no override is given
    char_list_path = charListPath if char_list_path is None else char_list_path
    data_dir = dataDir if data_dir is None else data_dir
    file_ext = filetype if file_ext is None else file_ext
    suffix = f'.{file_ext}'

    charList = pd.read_csv(char_list_path, names=['idx', 'char'])

    missing_chars = []
    for idx in charList['idx']:
        charDir = os.path.join(data_dir, str(idx))

        if os.path.isdir(charDir):
            # Match on the file-name suffix so that e.g. "x.png.bak" is not counted
            imageCount = sum(
                1 for name in os.listdir(charDir)
                if name.endswith(suffix) and os.path.isfile(os.path.join(charDir, name))
            )
        else:
            # A folder that was never created has no images
            imageCount = 0

        if imageCount <= thresh:
            missing_chars.append(idx)

    return missing_chars

def renameImages():
    ''' Renumbers every entry of each sub-folder of dataDir as <folder>_<n>.png '''

    root = dataDir
    folder_names = [entry.name for entry in os.scandir(root) if entry.is_dir()]

    for folder in tqdm(folder_names):
        folder_path = root + os.sep + folder
        prefix = folder_path + os.sep

        # Pass 1: move every entry to a *_temp.png name
        # (presumably so the final names cannot clash with entries not yet renamed)
        staged = [
            (prefix + current, prefix + f"{folder}_{position}_temp.png")
            for position, current in enumerate(os.listdir(folder_path), start=1)
        ]
        for source, target in staged:
            os.rename(source, target)

        # Pass 2: re-list the folder and assign the final <folder>_<n>.png names
        finalised = [
            (prefix + current, prefix + f"{folder}_{position}.png")
            for position, current in enumerate(os.listdir(folder_path), start=1)
        ]
        for source, target in finalised:
            os.rename(source, target)
        

def save_missing_chars(data):
    ''' Writes <data> to missing_chars.csv in the working directory, without header or index column '''
    pd.DataFrame(data).to_csv("missing_chars.csv", index=False, header=False)

### Check that all characters in the main store are valid 

In [37]:
# Path of the character CSV that is passed to check_modern_char_store below.
modern_char_store = './source/hsk.csv'

def check_modern_char_store(dir):
    ''' Performs checks on the dataset csv modern_char_store to ensure all characters are valid

    Reads the two-column CSV at <dir>, drops every row whose character field is a
    bare byte-order mark (U+FEFF), renumbers the remaining characters from 1 in
    their original order and writes them back to <dir> as "<index>,<char>" lines.

    Parameters:
        dir: path of the CSV file to validate; the file is overwritten in place.
    '''
    # Row (0-based) echoed as a quick sanity check when the store is large enough
    spot_check_row = 417

    # Load in the data; atleast_1d keeps a single-row file iterable (genfromtxt returns a 0-d array for it)
    data = np.atleast_1d(np.genfromtxt(dir, delimiter=',', encoding='utf8', dtype=None))
    print(data.dtype)

    # Add checks here: a row is kept unless its character is a stray byte-order mark
    valid_chars = [row[1] for row in data if row[1] != '\ufeff']

    # NOTE: '<U1' stores a single code point, so longer entries are truncated to their first character
    formatted_data = np.asarray(
        list(enumerate(valid_chars, start=1)),
        dtype=[('f0', '<i8'), ('f1', '<U1')],
    )

    # Guarded so that stores with fewer rows do not raise IndexError
    if len(formatted_data) > spot_check_row:
        print(formatted_data[spot_check_row])
    print(formatted_data.shape)
    print(formatted_data)

    # Save the new formatted data back to the file that was read, using the same encoding
    np.savetxt(dir, formatted_data, delimiter=',', fmt='%s', encoding='utf8')
    


# Validate the main character store; note that this call rewrites the CSV on disk.
check_modern_char_store(modern_char_store)

[('f0', '<i8'), ('f1', '<U1')]
(418, '根')
(1075,)
[(   1, '起') (   2, '果') (   3, '热') ... (1073, '赢') (1074, '骗')
 (1075, '阅')]


### Delete all images

In [12]:
def deleteAllImages(char_list_path=None, data_dir=None):
    ''' Deletes everything inside the per-character folders of dataDir for each character listed in charListPath

    The character folders themselves are kept; only their contents (files, links and
    sub-directories) are removed. Folders that do not exist are skipped. Deletion is
    best-effort: a failure on one entry is printed and the remaining entries are
    still processed.

    Parameters:
        char_list_path: CSV of (idx, char) rows; defaults to the module-level charListPath.
        data_dir: directory holding one sub-folder per idx; defaults to the module-level dataDir.
    '''
    # Fall back to the notebook-wide configuration when no override is given
    char_list_path = charListPath if char_list_path is None else char_list_path
    data_dir = dataDir if data_dir is None else data_dir

    charList = pd.read_csv(char_list_path, names=['idx', 'char'])
    for idx in charList['idx']:
        charDir = os.path.join(data_dir, str(idx))

        # Nothing to delete for a character whose folder was never created
        if not os.path.isdir(charDir):
            continue

        for filename in os.listdir(charDir):
            file_path = os.path.join(charDir, filename)

            try:
                # Links are unlinked rather than followed: rmtree refuses symlinks
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.remove(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except OSError as e:
                print('Failed to delete %s. Reason: %s' % (file_path, e))

# deleteAllImages()

In [45]:
def main(check_missing=False, thresh=2, rename=False):
    ''' Runs the selected dataset maintenance steps; with the defaults nothing is run.

    Parameters:
        check_missing: when True, list the characters whose folder holds <= thresh
            images, print a summary and save the list with save_missing_chars.
        thresh: image-count threshold passed to checkMissingFolders.
        rename: when True, renumber all images with renameImages.
    '''
    if check_missing:
        missed_chars = checkMissingFolders(thresh)
        print('Characters without images:')
        print(missed_chars)
        print(f'Found {len(missed_chars)} characters without images')
        save_missing_chars(missed_chars)

    if rename:
        renameImages()

In [66]:
# Entry point: runs whichever maintenance steps main() currently has enabled.
# NOTE(review): the saved output of this cell appears to come from an earlier version of main(); re-run to refresh it.
main()

1000it [00:00, 3592.74it/s]
