In [11]:
from os.path import join, basename, isdir, splitext, isfile
from os import mkdir, listdir
from shutil import copyfile

def list_images(base_dir, valid_exts=('.jpg', '.jpeg', '.png', '.bmp')):
    """Return the names of the image files located directly in ``base_dir``.

    Only regular files are considered: sub-directories are skipped and the
    scan is not recursive.  A file counts as an image when its extension,
    lower-cased, is one of ``valid_exts``.

    Args:
        base_dir: Directory to scan.
        valid_exts: Collection of accepted extensions, lower-case and
            including the leading dot.  The default is an immutable tuple so
            it cannot be modified accidentally between calls.

    Returns:
        Sorted list of bare file names (no directory part) with their
        original casing preserved.
    """
    # os.listdir() yields entries in arbitrary order; sorting makes every
    # list derived from this result reproducible across runs and machines.
    return sorted(
        f for f in listdir(base_dir)
        if isfile(join(base_dir, f))
        and splitext(f.lower())[1] in valid_exts
    )

# Root directory of the captcha data (absolute, machine-specific path).
base_dir = '/home/ubuntu/projects/data/captcha'

# fix_dir: files whose *names* encode a corrected label for an original image.
# orig_dir: the original labeled images.
fix_dir = join(base_dir, 'fix_errors')
orig_dir = join(base_dir, 'labeled_captchas_orig')

imnames_orig = list_images(orig_dir)

# Start from the identity mapping; only images that have a fix file are
# remapped to a different name below.
old2new_map = {imname: imname for imname in imnames_orig}

imnames_to_change = list_images(fix_dir)
for imname_t in imnames_to_change:
    # NOTE(review): the 4- and 9-character prefixes skipped by the slices
    # below are presumably fixed tags in the fix-file naming scheme --
    # confirm against the actual files in fix_errors.
    parts = imname_t.split('_')
    if len(parts) == 3:
        # Three fields: the last one carries the original name (extension
        # included), the middle one the corrected code without extension.
        orig_name = parts[2][4:]
        new_name = parts[1][4:] + '.png'
    elif len(parts) == 2:
        # Two fields: the first carries the original code without extension,
        # the second the corrected name (extension included).
        orig_name = parts[0][9:] + '.png'
        new_name = parts[1][4:]
    else:
        raise ValueError(
            "unexpected fix file name {!r}: expected 2 or 3 '_'-separated "
            "fields, got {}".format(imname_t, len(parts)))
    # Explicit checks instead of assert: they must still run under
    # `python -O`, and the message identifies the offending file.
    if orig_name not in old2new_map:
        raise ValueError(
            'fix file {!r} refers to unknown original image {!r}'.format(
                imname_t, orig_name))
    # 5 + 4: a five-character code plus a four-character extension.
    if len(new_name) != 5 + 4:
        raise ValueError(
            'fix file {!r} yields new name {!r} of length {}, '
            'expected {}'.format(imname_t, new_name, len(new_name), 5 + 4))
    old2new_map[orig_name] = new_name

print('number of fixed samples: {}'.format(len(imnames_to_change)))

def read_imnames(input_filename):
    """Return the image names listed in a ``;``-separated sample file.

    Only the first ``;``-separated field of each line is kept.  Blank or
    whitespace-only lines are ignored so they do not turn into empty names.

    Args:
        input_filename: Path of the text file to read.

    Returns:
        List with the first field of every non-blank line, in file order.
    """
    with open(input_filename) as fp:
        lines = fp.read().splitlines()
    # maxsplit=1: only the first field is needed, so stop at the first ';'.
    return [line.split(';', 1)[0] for line in lines if line.strip()]

# Read the names listed in the original train split and translate each one
# to its current (possibly corrected) name.  A name that is missing from
# old2new_map raises KeyError.
train_list_path = join(orig_dir, 'train.txt')
train_imnames = read_imnames(train_list_path)
train_imnames_m = [old2new_map[old_name] for old_name in train_imnames]
print('train.txt: {}'.format(len(train_imnames_m)))

# Destination directory for the corrected dataset; created on first run.
output_dir = join(base_dir, 'labeled_captchas_fixed')
if not isdir(output_dir):
    mkdir(output_dir)

def get_code(c):
    """Map a single character to its integer label.

    A character for which ``str.isdigit()`` is true maps to its offset from
    ``'0'`` (``'0'`` -> 0 ... ``'9'`` -> 9).  Any other character maps to 10
    plus its offset from ``'A'`` (``'A'`` -> 10, ``'B'`` -> 11, ...).  No
    validation is performed on ``c``.
    """
    base_char, offset = ('0', 0) if c.isdigit() else ('A', 10)
    return ord(c) - ord(base_char) + offset
    
def name_to_labels(new_name, code_len=5):
    """Convert an image file name into one integer label per character.

    The extension is dropped, the remaining base name is upper-cased, and
    every character is converted with ``get_code``.

    Args:
        new_name: File name such as ``'AB12C.png'``.
        code_len: Required number of characters in the base name.  Defaults
            to 5, the length this script has always enforced.

    Returns:
        List of integer labels, one per character of the base name.

    Raises:
        ValueError: If the base name does not have exactly ``code_len``
            characters.
    """
    name_base = splitext(new_name)[0]
    # Explicit check rather than assert so it is not stripped by `python -O`.
    if len(name_base) != code_len:
        raise ValueError(
            'expected a {}-character base name, got {!r} from {!r}'.format(
                code_len, name_base, new_name))
    return [get_code(c) for c in name_base.upper()]

# Copy every original image into output_dir under its (possibly corrected)
# name, and record the labels derived from that name.
samples = []
for orig_name, new_name in old2new_map.items():
    src_path = join(orig_dir, orig_name)
    assert isfile(src_path)
    copyfile(src_path, join(output_dir, new_name))
    samples.append((new_name, name_to_labels(new_name)))

total = len(samples)
print('total number of samples: {}'.format(total))

def write_data(output_filename, data):
    """Write ``(imname, labels)`` pairs to a text file, one pair per line.

    Each line has the form ``imname;l0;l1;...`` followed by a newline, where
    the labels are converted with ``str``.  An existing file is overwritten.

    Args:
        output_filename: Path of the file to create.
        data: Iterable of ``(imname, labels)`` pairs; ``labels`` is an
            iterable of values to be joined with ``;``.
    """
    with open(output_filename, 'w') as fp:
        fp.writelines(
            '{};{}\n'.format(imname, ';'.join(map(str, labels)))
            for imname, labels in data
        )

# Split the samples according to the (renamed) original train list: a sample
# whose name appears there goes to train, every other sample goes to test.
# A set is built once so each membership test is O(1) instead of scanning
# the whole list for every sample.
train_name_set = set(train_imnames_m)

test_samples = []
train_samples = []
for imname, labels in samples:
    if imname in train_name_set:
        train_samples.append((imname, labels))
    else:
        test_samples.append((imname, labels))

n_train = len(train_samples)
n_test = len(test_samples)
print('{} (total) = {} (train) + {} (test)'.format(total, n_train, n_test))

# Write the full list and both splits next to the copied images.
write_data(join(output_dir, 'samples.txt'), samples)
write_data(join(output_dir, 'train.txt'), train_samples)
write_data(join(output_dir, 'test.txt'), test_samples)

print('all done.')


number of fixed samples: 44
train.txt: 806
total number of samples: 1007
1007 (total) = 806 (train) + 201 (test)
all done.
