In [9]:
from os.path import join, basename, isdir, splitext, isfile
from os import mkdir, listdir
import re
import random
import numpy as np

def list_images(base_dir, valid_exts=('.jpg', '.jpeg', '.png', '.bmp')):
    """Return the names of the image files located directly in ``base_dir``.

    Only regular files are considered: sub-directories are skipped and the
    scan is not recursive.  A file is kept when its extension, compared
    case-insensitively, is one of ``valid_exts``.

    Args:
        base_dir: Directory to scan.
        valid_exts: Accepted extensions, each including the leading dot.
            Either a single string or an iterable of strings.

    Returns:
        A sorted list of bare file names (not joined with ``base_dir``).
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(valid_exts, str):
        valid_exts = (valid_exts,)
    # Lower-case the accepted extensions too, so that a caller passing
    # '.PNG' gets the same case-insensitive match as the defaults.
    accepted = frozenset(ext.lower() for ext in valid_exts)

    # listdir() returns entries in arbitrary order; sorting makes the result
    # (and anything derived from it) reproducible across runs and machines.
    return sorted(
        name
        for name in listdir(base_dir)
        if isfile(join(base_dir, name))
        and splitext(name)[1].lower() in accepted
    )

# Directory that is scanned for images; the output text files written later
# in this cell are placed in the same directory.
base_dir = '/home/ubuntu/projects/data/captcha/labeled_captchas/'

# Names (without directory) of the image files found directly in base_dir.
imnames = list_images(base_dir)
print('number of images: {}'.format(len(imnames)))

# Build one (file name, label list) pair per image. The labels are derived
# from the file name without its extension, upper-cased, one integer per
# character: '0'-'9' -> 0-9 and 'A'-'Z' -> 10-35.
# NOTE(review): any other character falls into the else branch and silently
# yields a value outside 0..35 (e.g. '_' -> 40, '-' -> -10) -- confirm that
# every file name is strictly alphanumeric.
samples = []
for imname in imnames:
    imbase = splitext(imname)[0]
    imbase = imbase.upper()
    imlabels = []
    for c in imbase:
        if c.isdigit():
            label = ord(c) - ord('0')
        else:
            label = ord(c) - ord('A') + 10
        imlabels.append(label)
    samples.append((imname, imlabels))
        
def write_data(output_filename, samples):
    """Write ``samples`` to ``output_filename``, one sample per line.

    Each line has the form ``<image name>;<label>;<label>;...`` followed by
    a newline.  An existing file at that path is overwritten.

    Args:
        output_filename: Path of the text file to create.
        samples: Iterable of ``(image name, labels)`` pairs, where
            ``labels`` is an iterable of values convertible with ``str``.
    """
    with open(output_filename, 'w') as out:
        out.writelines(
            '{};{}\n'.format(name, ';'.join(map(str, codes)))
            for name, codes in samples
        )
            
def split_test_train(samples, test_ratio=0.2):
    """Randomly partition ``samples`` into a training and a test list.

    The test set holds ``int(len(samples) * test_ratio)`` items drawn without
    replacement using NumPy's global random state; everything else goes to
    the training set.

    Args:
        samples: Sequence supporting ``len()`` and integer indexing.
        test_ratio: Fraction of the samples assigned to the test set.

    Returns:
        A ``(train_samples, test_samples)`` tuple of lists.  Training samples
        keep their original relative order; test samples appear in the order
        in which their indices were drawn.
    """
    n_samples = len(samples)
    n_test = int(n_samples * test_ratio)

    # Distinct positions of the samples that go to the test set.
    test_idx = np.random.choice(n_samples, n_test, replace=False)

    # Use the builtin ``bool`` as dtype: the ``np.bool`` alias was deprecated
    # in NumPy 1.20 and is missing from 1.24-1.26, where it raises
    # AttributeError.
    test_mask = np.zeros(n_samples, dtype=bool)
    test_mask[test_idx] = True
    train_mask = ~test_mask
    # Indices not selected for testing, in ascending order.
    train_idx = np.nonzero(train_mask)[0]

    test_samples = [samples[idx] for idx in test_idx]
    train_samples = [samples[idx] for idx in train_idx]

    return train_samples, test_samples
            
# Write the complete sample list next to the images in base_dir.
print('total number of samples: {}'.format(len(samples)))
write_data(join(base_dir, 'samples.txt'), samples)

# Split with the default test_ratio of 0.2.
# NOTE(review): no random seed is set in this cell, so the split presumably
# differs from run to run -- confirm whether that is intended.
train_samples, test_samples = split_test_train(samples)
print('{} = {} (train) + {} (test)'.format(len(samples), len(train_samples), len(test_samples)))

# Write the two subsets in the same line format as samples.txt.
write_data(join(base_dir, 'train.txt'), train_samples)
write_data(join(base_dir, 'test.txt'), test_samples)
print('all done.')

number of images: 1007
total number of samples: 1007
1007 = 806 (train) + 201 (test)
all done.
