In [218]:
from matplotlib import pyplot as plt
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import random
from os import mkdir
from os.path import join, isdir, splitext
import tqdm

def code_to_symbol(code):
    """Map a numeric class code to its captcha character.

    Codes 0-9 map to the digits '0'-'9'; codes from 10 upwards continue
    through the ASCII table starting at 'A' (so 10-35 are 'A'-'Z').
    """
    # Choose the ASCII origin for the range the code falls in, then offset.
    origin = ord('0') if code < 10 else ord('A') - 10
    return chr(origin + code)
    
def get_random_color():
    """Draw a random RGB colour from NumPy's global random generator.

    Returns:
        A 3-tuple of integers, each sampled uniformly from 0..255.
    """
    channels = np.random.randint(low=0, high=256, size=3)
    return tuple(channels)
    
def select_random_palette_set():
    """Build a random 8-colour palette for one captcha.

    Returns:
        A list of 8 colours, each produced by get_random_color(). The slots
        are meant to be read as: background, grid, then one colour for each
        of up to six characters.
    """
    # This draw is never used, but it is kept so the sequence of values pulled
    # from NumPy's global random generator stays exactly as before.
    _unused = np.random.randint(0, 3)
    return [get_random_color() for _slot in range(8)]

def get_random_accepted_position(min_x, max_x, positions, min_sep):
    """Pick a random integer position that keeps its distance from taken ones.

    Args:
        min_x: Lowest allowed position (inclusive).
        max_x: Upper bound of the allowed positions (exclusive).
        positions: Iterable of positions that are already taken.
        min_sep: Minimum absolute distance required to every taken position.

    Returns:
        An integer in [min_x, max_x), chosen uniformly among the values whose
        distance to each entry of `positions` is at least `min_sep`.

    Raises:
        ValueError: If no value in the range satisfies the separation
            constraint (retrying random draws would never terminate).
    """
    taken = list(positions)
    candidates = [
        x for x in range(min_x, max_x)
        if all(abs(x - p) >= min_sep for p in taken)
    ]
    if not candidates:
        raise ValueError(
            'no position in [{}, {}) is at least {} away from {}'.format(
                min_x, max_x, min_sep, taken))
    # A single uniform draw over the valid values replaces retry-until-valid.
    return candidates[np.random.randint(len(candidates))]

# Fonts that have already been loaded, keyed by (path, size). Glyph sizes
# repeat across captchas, so each distinct size is read from disk only once.
_FONT_CACHE = {}


def _load_font(font_path, size):
    """Return the TrueType font at `font_path` in `size`, loading it at most once."""
    key = (font_path, size)
    if key not in _FONT_CACHE:
        _FONT_CACHE[key] = ImageFont.truetype(font_path, size)
    return _FONT_CACHE[key]


def generate_captcha(font_path='/home/ubuntu/projects/data/captcha/font/ARIALBD.TTF'):
    """Render one synthetic 5-character captcha image.

    A 330x150 white canvas receives a randomly coloured panel with a 14-pixel
    grid, then five characters drawn from 0-9/A-Z. The characters are evenly
    spaced around a jittered centre, each with its own colour, vertical shift
    and font size.

    Args:
        font_path: Path of the TrueType font used for the characters.

    Returns:
        A tuple (img, imcode, imsymbols):
            img: The rendered image as a NumPy array converted from a PIL
                'RGB' image.
            imcode: List of the integer codes (0..35) of the characters,
                left to right.
            imsymbols: List of the corresponding one-character strings.
    """
    img = Image.new('RGB', (330, 150), color=(255, 255, 255))
    d = ImageDraw.Draw(img)

    # Palette slots: 0 = panel background, 1 = grid, 2.. = one per character.
    colors = select_random_palette_set()

    d.rectangle([9, 11, 309, 130], fill=colors[0])
    for i in range(9):
        y = 11 + 14 * i
        d.line([(9, y), (309, y)], colors[1], 1)
    for i in range(22):
        x = 9 + 14 * i
        d.line([(x, 11), (x, 130)], colors[1], 1)

    # The character count is fixed; the palette holds colours for at most 6.
    number_of_symbols = 5

    # Evenly spaced x positions: the character at index number_of_symbols // 2
    # sits at the jittered centre c_x, the others one spacing apart on each side.
    distance_between = np.random.randint(30, 64)
    c_x = 140 + np.random.randint(-10, 10)
    half = number_of_symbols // 2
    positions = [c_x + (k - half) * distance_between
                 for k in range(number_of_symbols)]

    imcode = []
    imsymbols = []
    for i, x_pos in enumerate(positions):
        code = np.random.randint(0, 36)
        symbol = code_to_symbol(code)
        imcode.append(code)
        imsymbols.append(symbol)

        # Per-character jitter: vertical offset and font size around 81.
        y_shift = np.random.randint(-20, 20)
        fnt_sz = 81 + np.random.randint(-7, 7)
        fnt = _load_font(font_path, fnt_sz)
        d.text((x_pos, 75 - 50 + y_shift), symbol, font=fnt, fill=colors[2 + i])

    return np.array(img), imcode, imsymbols

# img, imcode, imsymbols = generate_captcha()
# plt.imshow(img)
# plt.title(imcode)
# plt.show()
# raise KeyboardInterrupt

output_dir = '/home/ubuntu/projects/data/captcha/generated_captchas_v3/'
if not isdir(output_dir):
    mkdir(output_dir)
number_of_captchas = 100000

# Maps every written image file name to the list of character codes it shows.
data = {}
for _ in tqdm.tqdm(range(number_of_captchas)):
    img, imcode, imsymbols = generate_captcha()
    stem = ''.join(imsymbols)
    imname = '{}.jpg'.format(stem)
    # The same text can be generated more than once. Append a random numeric
    # suffix and keep drawing until the name is unused, so an earlier image
    # and its label entry are never overwritten.
    while imname in data:
        imname = '{}_{}.jpg'.format(stem, np.random.randint(1000))
    # NOTE: img comes from a PIL 'RGB' image while cv2.imwrite interprets
    # 3-channel data as BGR, so red and blue are swapped on disk. The colours
    # are random, so the image is written as-is.
    out_path = join(output_dir, imname)
    if not cv2.imwrite(out_path, img):
        # imwrite reports failure through its return value; stop rather than
        # list an image in the label files that does not exist on disk.
        raise IOError('failed to write image: {}'.format(out_path))
    data[imname] = imcode

samples = list(data.items())
    
def write_data(output_filename, samples):
    """Write samples to a text file, one `name;label;label;...` line each.

    Args:
        output_filename: Path of the file to create or overwrite.
        samples: Iterable of (image name, labels) pairs; each label is
            converted with str() and the fields are joined with ';'.
    """
    with open(output_filename, 'w') as out:
        out.writelines(
            '{};{}\n'.format(name, ';'.join(map(str, label_seq)))
            for name, label_seq in samples
        )
            
def split_test_train(samples, test_ratio=0.2):
    """Randomly partition samples into a train list and a test list.

    Args:
        samples: Indexable sequence of samples.
        test_ratio: Fraction of the samples assigned to the test list; the
            test size is int(len(samples) * test_ratio).

    Returns:
        A tuple (train_samples, test_samples). Test samples appear in the
        order they were randomly drawn; train samples keep their original
        relative order.
    """
    n_samples = len(samples)
    n_test = int(n_samples * test_ratio)

    # Sample the test indices without replacement so no sample is picked twice.
    test_idx = np.random.choice(n_samples, n_test, replace=False)

    # Builtin bool is used for the mask: the np.bool alias was removed from
    # recent NumPy releases.
    test_mask = np.zeros(n_samples, dtype=bool)
    test_mask[test_idx] = True
    train_idx = np.nonzero(~test_mask)[0]

    test_samples = [samples[idx] for idx in test_idx]
    train_samples = [samples[idx] for idx in train_idx]

    return train_samples, test_samples
            
# Write the full listing first, then a random train/test partition of it.
n_total = len(samples)
print('total number of samples: {}'.format(n_total))
write_data(join(output_dir, 'samples.txt'), samples)

train_samples, test_samples = split_test_train(samples)
print('{} = {} (train) + {} (test)'.format(n_total, len(train_samples), len(test_samples)))

for list_name, subset in (('train.txt', train_samples), ('test.txt', test_samples)):
    write_data(join(output_dir, list_name), subset)
print('all done.')
            

100%|██████████| 100000/100000 [06:20<00:00, 262.78it/s]


total number of samples: 100000
100000 = 80000 (train) + 20000 (test)
all done.
