In [136]:
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import random
import os

In [293]:
import logging
from logging import handlers

# Module-level logger; records are written to the console as
# "<time>:::<module>:::<message>" at INFO level and above.
logger = logging.getLogger(__name__)
formatter = logging.Formatter("%(asctime)s:::%(module)s:::%(message)s")
handler_stream = logging.StreamHandler() #output to console
handler_stream.setFormatter(formatter)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object every time this code runs, so
# attaching a handler unconditionally would print each record once per
# re-run. Only attach the console handler when none is installed yet.
if not logger.handlers:
    logger.addHandler(handler_stream)

class data_generator(object):
    '''
    Samples the ingredients of synthetic text-line images: a font, a text
    string and a grayscale background of a fixed size.

    Rendering is not implemented yet: get_batch and get_text_image are
    placeholders.
    '''

    # Background kinds understood by sample_background.
    _BACKGROUND_TYPES = ('real', 'random', 'plain')
    # Dataset kinds understood by sample_string.
    _DATASET_TYPES = ('random', 'samples')
    # Same object as a module-level logging.getLogger(__name__), so records
    # go through whatever handlers the module configured for it.
    _logger = logging.getLogger(__name__)

    def __init__(self, images_height = 32, image_width = 256, max_string_lenght = 50,
                 background_type = ('real', 'plain'), dataset_type = 'random',
                 backgrounds_path = 'backgrounds/', fonts_path = 'valid_fonts/',
                 valid_charset_path = 'valid_charset.txt',
                 text_examples = 'text_examples.txt'):
        '''
        images_height, image_width - size in pixels of produced backgrounds.
        max_string_lenght - upper bound (inclusive) for the length of randomly
                    generated strings.
        background_type - non-empty sequence of used bg types, any of 'real',
                    'random', 'plain'. If 'real' is used, backgrounds_path must
                    point to a folder with .jpg backgrounds.
        dataset_type - 'random' or 'samples'. If 'samples' then text_examples
                    must be defined. If 'random' then text strings are generated
                    randomly from the valid charset.
        backgrounds_path - folder with background images (used for 'real').
        fonts_path - folder with .ttf fonts.
        valid_charset_path - path to text file with valid chars.
        text_examples - path to text file with one sample string per line.

        Raises AssertionError when an argument is invalid or a required
        file/folder is missing or empty.
        '''
        background_type = list(background_type)
        assert background_type, 'background_type argument should not be empty'
        assert set(background_type) <= set(self._BACKGROUND_TYPES), \
            'background_type argument should consist only "real", "random" or "plain"'
        assert dataset_type in self._DATASET_TYPES, \
            'dataset_type argument should be one of "random" or "samples"'

        # The size must be stored before set_backgrounds, which filters on it.
        self.images_height = images_height
        self.image_width = image_width
        self.max_string_lenght = max_string_lenght
        self.background_type = background_type
        self.dataset_type = dataset_type

        # (folder, [file names]) or None when 'real' backgrounds are not used.
        self.backgrounds_files = None
        if 'real' in background_type:
            self.backgrounds_files = (backgrounds_path, self.set_backgrounds(backgrounds_path))

        self.text_samples = None
        if dataset_type == 'samples':
            self.text_samples = self.set_texts_samples(text_examples)

        # (folder, [file names])
        self.fonts_files = (fonts_path, self.set_fonts(fonts_path))
        self._logger.info('%d fonts to produce data', len(self.fonts_files[1]))

        # Extra space characters raise the probability of a blank in random strings.
        self.valid_charset = self.set_valid_charset(valid_charset_path) + list('    ')

    def get_batch(self, batch_size = 64):
        '''
        produce batch of text string images

        Placeholder: samples a single background and returns None;
        batch_size is not used yet.
        '''
        background = self.sample_background()
        # TODO: render batch_size text images and return them.
        return None

    def get_text_image(self, font, background):
        '''
        produce one text string image

        Placeholder: font and background are not used yet; only the sampled
        text string is returned.
        '''
        text_string = self.sample_string()
        return text_string

    def sample_font_file(self, size=20):
        '''
        pick a random font file and load it with the given size
        '''
        fonts_dir, font_names = self.fonts_files
        # os.path.join works whether or not the folder path ends with a slash.
        return ImageFont.truetype(os.path.join(fonts_dir, random.choice(font_names)), size)

    def sample_string(self,):
        '''
        return one text string: random chars from the valid charset for the
        'random' dataset type, otherwise a random line of the text samples
        '''
        if self.dataset_type == 'random':
            random_string_length = random.randint(1, self.max_string_lenght)
            return ''.join(random.choice(self.valid_charset)
                           for _ in range(random_string_length))
        assert self.text_samples is not None, 'self.text_sample is None, cant get examples of text'
        return random.choice(self.text_samples)

    def sample_background(self,):
        '''
        return a grayscale ('L') image of size (image_width, images_height)
        of a randomly chosen background type:
        'plain'  - constant gray level 128
        'random' - uniform pixel noise (drawn from numpy's random generator)
        'real'   - random crop of a random background file
        '''
        bg_type = random.choice(self.background_type)
        if bg_type == 'plain':
            return Image.new('L', (self.image_width, self.images_height), 128)
        if bg_type == 'random':
            # A 2-D uint8 array (rows = height, cols = width) maps to mode 'L'.
            noise = np.random.randint(0, 256, size=(self.images_height, self.image_width),
                                      dtype=np.uint8)
            return Image.fromarray(noise)
        if bg_type == 'real':
            backgrounds_dir, background_names = self.backgrounds_files
            background_file_path = os.path.join(backgrounds_dir, random.choice(background_names))
            # convert() returns an independent image, so the file can be closed.
            with Image.open(background_file_path) as source:
                background_image = source.convert('L')
            width, height = background_image.size
            # set_backgrounds kept only files at least as large as the target,
            # so both slack values are non-negative.
            rndm_left = random.randint(0, width - self.image_width)
            rndm_upper = random.randint(0, height - self.images_height)
            return background_image.crop(box=(rndm_left, rndm_upper,
                                              rndm_left + self.image_width,
                                              rndm_upper + self.images_height))
        raise ValueError('unknown background type {}'.format(bg_type))

    def set_fonts(self, path):
        '''
        return names of the .ttf files found directly in folder path
        '''
        assert os.path.isdir(path), 'There is no folder {}'.format(path)
        # Sorted so that a seeded random generator picks reproducibly.
        fonts_path = sorted(f for f in next(os.walk(path))[2]
                            if os.path.splitext(f)[1] == '.ttf')
        assert len(fonts_path) !=0, 'Folder {} is empty'.format(path)
        return fonts_path

    def set_backgrounds(self, path):
        '''
        return names of the .jpg files found directly in folder path that are
        at least image_width x images_height pixels
        '''
        assert os.path.isdir(path), 'There is no folder {}'.format(path)
        backgrounds_files = sorted(f for f in next(os.walk(path))[2]
                                   if os.path.splitext(f)[1] == '.jpg')
        valid_backgrounds_files = []
        for f in backgrounds_files:
            # Only the size is needed; close the file right away.
            with Image.open(os.path.join(path, f)) as im:
                width, height = im.size
            if width>=self.image_width and height>=self.images_height:
                valid_backgrounds_files.append(f)
        assert len(valid_backgrounds_files) !=0, 'No valid backgounds in {}'.format(path)
        self._logger.info('%d background files to produce data', len(valid_backgrounds_files))
        return valid_backgrounds_files

    def set_texts_samples(self, path):
        '''
        return the distinct non-empty lines of text file path; they are used
        as text strings to draw
        '''
        assert os.path.isfile(path), 'There is no file {}'.format(path)
        with open(path,'r',encoding='utf-8') as f:
            lines = f.read().split('\n')
        # Empty lines (e.g. after a trailing newline) are not usable samples;
        # sorting makes the order independent of string hashing.
        text_samples = sorted({line for line in lines if line})
        assert len(text_samples) != 0, 'text samples file is empty'
        self._logger.info('%d text strings to produce data', len(text_samples))
        return text_samples

    def set_valid_charset(self, path):
        '''
        return the distinct chars of text file path
        '''
        assert os.path.isfile(path), 'There is no file {}'.format(path)
        with open(path,'r',encoding='utf-8') as f:
            charset_string = f.read()
        # Sorted so that the order is independent of string hashing.
        valid_charset = sorted(set(charset_string))
        assert len(valid_charset) != 0, 'valid charset is empty'
        return valid_charset
            

In [294]:
# Build a generator restricted to the 'real' and 'plain' background types;
# every other argument keeps its default.
dg = data_generator(background_type=['real','plain'])

In [None]:
# Draw one background from dg; left as a bare expression so the notebook
# presumably shows the returned image as the cell result.
dg.sample_background()

In [223]:
# Open a single background file directly for inspection.
i = Image.open('backgrounds/009f50066527919646b952e12a6367cd.jpg')

In [224]:
# Size of the opened image as (width, height).
i.size

(1100, 619)