In [1]:
# -*- coding: utf-8 -*- 

#------------------------------------ Imports ----------------------------------#

# Import python imaging libs
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageFilter

# Import noise creating library
from noisify.recipes import human_error, machine_error

# Import operating system lib and time
import os
import time

# Import random generator
from random import randint
import numpy as np

# import csv reader and pandas
import csv
import pandas as pd

In [2]:
#------------------------------------ Cleanup ----------------------------------#	
def Cleanup(font_directory=None, output_directory=None):
    """Remove the macOS ``.DS_Store`` file from the font directory and empty
    the output directory.

    Args:
        font_directory: Directory to clear of ``.DS_Store``. Defaults to the
            module-level ``font_dir``.
        output_directory: Directory whose regular files are deleted.
            Sub-directories are left untouched. Defaults to the module-level
            ``out_dir``. A directory that does not exist is treated as
            already clean.

    Returns:
        None.
    """
    # Fall back to the notebook-level settings only when no path was passed,
    # so existing ``Cleanup()`` calls keep working.
    fonts_path = font_dir if font_directory is None else font_directory
    output_path = out_dir if output_directory is None else output_directory

    print("Cleaning up...", end="")

    # Delete ds_store file (os.path.join works with or without a trailing
    # separator on the directory)
    ds_store_path = os.path.join(fonts_path, '.DS_Store')
    if os.path.isfile(ds_store_path):
        os.unlink(ds_store_path)

    # Delete all files from output directory
    if os.path.isdir(output_path):
        for entry in os.listdir(output_path):
            entry_path = os.path.join(output_path, entry)
            if os.path.isfile(entry_path):
                os.unlink(entry_path)

    print("done.")
    return

In [3]:
#---------------------------------- Input and Output ---------------------------#

# Directory containing fonts; GenerateCharacters walks it and Cleanup removes
# '.DS_Store' from it. Cleanup appends the file name directly, so keep the
# trailing slash.
font_dir = './fonts/font_files/'

# Output directory; Cleanup deletes every regular file found directly in it.
out_dir = './train_data/chars/'

In [5]:
def duration_text(secs):
    """Format a duration given in seconds as a short clock-style string.

    The result is ``"<s>s"`` below one minute, ``"MM:<s>s"`` below one hour
    and ``"HH:MM:<s>s"`` otherwise. Seconds are rounded to two decimals and
    printed with ``str``, so trailing zeros are not padded (e.g. ``26.7``).
    Minutes, hours and the whole part of the seconds get a leading zero when
    they are a single digit.

    Args:
        secs: Duration in seconds (int or float).

    Returns:
        The formatted duration string.
    """
    def pad(text, whole):
        # Prefix a zero when the whole-number part prints as one character.
        return "0" + text if len(str(whole)) == 1 else text

    total = round(secs, 2)
    whole_minutes = int(total // 60)
    hours, minutes_left = divmod(whole_minutes, 60)
    seconds_left = round(total - whole_minutes * 60, 2)

    seconds_part = pad(str(seconds_left), int(seconds_left)) + "s"
    minutes_part = pad(str(minutes_left), minutes_left)

    if hours > 0:
        return ":".join([pad(str(hours), hours), minutes_part, seconds_part])
    if whole_minutes > 0:
        return minutes_part + ":" + seconds_part
    # Under a minute the seconds are shown without zero padding.
    return str(seconds_left) + "s"

In [4]:
# Read every record of chars.csv (header row included) as a list of strings.
with open('chars.csv', newline='', encoding='utf-8') as csvfile:
    dataset = list(csv.reader(csvfile, delimiter=',', quotechar='"'))

# Drop the header record, name the three columns, and index the table by the
# first column ('index').
chars_df = pd.DataFrame(dataset[1:], columns=['index', 'char', 'name']).set_index('index')

In [6]:
#------------------------------ Generate Characters ----------------------------#
def GenerateCharacters():
    """Render every character of ``chars_df`` with every font under ``font_dir``.

    For each font file, character, positive font size, background color,
    rotation and quality, one ``image_size`` x ``image_size`` image is drawn
    with the character centered, rotated, and (for quality ``"noisy"``) passed
    through the noisify error recipes. Settings are read from the
    module-level names ``font_dir``, ``chars_df``, ``font_sizes``,
    ``background_colors``, ``rotations``, ``qualities``, ``image_size`` and
    ``errors``.

    All images are accumulated in memory; nothing is written to disk and no
    per-image metadata (font, size, rotation, ...) is collected.

    Files that cannot be loaded as a font are reported and skipped.

    Returns:
        tuple: ``(X_dataset, y_dataset)`` — two parallel lists. ``X_dataset``
        holds ``np.asarray`` of each generated image, ``y_dataset`` holds the
        matching ``[index]`` label taken from the index of ``chars_df``.
    """
    durations = []

    X_dataset = []
    y_dataset = []

    # Process the font files
    for dirname, _dirnames, filenames in os.walk(font_dir):
        size = len(filenames)
        item = 1
        # For each font do
        for filename in filenames:
            print(item, "/", size, ":", filename, "... ", end="")
            start_time = time.time()
            # Get font full file path
            font_resource_file = os.path.join(dirname, filename)

            # Load the font once per usable size instead of once per image;
            # the loaded font depends only on the file and the size.
            try:
                fonts = {
                    font_size: ImageFont.truetype(font_resource_file, font_size, encoding='unic')
                    for font_size in font_sizes
                    if font_size > 0
                }
            except OSError as load_error:
                # Not a readable font (e.g. a stray .DS_Store): skip the file
                # rather than aborting the whole run.
                print("skipped (", load_error, ").", sep="")
                item += 1
                continue

            # For each character do
            for index, row in chars_df.iterrows():
                character = row['char']

                # For each font size do
                for font_size in font_sizes:
                    if font_size <= 0:
                        continue

                    font = fonts[font_size]

                    # Get character width and height; the centered position
                    # is the same for every color/rotation/quality variant.
                    # NOTE(review): FreeTypeFont.getsize was removed in
                    # Pillow 10 — confirm the pinned Pillow version.
                    (font_width, font_height) = font.getsize(character)
                    x = (image_size - font_width)/2
                    y = (image_size - font_height)/2

                    # For each background color do
                    for background_color in background_colors:
                        # For each rotation option do
                        for rotation in rotations:
                            # For each type of quality do
                            for quality in qualities:
                                # Create character image :
                                # mode, image size, background color
                                # NOTE(review): the mode is 'RGB' although the
                                # colors are single integers; confirm whether
                                # grayscale ('L') was intended.
                                char_image = Image.new('RGB', (image_size, image_size),
                                                       background_color)

                                # Draw character image
                                draw = ImageDraw.Draw(char_image)

                                # Draw text : Position, String,
                                # Options = Fill color, Font
                                # Fill is roughly the inverse of the background
                                # plus a small random offset of 0-10.
                                draw.text((x, y), character,
                                          (245-background_color) + randint(0, 10),
                                          font=font)

                                # Set Character Rotation
                                char_image = char_image.rotate(angle=rotation, fillcolor=background_color)

                                # If quality is set to noisy, add noise to the image
                                if quality == "noisy":
                                    combined_noise = machine_error(errors[0]) + human_error(errors[1])
                                    # The noisifier yields results; keep the first one.
                                    char_image = list(combined_noise(char_image))[0]

                                X_dataset.append(np.asarray(char_image))
                                y_dataset.append([index])

            end_time = time.time()
            duration = end_time - start_time
            durations.append(duration)
            # ETA = remaining files in this directory x mean time per file so far
            avg_duration = sum(durations) / len(durations)
            eta = (size - item) * avg_duration
            eta_text = duration_text(eta)
            print(round(duration,2), " s. ETA: ", eta_text, ".", sep="")

            item += 1

    return X_dataset, y_dataset

In [7]:
#------------------------------------- Colors ----------------------------------#

# Background shades, grouped by tone (integer color values)
white_colors = (215, 225, 235, 245)
black_colors = (0, 10, 20, 30)
gray_colors = (135, 145, 155)

# All background shades in one tuple: whites first, then blacks, then grays
background_colors = (*white_colors, *black_colors, *gray_colors)

#-------------------------------------- Sizes ----------------------------------#

# Font sizes each character is rendered at
font_sizes = 18, 24

# Side length of the square output image
image_size = 32

#-------------------------------------- Augmentation Params --------------------#

# Blur factor
blur = 1

# Quality variants generated per image; "noisy" triggers error generation
qualities = ["clean", "noisy"]

# Error levels: first entry feeds machine_error, second feeds human_error
errors = 20, 45

# Rotation angles applied to each rendered character
rotations = -45, -5, 0, 5, 45

In [11]:
#-------------------------------------- Main -----------------------------------#

# Do cleanup (currently disabled). Uncomment to delete '.DS_Store' from
# font_dir and every regular file in out_dir before generating.
# Cleanup()

# Generate characters: X_dataset receives the image arrays and y_dataset the
# matching labels; everything is held in memory.
X_dataset, y_dataset = GenerateCharacters()

1 / 85 : AbrilFatface-Regular.otf ... 25.29 s. ETA: 35:24.11s.
2 / 85 : Aileron-Bold.otf ... 24.85 s. ETA: 34:40.68s.
3 / 85 : Aileron-Italic.otf ... 24.37 s. ETA: 33:56.53s.
4 / 85 : Aileron-Light.otf ... 24.53 s. ETA: 33:25.41s.
5 / 85 : Aileron-Regular.otf ... 24.46 s. ETA: 32:55.85s.
6 / 85 : Aller_Bd.ttf ... 44.44 s. ETA: 36:51.12s.
7 / 85 : Aller_It.ttf ... 43.39 s. ETA: 39:14.72s.
8 / 85 : Aller_Rg.ttf ... 42.76 s. ETA: 40:45.55s.
9 / 85 : Andada-Bold.otf ... 26.8 s. ETA: 39:31.95s.
10 / 85 : Andada-Italic.otf ... 26.45 s. ETA: 38:25.07s.
11 / 85 : Andada-Regular.otf ... 26.63 s. ETA: 37:26.7s.
12 / 85 : ArchivoBlack.otf ... 25.84 s. ETA: 36:28.85s.
13 / 85 : arial-ro.ttf ... 24.59 s. ETA: 35:29.01s.
14 / 85 : ArialUnicodeMS.ttf ... 37.26 s. ETA: 35:38.46s.
15 / 85 : Baloo-Regular.ttf ... 36.94 s. ETA: 35:40.17s.
16 / 85 : Bitter-Bold.otf ... 31.27 s. ETA: 35:12.6s.
17 / 85 : Bitter-Italic.otf ... 26.44 s. ETA: 34:25.29s.
18 / 85 : Bitter-Regular.otf ... 27.35 s. ETA: 33:43.66s.

In [12]:
# Bundle the image list and the label list into one two-element list for
# saving. This stores references only; the images themselves are not copied.
data = [X_dataset, y_dataset]

MemoryError: 

In [14]:
# np.save on the [images, labels] list would first have to merge two
# differently shaped sequences into a single array, which yields an object
# array or an error. Store them as two named arrays in one .npz archive.
# Load with: archive = np.load('full_char_dataset.npz'); archive['X'], archive['y']
np.savez('full_char_dataset.npz', X=np.asarray(data[0]), y=np.asarray(data[1]))

MemoryError: 

In [9]:
# Build a table from files_data.
# NOTE(review): files_data is not assigned in any cell visible here, so this
# cell presumably relied on state from an earlier kernel session — confirm
# where it should come from before re-running on a fresh kernel.
output_df = pd.DataFrame(files_data)

In [10]:
# Show the table; the last expression of a cell is rendered as its output.
output_df

Unnamed: 0,index,char,char_name,font_file,font_size,rotation,quality,background_color,file_name
0,0,0,0,AbrilFatface-Regular.otf,18,-45,clean,215,./chars/1_AbrilFatface-Regular.otf_fs_18_bc_21...
1,0,0,0,AbrilFatface-Regular.otf,18,-45,noisy,215,./chars/2_AbrilFatface-Regular.otf_fs_18_bc_21...
2,0,0,0,AbrilFatface-Regular.otf,18,-5,clean,215,./chars/3_AbrilFatface-Regular.otf_fs_18_bc_21...
3,0,0,0,AbrilFatface-Regular.otf,18,-5,noisy,215,./chars/4_AbrilFatface-Regular.otf_fs_18_bc_21...
4,0,0,0,AbrilFatface-Regular.otf,18,0,clean,215,./chars/5_AbrilFatface-Regular.otf_fs_18_bc_21...
...,...,...,...,...,...,...,...,...,...
1832595,97,_,sym_underscore,Znikomit.otf,24,0,noisy,155,./chars/1832596_Znikomit.otf_fs_24_bc_155_rot_...
1832596,97,_,sym_underscore,Znikomit.otf,24,5,clean,155,./chars/1832597_Znikomit.otf_fs_24_bc_155_rot_...
1832597,97,_,sym_underscore,Znikomit.otf,24,5,noisy,155,./chars/1832598_Znikomit.otf_fs_24_bc_155_rot_...
1832598,97,_,sym_underscore,Znikomit.otf,24,45,clean,155,./chars/1832599_Znikomit.otf_fs_24_bc_155_rot_...


In [11]:
# Write the table to CSV; index=False leaves the DataFrame's row index out of
# the file.
output_df.to_csv('final_files_mapping.csv', index=False)