# Data preparation
## - Load images and create training dataset
## - Create training dataset by performing data augmentation

In [27]:
# import libraries
import csv
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical

In [28]:
# initiate path variables
# PROJ_PATH is the directory the notebook is run from.
PROJ_PATH = os.path.abspath(os.curdir)
# 1 -> write generated images to the training folder, 0 -> validation folder
GENERATE_FOR_TRAIN = 1
# Both paths keep their trailing separator because later cells build file
# paths by plain string concatenation.
PATH_TRAIN_DATASET = PROJ_PATH + '\\dataset\\train\\'
# Previously identical to the training path, so GENERATE_FOR_TRAIN = 0 would
# still have written into the training folder.
# NOTE(review): the folder name 'validation' is assumed - confirm it matches
# whatever the training code reads from.
PATH_VALIDATION_DATASET = PROJ_PATH + '\\dataset\\validation\\'

In [29]:
# Read teppen_card_database.csv to build the lookup tables:
#   dict_card_title : card code (first column) -> card title (second column)
#   arr_card_code / arr_card_title : the same two columns as parallel lists
# The card code is also used below as the image file name (without '.png').
dict_card_title = dict()
arr_card_title = []
arr_card_code = []
with open(PROJ_PATH + '\\dataset\\teppen_card_database.csv', newline='') as csvfile:
    row = csv.reader(csvfile, delimiter=',')
    for elem in row:
        # csv.reader yields an empty list for a blank line; skip it instead
        # of failing with IndexError on elem[0].
        if not elem:
            continue
        dict_card_title[elem[0]] = elem[1]
        arr_card_code.append(elem[0])
        arr_card_title.append(elem[1])

# Load one raw image per card code.
#   dict_card_image : card code -> image pixels as a numpy array
#   imgData         : all images stacked into one array
#   labels          : integer class index of each image (position of its
#                     card code in dict_card_title)
dict_card_image = dict()
numOfClasses = len(dict_card_title)
imgData = []
labels = []
for index, key in enumerate(dict_card_title):
    # The context manager closes the image file once its pixels have been
    # read; the pixels are converted a single time and reused for both
    # containers.
    with Image.open(PROJ_PATH + '\\dataset\\images\\' + key + '.png') as img:
        card_pixels = np.asarray(img)
    dict_card_image[key] = card_pixels
    imgData.append(card_pixels)
    labels.append(index)
imgData = np.array(imgData)
labels = np.array(labels)

In [30]:
# Converts a class vector (integers) to binary class matrix.
# The number of classes comes from the card database (numOfClasses) rather
# than a hard-coded 12, so the one-hot width follows the CSV when cards are
# added or removed.
y_label = to_categorical(labels, numOfClasses)
print('y_label shape = ', y_label.shape)
print('y_label \n', y_label)
print('labels \n', labels)

y_label shape =  (12, 12)
y_label 
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
labels 
 [ 0  1  2  3  4  5  6  7  8  9 10 11]


## An array that stores all the raw images for data augmentation

In [31]:
# Inspect the shape of the stacked raw-image array (one entry per card).
# The recorded output is a 4-tuple; presumably (images, height, width,
# channels) - confirm against the source images.
imgData.shape

(12, 498, 360, 4)

## Perform data augmentation and generate initial training images

In [41]:
# How many augmented images to produce from each raw card image.
num_of_samples_to_generate = 20

# Destination root for the generated images: the training folder when
# GENERATE_FOR_TRAIN is truthy, the validation folder otherwise.
if GENERATE_FOR_TRAIN:
    output_path = PATH_TRAIN_DATASET
else:
    output_path = PATH_VALIDATION_DATASET

def generateData(output_path, key, index, save_directory):
    """Write augmented copies of one raw card image to disk.

    Pulls ``num_of_samples_to_generate`` batches of size one from the
    module-level ``datagen``; every batch pulled is saved by the generator
    into ``save_directory`` with ``key`` as the file name prefix.

    Args:
        output_path: Root dataset directory; created if missing.
        key: Card code, used as the prefix of the saved file names.
        index: Position of the card in ``imgData`` / ``y_label``.
        save_directory: Directory the augmented images are written to;
            created if missing.
    """
    os.makedirs(output_path, exist_ok=True)
    os.makedirs(save_directory, exist_ok=True)

    # Wrap the single image / label in a batch dimension of one.
    x = np.array([imgData[index]])
    y = np.array([y_label[index]])

    flow = datagen.flow(x, y, batch_size=1, shuffle=True,
                        save_prefix=key, save_to_dir=save_directory)
    # The flow iterator never ends on its own, and an image is written each
    # time a batch is pulled. Pull exactly the requested number of batches:
    # the previous "count > limit" check ran after the batch had already
    # been produced and therefore saved one image too many.
    for _ in range(num_of_samples_to_generate):
        next(flow)

# data augmentation on
# 1) brightness
# 2) channel shift
datagen = ImageDataGenerator(
    rescale=1./255,
    brightness_range=(0.3, 0.7),
    channel_shift_range=100,
    fill_mode='nearest')

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(output_path, exist_ok=True)
for index, key in enumerate(dict_card_title):
    # create one folder per card code
    path_card_folder = output_path + key + '\\'
    os.makedirs(path_card_folder, exist_ok=True)
    # generate data and save into the folder
    generateData(output_path, key, index, path_card_folder)

## Perform another data augmentation to rename and resize training images

In [42]:
# additional augmentation on
# 1) resizing to get different resolution
#
# Every image found in each card folder under output_path is re-saved at
# several reduced sizes as '<folder>_<index>_<scale>.png', after which the
# source image is deleted.

# Divisors applied to the original width and height (1 keeps the size).
target_shrink_scale = [1, 2, 3, 4, 5]

print('Starting to retrieve all the images to resize and save...')
for directory in os.listdir(output_path):
    # Only card folders are processed; a stray file directly under
    # output_path would otherwise make os.listdir() fail.
    if not os.path.isdir(output_path + directory):
        continue
    save_path = output_path + directory + '\\'
    # os.listdir() returns a snapshot, so files saved below are not revisited.
    for index, filename in enumerate(os.listdir(save_path)):
        # make sure to only process images (every extension includes its dot)
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        file_to_remove = save_path + filename

        # Copy the pixels into memory and close the file straight away so
        # the source can be safely overwritten or removed afterwards.
        with Image.open(file_to_remove) as source_img:
            img = source_img.copy()

        width, height = img.size

        # save processed images with a new filename and size
        written_filenames = set()
        for new_scale in target_shrink_scale:
            new_size = (int(width / new_scale), int(height / new_scale))
            resized_img = img.resize(new_size)
            save_filename = directory+'_'+str(index)+'_'+str(new_scale)+'.png'
            resized_img.save(save_path + save_filename)
            written_filenames.add(save_filename)

        # If the source happened to have the same name as one of its resized
        # outputs it has just been overwritten by that output; deleting it
        # now would throw the new image away.
        if filename in written_filenames:
            continue

        # remove the original training image that was used to generate new images
        # to ensure filenames are incremental
        print('Removing file['+file_to_remove+']')
        os.remove(file_to_remove)
print('Completed!')

ing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_3122.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_3595.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_3793.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_4108.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_430.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_4550.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_5375.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_6315.png]
Removing file[d:\Work\Personal projects\teppen-deck-predict\dataset\train\CHUN-LI 004\CHUN-LI 004_0_6491.png]
Removing file[d: