# Initialize a fresh test data dir

Includes horizontal and vertical symmetry manipulations.

In [1]:
import os
import shutil
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from numpy import asarray
import h5py
import json
from glob import glob
import json
import sys
import random
import cv2
import PIL
from PIL import Image, ImageEnhance
import matplotlib.pyplot as plt

In [2]:
from train import train
from utilities import (init_data_dir, count_all_files, engineer_images_by_num,
                       parse_bad_images, engineer_bad_images, zipdir, process_image)

In [3]:
# Seed every random number generator this notebook imports so that repeated
# runs draw the same random numbers. `random` is imported above but was
# previously left unseeded.
# NOTE(review): whether the `utilities` helpers draw from `random`, NumPy or
# both is not visible here - seeding all of them covers either case.
random.seed(123)
np.random.seed(123)
tf.random.set_seed(123)

In [4]:
# Roman-numeral class labels in ascending order. Later cells use them both as
# the filename prefix of hand-drawn images and as the class sub-directory name.
NUMERALS = 'i ii iii iv v vi vii viii ix x'.split()

In [5]:
# Working copy of the dataset that this notebook rebuilds, plus its two splits.
data_dir = './dcai_gcb_08/dcai_gcb_08'
train_dir = '{}/train'.format(data_dir)
val_dir = '{}/val'.format(data_dir)

# Folder holding the hand-drawn source images.
hand_dir = './hand_drawn/'

# Size budget: the total number of files the finished dataset should contain,
# and how many of those are generated from the hand-drawn images.
n_hand_images = 1500
target_count_tot = 9995

In [6]:
# Start from a pristine copy of the cleaned baseline data: drop any previous
# working copy, then copy the baseline into `data_dir`.
# The destination is `data_dir` itself (not a second hard-coded literal) so the
# folder that is deleted and the folder that is recreated can never diverge.
# copytree returns the destination path, which is what the cell displays.
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
shutil.copytree('./data_baseline_clean/data_baseline_clean', data_dir)

'./dcai_gcb_08/dcai_gcb_08'

In [7]:
# Initial (naive) per-numeral file-count target: the same 500 for every class.
target_counts = dict.fromkeys(
    ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x'],
    500,
)

In [8]:
# Per-numeral options handed to engineer_images_by_num.
# NOTE(review): 'horiz' / 'vert' presumably say whether a numeral may be
# mirrored horizontally / vertically, and 'target' presumably names the class
# the manipulated image is filed under ('iv' <-> 'vi') - confirm in utilities.
# Each entry gets its own dict so no two numerals share a mutable object.
eng_kwargs = dict(
    i=dict(horiz=True, vert=True),
    ii=dict(horiz=True, vert=True),
    iii=dict(horiz=True, vert=True),
    iv=dict(horiz=False, vert=False, target='vi'),
    v=dict(horiz=True, vert=False),
    vi=dict(horiz=False, vert=False, target='iv'),
    vii=dict(horiz=False, vert=False),
    viii=dict(horiz=False, vert=False),
    ix=dict(horiz=False, vert=True),
    x=dict(horiz=True, vert=True),
)

In [9]:
# Run the per-numeral image engineering on the working copy in `data_dir`,
# using the options in `eng_kwargs` and the per-class goals in `target_counts`.
# NOTE(review): presumably this adds manipulated images until each numeral
# reaches its target count - confirm against utilities.engineer_images_by_num.
engineer_images_by_num(eng_kwargs, data_dir, target_counts)

In [10]:
# Remaining budget for "optimization" images: the overall target minus the
# files already present and minus the hand-drawn images added further down.
# Only the second element returned by count_all_files (the total) is needed;
# indexing it avoids binding `_`, which in IPython would override the
# automatic last-output variable for the rest of the session.
file_count = count_all_files(data_dir)[1]
n_opt_files = target_count_tot - file_count - n_hand_images
print('Aiming for {} optimization files'.format(n_opt_files))

Numeral "i" has 500 files
Numeral "ii" has 500 files
Numeral "iii" has 500 files
Numeral "iv" has 500 files
Numeral "v" has 500 files
Numeral "vi" has 499 files
Numeral "vii" has 500 files
Numeral "viii" has 500 files
Numeral "ix" has 500 files
Numeral "x" has 500 files
Total of 4999 files
Aiming for 3496 optimization files


In [11]:
# Load the bad-image record written for the previous dataset (dcai_gcb_07).
# JSON object keys are always strings, so they are converted to int to make
# "latest entry" a numeric comparison rather than a lexicographic one.
# NOTE(review): the keys are presumably iteration/round indices - confirm
# against whatever writes this file.
with open('./bad_image_record_dcai_gcb_07.json', 'r') as f:
    bad_image_record = {int(k): v for k, v in json.load(f).items()}

# Keep only the most recent entry and point its paths at this notebook's
# working copy (dcai_gcb_08) instead of the previous one.
i_last = max(bad_image_record)
bad_images = [bi.replace('_gcb_07', '_gcb_08') for bi in bad_image_record[i_last]]
bad_images[0:5]

['././dcai_gcb_08/dcai_gcb_08/val/i/b1ac72d4-ce5d-11eb-b317-38f9d35ea60f.png',
 '././dcai_gcb_08/dcai_gcb_08/val/ii/b13e7b12-ce5d-11eb-b317-38f9d35ea60f.png',
 '././dcai_gcb_08/dcai_gcb_08/val/ii/b1448f7a-ce5d-11eb-b317-38f9d35ea60f.png',
 '././dcai_gcb_08/dcai_gcb_08/val/ii/b151b448-ce5d-11eb-b317-38f9d35ea60f.png',
 '././dcai_gcb_08/dcai_gcb_08/val/ii/b1590d9c-ce5d-11eb-b317-38f9d35ea60f.png']

In [12]:
# Generate `n_opt_files` engineered images from the bad-image list inside
# `data_dir`; the helper prints the per-numeral file counts before and after.
engineer_bad_images(bad_images, n_opt_files, data_dir)

Engineering bad images with target count of 3496
Numeral "i" has 500 files
Numeral "ii" has 500 files
Numeral "iii" has 500 files
Numeral "iv" has 500 files
Numeral "v" has 500 files
Numeral "vi" has 499 files
Numeral "vii" has 500 files
Numeral "viii" has 500 files
Numeral "ix" has 500 files
Numeral "x" has 500 files
Total of 4999 files
Numeral "i" has 614 files
Numeral "ii" has 1034 files
Numeral "iii" has 829 files
Numeral "iv" has 987 files
Numeral "v" has 571 files
Numeral "vi" has 1017 files
Numeral "vii" has 908 files
Numeral "viii" has 1058 files
Numeral "ix" has 830 files
Numeral "x" has 647 files
Total of 8495 files
Engineered 3496 bad images.


In [13]:
# Collect the hand-drawn source images.
# glob returns matches in a filesystem-dependent order; sorting makes the
# index-based random sampling in the next cell reproducible for a fixed seed.
# os.path.join also avoids the doubled slash that `hand_dir + '/*.png'`
# produced, because `hand_dir` already ends with '/'.
hand_images = sorted(glob(os.path.join(hand_dir, '*.png')))
len(hand_images)

69

In [14]:
# Create `n_hand_images` processed variants of the hand-drawn images and write
# them into the matching class folder under `train_dir`.
temp_dir ='./temp'  # not referenced by any cell visible in this notebook

# Sample source images with replacement: there are far fewer hand-drawn
# originals than variants requested.
ifps = np.random.choice(np.arange(len(hand_images)), n_hand_images,
                        replace=True)
for ifile, i in enumerate(ifps):
    fp = hand_images[i]

    # The class label is the filename prefix before the first underscore,
    # e.g. 'vii_<anything>.png' -> 'vii'.
    fname = os.path.basename(fp)
    matches = [n for n in NUMERALS if fname.startswith('{}_'.format(n))]
    if not matches:
        # Fail with the offending path instead of a bare IndexError.
        raise ValueError(
            'Cannot infer numeral from hand-drawn file name: {}'.format(fp))
    num = matches[0]

    # `ifile` is unique per output, so resampled originals never overwrite
    # one another.
    fp_out = train_dir + '/{0}/{0}_hand_eng_{1}.png'.format(num, ifile)

    process_image(fp, fp_out,
        enhance='random',
        contrast=1,
        rotate='random',
        erode='random',
        dilate=False,
        show=False,
        )

In [16]:
# Trim the training set so the dataset holds exactly `target_count_tot` files.
n_count_after, tot_after = count_all_files(data_dir)

# Clamp at zero: when the dataset is already at or below the target there is
# nothing to remove, and a negative sample size would make np.random.choice
# raise.
n_remove = max(tot_after - target_count_tot, 0)

# glob order is filesystem-dependent; sort so that the seeded index sample
# selects the same files on every machine.
fps = sorted(glob(train_dir + '/*/*.png'))
ifps = np.random.choice(np.arange(len(fps)), n_remove,
                        replace=False)

# Only training images are candidates for removal.
for i in ifps:
    fp = fps[i]
    os.remove(fp)

print('Removed {} files'.format(n_remove))

Numeral "i" has 614 files
Numeral "ii" has 1167 files
Numeral "iii" has 829 files
Numeral "iv" has 1298 files
Numeral "v" has 662 files
Numeral "vi" has 1241 files
Numeral "vii" has 1327 files
Numeral "viii" has 1130 files
Numeral "ix" has 1080 files
Numeral "x" has 647 files
Total of 9995 files
Removed 0 files


In [17]:
# Final sanity check: count_all_files prints the per-numeral breakdown and the
# overall total, which should equal `target_count_tot`.
# NOTE(review): the first returned value is presumably the per-numeral counts
# - confirm against utilities.count_all_files.
n_count_final, tot_final = count_all_files(data_dir)

Numeral "i" has 614 files
Numeral "ii" has 1167 files
Numeral "iii" has 829 files
Numeral "iv" has 1298 files
Numeral "v" has 662 files
Numeral "vi" has 1241 files
Numeral "vii" has 1327 files
Numeral "viii" has 1130 files
Numeral "ix" has 1080 files
Numeral "x" has 647 files
Total of 9995 files
