# Initialize a fresh test data dir

Includes horiz and vertical symmetry manipulations

In [1]:
import os
import shutil
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from numpy import asarray
import h5py
import json
from glob import glob
import json
import sys
import random
import cv2
import PIL
from PIL import Image, ImageEnhance
import matplotlib.pyplot as plt

In [2]:
from train import train
from utilities import (init_data_dir, count_all_files, engineer_images_by_num,
                       parse_bad_images, engineer_bad_images, zipdir, process_image)

In [3]:
np.random.seed(123)
tf.random.set_seed(123)

In [4]:
NUMERALS = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"]

In [5]:
hand_dir = './hand_drawn/all'

In [6]:
# initial target file count - naive
target_counts_8010 = {'i': 500, 'ii': 930, 'iii': 930, 'iv': 930,
                      'v': 500, 'vi': 930, 'vii': 930, 'viii': 930,
                      'ix': 930, 'x': 500}

target_counts_7020 = {'i': 450, 'ii': 810, 'iii': 810, 'iv': 810,
                      'v': 450, 'vi': 810, 'vii': 810, 'viii': 810,
                      'ix': 810, 'x': 450}

In [7]:
ENG_KWARGS = {
    'i': {'horiz': True, 'vert': True},
    'ii': {'horiz': True, 'vert': True},
    'iii': {'horiz': True, 'vert': True},
    'iv': {'horiz': False, 'vert': False, 'target': 'vi'},
    'v': {'horiz': True, 'vert': False},
    'vi': {'horiz': False, 'vert': False, 'target': 'iv'},
    'vii': {'horiz': False, 'vert': False},
    'viii': {'horiz': False, 'vert': False},
    'ix': {'horiz': False, 'vert': True},
    'x': {'horiz': True, 'vert': True},
}

In [8]:
with open('./bad_image_record_dcai_gcb_07.json', 'r') as f:
    bad_images = json.load(f)
bad_images = {int(k): v for k, v in bad_images.items()}
i_last = sorted([k for k in bad_images.keys()])[-1]
bad_images = bad_images[i_last]
bad_images = [bi.replace('././dcai_gcb_07/dcai_gcb_07', '{DATA_DIR}') for bi in bad_images]
bad_images[0:5], len(bad_images), len(set(bad_images))

(['{DATA_DIR}/val/i/b178641c-ce5d-11eb-b317-38f9d35ea60f.png',
  '{DATA_DIR}/val/i/b19f714c-ce5d-11eb-b317-38f9d35ea60f.png',
  '{DATA_DIR}/val/i/b19ff702-ce5d-11eb-b317-38f9d35ea60f.png',
  '{DATA_DIR}/val/i/b1a7c54a-ce5d-11eb-b317-38f9d35ea60f.png',
  '{DATA_DIR}/val/i/b1adda0c-ce5d-11eb-b317-38f9d35ea60f.png'],
 786,
 376)

In [9]:
hand_images = glob(hand_dir + '/*.png')
hand_images[0:5], len(hand_images)

(['./hand_drawn/all/vi_hand_11.png',
  './hand_drawn/all/ii_hand_3.png',
  './hand_drawn/all/iv_hand_1.png',
  './hand_drawn/all/ix_hand_3.png',
  './hand_drawn/all/viii_hand_3.png'],
 74)

In [10]:
def run(data_dir, target_counts, n_hand_images, bad_images, hand_images, eng_kwargs=ENG_KWARGS, target_count_tot=9990):
    
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    shutil.copytree('./data_baseline_clean/data_baseline_clean', data_dir)
    shutil.rmtree(data_dir +'/ignore')
    
    engineer_images_by_num(eng_kwargs, data_dir, target_counts)
    
    _, file_count = count_all_files(data_dir)
    n_opt_files = target_count_tot - file_count - n_hand_images
    print('Aiming for {} optimization files'.format(n_opt_files))
    bad_images = [b.format(DATA_DIR=data_dir) for b in bad_images]
    engineer_bad_images(bad_images, n_opt_files, data_dir)
    
    n_hand_images_train = int(n_hand_images * 0.9)
    n_hand_images_valid = n_hand_images - n_hand_images_train
    
    ifile = 0
    ifps = np.random.choice(np.arange(len(hand_images)), n_hand_images_train, replace=True)
    for i in ifps:
        fp = hand_images[i]
        num = [n for n in NUMERALS if os.path.basename(fp).startswith('{}_'.format(n))][0]
        fp_out = data_dir + '/train/{0}/{0}_hand_eng_{1}.png'.format(num, ifile)
        process_image(fp, fp_out, enhance='random', contrast=1, rotate='random', erode='random', dilate='random', show=False)
        ifile += 1
        
    ifps = np.random.choice(np.arange(len(hand_images)), n_hand_images_valid, replace=True)
    for i in ifps:
        fp = hand_images[i]
        num = [n for n in NUMERALS if os.path.basename(fp).startswith('{}_'.format(n))][0]
        fp_out = data_dir + '/val/{0}/{0}_hand_eng_{1}.png'.format(num, ifile)
        process_image(fp, fp_out, enhance='random', contrast=1, rotate='random', erode='random', dilate='random', show=False)
        ifile += 1
        
    n_count_final, tot_final = count_all_files(data_dir)

In [11]:
jobs = [
    {'data_dir': './dcai_gcb_09_00/dcai_gcb_09_00',
     'target_counts': target_counts_8010, 
     'n_hand_images': 1000, 
     'bad_images': list(set(bad_images)), 
     'hand_images': hand_images},
    {'data_dir': './dcai_gcb_09_01/dcai_gcb_09_01',
     'target_counts': target_counts_8010, 
     'n_hand_images': 500, 
     'bad_images': list(set(bad_images)), 
     'hand_images': hand_images},
    {'data_dir': './dcai_gcb_09_02/dcai_gcb_09_02',
     'target_counts': target_counts_7020, 
     'n_hand_images': 1000, 
     'bad_images': list(set(bad_images)), 
     'hand_images': hand_images},
    {'data_dir': './dcai_gcb_09_03/dcai_gcb_09_03',
     'target_counts': target_counts_7020, 
     'n_hand_images': 2000, 
     'bad_images': list(set(bad_images)), 
     'hand_images': hand_images},
]

for i, kwargs in enumerate(jobs):
    print('\nRunning {}: {}'.format(i, kwargs['data_dir']))
    run(**kwargs)


Running 0: ./dcai_gcb_09_00/dcai_gcb_09_00
Numeral "i" has 500 files
Numeral "ii" has 930 files
Numeral "iii" has 930 files
Numeral "iv" has 930 files
Numeral "v" has 500 files
Numeral "vi" has 929 files
Numeral "vii" has 930 files
Numeral "viii" has 930 files
Numeral "ix" has 930 files
Numeral "x" has 500 files
Total of 8009 files
Aiming for 981 optimization files
Engineering bad images with target count of 981
Numeral "i" has 500 files
Numeral "ii" has 930 files
Numeral "iii" has 930 files
Numeral "iv" has 930 files
Numeral "v" has 500 files
Numeral "vi" has 929 files
Numeral "vii" has 930 files
Numeral "viii" has 930 files
Numeral "ix" has 930 files
Numeral "x" has 500 files
Total of 8009 files
Numeral "i" has 572 files
Numeral "ii" has 1057 files
Numeral "iii" has 1023 files
Numeral "iv" has 1063 files
Numeral "v" has 543 files
Numeral "vi" has 1026 files
Numeral "vii" has 1049 files
Numeral "viii" has 1066 files
Numeral "ix" has 1038 files
Numeral "x" has 553 files
Total of 8990 

In [12]:
for job in jobs:
    _, total = count_all_files(job['data_dir'])
    assert total <= 9990

Numeral "i" has 572 files
Numeral "ii" has 1137 files
Numeral "iii" has 1023 files
Numeral "iv" has 1327 files
Numeral "v" has 605 files
Numeral "vi" has 1191 files
Numeral "vii" has 1277 files
Numeral "viii" has 1123 files
Numeral "ix" has 1182 files
Numeral "x" has 553 files
Total of 9990 files
Numeral "i" has 591 files
Numeral "ii" has 1164 files
Numeral "iii" has 1058 files
Numeral "iv" has 1262 files
Numeral "v" has 571 files
Numeral "vi" has 1135 files
Numeral "vii" has 1271 files
Numeral "viii" has 1165 files
Numeral "ix" has 1183 files
Numeral "x" has 590 files
Total of 9990 files
Numeral "i" has 588 files
Numeral "ii" has 1157 files
Numeral "iii" has 1017 files
Numeral "iv" has 1326 files
Numeral "v" has 578 files
Numeral "vi" has 1152 files
Numeral "vii" has 1265 files
Numeral "viii" has 1159 files
Numeral "ix" has 1206 files
Numeral "x" has 542 files
Total of 9990 files
Numeral "i" has 512 files
Numeral "ii" has 1131 files
Numeral "iii" has 906 files
Numeral "iv" has 1470 fi