# Write Images

1. Keras CNN requires ImageDataGenerator
1. Difficulties using raw dataframes
1. Write each data frame row (400 pixel values) as a 20x20 PNG image

## Imports and Globals

In [5]:
from collections import namedtuple
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import font_utils.load_font as LF

# Location of the combined font CSV that this notebook reads.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
TRIMMED_PATH='/data/udel-ms-data-science/math-637/project-1/char-fonts/trimmed'
ALL_FONTS_CSV=f'{TRIMMED_PATH}/all_fonts.csv'

# Pixel column names in row-major order: r0c0, r0c1, ..., r19c19 (20 * 20 = 400 names).
DATA_COLS=[f'r{row}c{col}' for row in range(20) for col in range(20)]
# Target column name, kept as a one-element list.
TARGET_COL=['font_class']
# The remaining descriptive (non-pixel, non-target) columns.
META_COLS=['m_label', 'font', 'variant']

# Root directory under which address_dfs builds its per-font output paths.
IMAGE_ROOT_DIR='/data/udel-ms-data-science/math-637/project-1/char-fonts/images'

## Read All Font Data

1. Randomly shuffle the data first to mix the classes within the data set
1. m_label - the character encoded as ASCII
1. font
1. font variant
1. font_class (TARGET)
    1. 1 - The font character is the given font class
    1. 0 - The font character is not the given font class
1. r0c0...r19c19 (400 elements in raw image data)

In [6]:
# Load the combined font table and drop the 'Unnamed: 0' column that comes in with the CSV.
all_df = pd.read_csv(ALL_FONTS_CSV).drop('Unnamed: 0', axis=1)

# Number of non-null m_label entries per font class.
class_counts = dict(all_df.groupby(by='font_class').count().m_label.items())

# Rows of all_df belonging to each font class.
BY_CLASS = {c:all_df.loc[all_df.font_class == c] for c in ['mo', 'os', 'ss', 'tr']}

# For each class, the other classes that supply its negative examples.
WITHOUT = {
    'mo' : ['os', 'ss', 'tr'],
    'os' : ['mo', 'ss', 'tr'],
    'ss' : ['mo', 'os', 'tr'],
    'tr' : ['mo', 'os', 'ss']
}

# A single seeded generator shared by every draw below, so that running this
# cell on a fresh kernel always produces the same balanced, shuffled frames.
sample_rng = np.random.RandomState(42)

# Build one binary data set per font class: every row of the class itself
# (label 1) plus roughly a third as many rows from each other class (label 0).
RAW_INPUT_DICT = dict()
for k,v in class_counts.items():
    one_third = int(v/3)
    others = list()
    for c in WITHOUT[k]:
        others.append(
            BY_CLASS[c]
            .sample(frac=one_third/class_counts[c], random_state=sample_rng)
            .reset_index(drop=True)
        )
    others.append(BY_CLASS[k])
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    t_df = pd.concat(others).sample(frac=1, random_state=sample_rng).reset_index(drop=True)
    # Recode the string class label as 1 (this class) / 0 (any other class).
    t_df['font_class'] = np.where(t_df.font_class == k, 1, 0)
    t_df['font_class'] = t_df['font_class'].astype(int)
    RAW_INPUT_DICT[k] = t_df

for c, df in RAW_INPUT_DICT.items():
    print(f'{c} {df.shape}')

mo (3470, 404)
os (2726, 404)
ss (5702, 404)
tr (2479, 404)


In [17]:
# Dry run of an 80/20 positional split on the 'os' frame.
junk = RAW_INPUT_DICT['os']
nrows = len(junk)
thr = int(0.8 * nrows)
print(f'Shape {junk.shape} : threshold {thr}')

# Rows before the threshold go to train, the rest to test.
train, test = junk.iloc[:thr], junk.iloc[thr:]

print(f'Shape train {train.shape} Shape test {test.shape}')

Shape (2726, 404) : threshold 2180
Shape train (2180, 404) Shape test (546, 404)


In [18]:
def split_df(df, ratio):
    """Split a data frame by row position into a leading and a trailing part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are taken in their current order (no shuffling).
    ratio : float
        Fraction of the rows that goes into the first part. The cut position
        is ``int(ratio * number_of_rows)``, i.e. it is truncated.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` -- the rows before the cut and the rows from the cut on.
    """
    cut = int(ratio * len(df))
    head, tail = df.iloc[:cut], df.iloc[cut:]
    return head, tail

def reshape_img(df, cols=None, shape=(20, 20)):
    """Return the pixel columns of ``df`` as a stack of 2-D image arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one image per row, its pixels spread over ``cols``.
    cols : list of str, optional
        Pixel column names in row-major order. Defaults to ``DATA_COLS``
        (``r0c0`` ... ``r19c19``).
    shape : tuple of int, optional
        ``(height, width)`` of a single image. Defaults to ``(20, 20)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), height, width)``; entry ``[i, r, c]`` is the
        value of the column at position ``r * width + c`` of ``cols`` in row ``i``.

    Raises
    ------
    ValueError
        Raised by numpy if ``len(cols)`` does not equal ``height * width``.
    """
    if cols is None:
        cols = DATA_COLS
    pixels = df[cols].to_numpy()
    # Use an explicit leading dimension (rather than -1) so an empty frame
    # reshapes cleanly to (0, height, width).
    return pixels.reshape(len(df), *shape)
    
def address_dfs(tag, d):
    """Pair every split frame in ``d`` with the directory it belongs in.

    The directory layout is ``{IMAGE_ROOT_DIR}/{tag}/{train|test}/{class_a|class_b}``,
    where ``class_a`` holds the target rows and ``class_b`` the misses.

    Parameters
    ----------
    tag : str
        Name of the per-font sub-directory below ``IMAGE_ROOT_DIR``.
    d : dict
        Maps the split names ``'target_train'``, ``'target_test'``,
        ``'misses_train'`` and ``'misses_test'`` to their data frames.

    Returns
    -------
    dict
        Maps each split name to ``(o_path, img_df)``, where ``img_df`` is
        whatever ``reshape_img`` returns for that frame.

    Raises
    ------
    ValueError
        If ``d`` contains a split name other than the four listed above.

    Notes
    -----
    TODO: nothing is written to disk here yet; this only resolves the output
    path and reshapes the data for each split.
    """
    sub_dirs = {
        'target_train': 'train/class_a',
        'target_test': 'test/class_a',
        'misses_train': 'train/class_b',
        'misses_test': 'test/class_b',
    }
    # Interpolate the tag itself (the original used the literal text "tag").
    font_path = f'{IMAGE_ROOT_DIR}/{tag}'

    addressed = dict()
    # Iterate over (name, frame) pairs; iterating the dict directly yields keys only.
    for k, v in d.items():
        if k not in sub_dirs:
            # Fail loudly instead of carrying over the previous split's path.
            raise ValueError(f'Unknown split name: {k!r}')
        o_path = f'{font_path}/{sub_dirs[k]}'
        img_df = reshape_img(v)
        addressed[k] = (o_path, img_df)
    return addressed

In [10]:
for c, df in RAW_INPUT_DICT.items():
    # Rows labelled 1 are the font class itself; rows labelled 0 are the misses.
    target_df = df[df.font_class == 1]
    misses_df = df[df.font_class == 0]
    print(f'Shape of target {target_df.shape} of misses {misses_df.shape}')

    # 80/20 positional split of each group.
    t1, t2 = split_df(target_df, 0.8)
    m1, m2 = split_df(misses_df, 0.8)

    # Keyed by the split names that address_dfs looks for.
    splits = {
        'target_train': t1,
        'target_test': t2,
        'misses_train': m1,
        'misses_test': m2,
    }

    address_dfs(c, splits)

Shape of target (1736, 404) of misses (1734, 404)
Shape of target (1364, 404) of misses (1362, 404)
Shape of target (2852, 404) of misses (2850, 404)
Shape of target (1240, 404) of misses (1239, 404)


## Split Data Into Train/Test

1. X cols: r0c0,...,r19c19
1. Y cols: font_class
1. Others: m_label, font, variant

In [7]:
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# Per font class: an 80/20 train/test split of the pixel columns (x) and the target (y).
IN_DATA = dict()
for k,df in RAW_INPUT_DICT.items():
    x = df.loc[:, DATA_COLS]
    y = df.loc[:, TARGET_COL]
    m = df.loc[:, META_COLS]  # selected here but not stored in IN_DATA

    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=0.2, random_state=SPLIT_SEED
    )
    # No separate validation set is produced; if one is needed, split it off
    # X_train / Y_train in the same way.

    IN_DATA[k] = {
        'train_x': X_train,
        'train_y': Y_train,
        'test_x': X_test,
        'test_y': Y_test,
    }