# Convolutional Neural Network

1. Compound pipeline
    1. Convolution Stage
    1. MaxPooling Stage
1. Multiple Stages
1. Data already ready into Pandas DataFrame

## Imports and Globals

In [1]:
from collections import namedtuple
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE

from keras import backend as K

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dropout, Flatten, Dense

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve

import font_utils.load_font as LF

TRIMMED_PATH='/data/udel-ms-data-science/math-637/project-1/char-fonts/trimmed'
ALL_FONTS_CSV=f'{TRIMMED_PATH}/all_fonts.csv'

DATA_COLS=[f'r{row}c{col}' for row in range(20) for col in range(20)]
TARGET_COL=['font_class']
META_COLS=['m_label', 'font', 'variant']

## Read All Font Data

1. Randomly shuffle the data first to mix the classes within the data set
1. m_label - the character encoded as ASCII
1. font
1. font variant
1. class_label (TARGET)
    1. 1 - The font character is the given font class
    1. 0 - The font character is not the given font class
1. r0c0...r19c19 (400 elements in raw image data)

In [2]:
all_df = pd.read_csv(ALL_FONTS_CSV).drop('Unnamed: 0', axis=1)

class_counts = {
    cc_i:cc_v for cc_i, cc_v in all_df.groupby(by='font_class').count().m_label.items()
}

BY_CLASS = {c:all_df.loc[all_df.font_class == c] for c in ['mo', 'os', 'ss', 'tr']}

WITHOUT = {
    'mo' : ['os', 'ss', 'tr'],
    'os' : ['mo', 'ss', 'tr'],
    'ss' : ['mo', 'os', 'tr'],
    'tr' : ['mo', 'os', 'ss']    
}

RAW_INPUT_DICT = dict()
for k,v in class_counts.items():
    one_third = int(v/3)
    others = list()
    for c in WITHOUT[k]:
        others.append(BY_CLASS[c].sample(frac=float(one_third/class_counts[c])).reset_index(drop=True))
    others.append(BY_CLASS[k])
    t_df = pd.DataFrame().append(others).sample(frac=1).reset_index(drop=True)
    t_df['font_class'] = np.where(t_df.font_class == k, 1, 0)
    t_df['font_class'] = t_df['font_class'].astype(int)
    RAW_INPUT_DICT[k] = t_df
    
for c, df in RAW_INPUT_DICT.items():
    print(f'{c} {df.shape}')

mo (3470, 404)
os (2726, 404)
ss (5702, 404)
tr (2479, 404)


## Split Data Into Train/Test

1. X cols: r0c0,...,r19c19
1. Y cols: font_class
1. Others: m_label, font, variant

In [29]:
# a = IN_DATA['os']['train_x']
# b = a.iloc[0]
# ba = np.array(b)
# aa = ba.reshape(20,20)
# aa
# cc = reshape_img(a.iloc[0])
# cc.shape

# old_df = IN_DATA['os']['train_x']
# new_df = old_df.apply(lambda row: reshape_img(row), axis=1)
# new_df

2248    [[222, 135, 145, 250, 153, 1, 1, 1, 1, 1, 1, 1...
2078    [[1, 1, 1, 29, 114, 114, 220, 255, 255, 255, 2...
415     [[1, 1, 1, 1, 1, 1, 12, 74, 131, 210, 148, 116...
733     [[255, 255, 255, 255, 255, 255, 255, 255, 255,...
1653    [[1, 1, 1, 21, 57, 147, 158, 255, 255, 255, 25...
                              ...                        
1631    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 34, 34, 13...
248     [[255, 255, 255, 255, 255, 255, 255, 255, 255,...
2595    [[232, 254, 255, 255, 62, 1, 1, 1, 1, 1, 1, 1,...
1402    [[1, 1, 1, 1, 1, 1, 1, 1, 154, 255, 231, 23, 1...
2416    [[24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, ...
Length: 2180, dtype: object

In [3]:
def reshape_img(img_data):
    aa=np.array(img_data).reshape(20,20)
    return K.constant(aa,shape=(20,20))

In [4]:
IN_DATA = dict()
for k,df in RAW_INPUT_DICT.items():
    x = df.loc[:, DATA_COLS]
    y = df.loc[:, TARGET_COL]
    m = df.loc[:, META_COLS]

    x = x.apply(lambda row: reshape_img(row), axis=1)
    
    X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2)
#     X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2)
    
    i_data = dict()
    i_data['train_x'] = X_train
    i_data['train_y'] = Y_train
#     i_data['val_x']   = X_val
#     i_data['val_y']   = Y_val
    i_data['test_x'] = X_test
    i_data['test_y'] = Y_test
    
    IN_DATA[k] = i_data

In [5]:
junk = IN_DATA['os']
junk

{'train_x': 980     ((tf.Tensor(96.0, shape=(), dtype=float32), tf...
 455     ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 839     ((tf.Tensor(255.0, shape=(), dtype=float32), t...
 538     ((tf.Tensor(255.0, shape=(), dtype=float32), t...
 2412    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
                               ...                        
 1065    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 1428    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 2435    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 1203    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 1685    ((tf.Tensor(1.0, shape=(), dtype=float32), tf....
 Length: 2180, dtype: object,
 'train_y':       font_class
 980            0
 455            1
 839            0
 538            1
 2412           0
 ...          ...
 1065           1
 1428           0
 2435           1
 1203           0
 1685           1
 
 [2180 rows x 1 columns],
 'test_x': 379     ((tf.Tensor(214.0, shape=(), dtype=f

## Setup Convolutional Neural Network

In [5]:
def plot_cm(nn_m, in_data, font_class):
    #  Confusion Matrix
    y_test_pred = nn_m.predict_classes(in_data['test_x'])
    c_matrix = confusion_matrix(in_data['test_y'], y_test_pred)
    ax = sns.heatmap(
        c_matrix, 
        annot=True,
        xticklabels=[f'Not {font_class}', f'{font_class}'],
        yticklabels=[f'Not {font_class}', f'{font_class}'],
        cbar=False,
        cmap='Blues',
        fmt='g'
    )
    ax.set_title(f'Confusion Matrix {font_class}')
    ax.set_xlabel('Prediction')
    ax.set_ylabel('Actual')
    plt.savefig(f'Simple_CM_{font_class.replace(" ","_")}.pdf')
    plt.show()
    
def plot_roc(nn_m, in_data, font_class):
    # ROC Curve
    y_test_pred_probs = nn_m.predict(in_data['test_x'])
    fpr, tpr, _ = roc_curve(in_data['test_y'], y_test_pred_probs)
    plt.plot(fpr, tpr)
    plt.plot([0,1],[0,1],'--', color='black')
    plt.title(f'ROC Curve {font_class}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.savefig(f'Simple_ROC_{font_class.replace(" ","_")}.pdf')
    plt.show()
    
def run_convo(in_data, font_class):
    filter_size     = 3
    n_filters       = 32
    input_size      = 20
    max_pool_size   = 2
    batch_size      = 16
    steps_per_epoch = 20000//batch_size
    n_epochs        = 10
    
    # Setup Convolutional Neural Network
    nn = Sequential()
    nn.add(Conv2D(n_filters, (filter_size, filter_size), input_shape=(input_size, input_size, 1), activation='relu'))
    nn.add(MaxPooling2D(pool_size=(max_pool_size, max_pool_size)))
    nn.add(Conv2D(n_filters, (filter_size, filter_size), input_shape=(input_size/2, input_size/2, 1), activation='relu'))
    nn.add(MaxPooling2D(pool_size=(max_pool_size, max_pool_size)))
    nn.add(Flatten())
    nn.add(Dense(units=128, activation='relu'))
    nn.add(Dropout(0.5))
    nn.add(Dense(units=1, activation='sigmoid'))
    nn.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

    print(f'Start {datetime.now()}')
    nn.fit(in_data['train_x'],in_data['train_y'], epochs=n_epochs,verbose=1)
    print(f'Stop  {datetime.now()}')

    scores = nn.evaluate(in_data['train_x'],in_data['train_y'])
    print(f'Training Accurancy {scores[1]*100}')

    scores = nn.evaluate(in_data['test_x'],in_data['test_y'])
    print(f'Test Accurancy {scores[1]*100}')
    
    plot_cm(nn, in_data, font_class)
    plot_roc(nn, in_data, font_class)

In [9]:
# tag_lu = {
#     'os': 'Old Style',
#     'tr': 'Transitional',
#     'mo': 'Modern',
#     'ss': 'Sans Serif'
# }

# for k,v in IN_DATA.items():
#     run_convo(v, tag_lu[k])

run_convo(IN_DATA['os'], 'Old Style')

Start 2021-05-23 02:42:07.961862


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type tensorflow.python.framework.ops.EagerTensor).