# Load data, process and save as numpy array files

Load data and create numpy array files to test in google colab

### Libraries and modules

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import to_categorical


### Load images and save to numpy arrays

Load images, process them(resize and change between rgb and grayscale), and add them to a numpy array.

In [3]:
alphabet = ['A','B','C','D','E','F','G','H','I','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y']

train_image = []
labels = []

for letter in alphabet:
    for i in tqdm([*range(1,112)]):
        img_path = f'new_dataset/train/{letter}/{letter}{i}.jpg'
        img = image.load_img(img_path, target_size=(100, 100, 1), color_mode="grayscale")
        img = image.img_to_array(img)
        img = img/255
        train_image.append(img)
        labels = np.append(labels, letter)
    

100%|██████████| 111/111 [00:01<00:00, 75.05it/s]
100%|██████████| 111/111 [00:02<00:00, 44.29it/s]
100%|██████████| 111/111 [00:01<00:00, 100.29it/s]
100%|██████████| 111/111 [00:04<00:00, 24.51it/s]
100%|██████████| 111/111 [00:00<00:00, 142.86it/s]
100%|██████████| 111/111 [00:00<00:00, 138.45it/s]
100%|██████████| 111/111 [00:00<00:00, 145.18it/s]
100%|██████████| 111/111 [00:03<00:00, 30.24it/s]
100%|██████████| 111/111 [00:03<00:00, 29.89it/s]
100%|██████████| 111/111 [00:00<00:00, 144.16it/s]
100%|██████████| 111/111 [00:03<00:00, 31.47it/s]
100%|██████████| 111/111 [00:00<00:00, 138.36it/s]
100%|██████████| 111/111 [00:00<00:00, 142.24it/s]
100%|██████████| 111/111 [00:00<00:00, 130.41it/s]
100%|██████████| 111/111 [00:00<00:00, 139.44it/s]
100%|██████████| 111/111 [00:00<00:00, 126.20it/s]
100%|██████████| 111/111 [00:00<00:00, 139.36it/s]
100%|██████████| 111/111 [00:00<00:00, 141.54it/s]
100%|██████████| 111/111 [00:00<00:00, 127.86it/s]
100%|██████████| 111/111 [00:03<00:00

In [4]:
# Convert image arrays to numpy array
X = np.array(train_image)


In [5]:
# y = labels['Image Value'].values
y = labels


Split data in train and test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)


In [8]:
y_train_cat = to_categorical(encoded_y_train)
y_test_cat = to_categorical(encoded_y_test)


Save image arrays as numpy array files for easy transport and re-use

In [9]:
# Save as h5py file - not tested

import h5py

f = h5py.File("X_train_array.hdf5", "w")
f.create_dataset("X_train", data = X_train, compression="gzip")
f = h5py.File("y_train_array.hdf5", "w")
f.create_dataset("y_train", data = X_train, compression="gzip")
f = h5py.File("X_test_array.hdf5", "w")
f.create_dataset("X_test", data = X_train, compression="gzip")
f = h5py.File("y_test_array.hdf5", "w")
f.create_dataset("y_test", data = X_train, compression="gzip")


<HDF5 dataset "y_test": shape (2131, 100, 100, 1), type "<f4">

In [10]:
# Save as npy file

from numpy import save

save('X_train.npy', X_train)
save('y_train.npy', y_train_cat)
save('X_test.npy', X_test)
save('y_test.npy', y_test_cat)
