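The goal of this notebook is to build a small, class-balanced subset of the merged
datasets by randomly sampling 350 images from each of the 30 classes (the letters
A to Z, plus `space`, `del`, `call`, and `ok`). We will feed this subset into our
baseline model and neural network as the starting point for the project progress report.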

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import random
import shutil
from collections import defaultdict
import time
import json
from PIL import Image

In [11]:
# Source folders of the merged dataset (Windows paths).
# alphabet_dir and space_delete_dir end with a separator because a class-folder
# name is appended to them directly; the two hagrid folders hold images themselves.
alphabet_dir = "D:\\APS360 Data\\Initial_Data\\A_to_Z\\"
alphabet_classes = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N',
                   'O','P','Q','R','S','T','U','V','W','X','Y','Z']
space_delete_dir = "D:\\APS360 Data\\Initial_Data\\space_delete\\"
hagrid_call_dir = "D:\\APS360 Data\\Initial_Data\\hagrid_call_cropped\\"
hagrid_ok_dir = "D:\\APS360 Data\\Initial_Data\\hagrid_ok_cropped\\"

# Destination root of the subset; one sub-folder per class is created below it.
# Every backslash is escaped, like the paths above: the former "\A" and "\S" were
# invalid escape sequences that only worked because Python leaves unknown escapes
# untouched (with a warning). The resulting string is unchanged.
subset_dir = "D:\\APS360 Data\\Subset\\"

# Number of images sampled per class.
sub_num = 350

In [8]:
def show_image(path):
    """Read the image file at ``path``, draw it with ``plt.imshow`` and return the array that was read."""
    pixels = mpimg.imread(path)
    plt.imshow(pixels)
    return pixels

In [9]:
def choose_random_images(selected_dir,n):
    """Return the paths of ``n`` distinct, randomly chosen entries of a folder.

    Parameters
    ----------
    selected_dir : str
        Folder whose entries are sampled. A trailing path separator is allowed.
    n : int
        Number of entries to pick (sampling is without replacement).

    Returns
    -------
    list of str
        ``n`` paths, each one ``selected_dir`` joined with an entry name.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of entries in the folder.
    """
    folder_ls = os.listdir(selected_dir)
    if not 0 <= n <= len(folder_ls):
        # random.sample would raise ValueError as well; this message says which
        # folder is too small.
        raise ValueError(
            "Cannot sample " + str(n) + " entries from " + repr(selected_dir)
            + ": it contains " + str(len(folder_ls)) + "."
        )
    # os.path.join uses the platform separator and does not double it when
    # selected_dir already ends with one.
    return [os.path.join(selected_dir, name) for name in random.sample(folder_ls, n)]

## Start with alphabet A to Z



In [12]:
# Maps each letter to the list of image paths sampled for it.
alphabet_file_dict = defaultdict(list)

for alphabet in alphabet_classes:
    t_start = time.time()
    # Each letter has its own folder directly under alphabet_dir.
    sampled_paths = choose_random_images(alphabet_dir + alphabet, sub_num)
    alphabet_file_dict[alphabet].extend(sampled_paths)
    elapsed = round(time.time() - t_start, 2)
    print(f"Took alphabet {alphabet}: {elapsed}seconds.")


Took alphabet A: 0.01seconds.
Took alphabet B: 0.01seconds.
Took alphabet C: 0.01seconds.
Took alphabet D: 0.01seconds.
Took alphabet E: 0.01seconds.
Took alphabet F: 0.01seconds.
Took alphabet G: 0.01seconds.
Took alphabet H: 0.01seconds.
Took alphabet I: 0.01seconds.
Took alphabet J: 0.01seconds.
Took alphabet K: 0.01seconds.
Took alphabet L: 0.0seconds.
Took alphabet M: 0.01seconds.
Took alphabet N: 0.0seconds.
Took alphabet O: 0.0seconds.
Took alphabet P: 0.0seconds.
Took alphabet Q: 0.0seconds.
Took alphabet R: 0.0seconds.
Took alphabet S: 0.0seconds.
Took alphabet T: 0.0seconds.
Took alphabet U: 0.0seconds.
Took alphabet V: 0.01seconds.
Took alphabet W: 0.0seconds.
Took alphabet X: 0.0seconds.
Took alphabet Y: 0.0seconds.
Took alphabet Z: 0.0seconds.


In [15]:
# Spot check: number of paths sampled for class 'Z' (should equal sub_num).
len(alphabet_file_dict['Z'])

350

In [16]:
# Copy the sampled images of every letter into its own folder under subset_dir.
for alphabet in alphabet_classes:
    s = time.time()
    # os.path.join works whether or not subset_dir ends with a separator.
    new_folder = os.path.join(subset_dir, alphabet)
    # makedirs also creates subset_dir itself on a first run. exist_ok is left at
    # its default on purpose: a folder left over from an earlier run raises
    # FileExistsError instead of being topped up with a second random sample.
    os.makedirs(new_folder)
    for image_path in alphabet_file_dict[alphabet]:
        shutil.copy(image_path,new_folder)
    e = time.time()
    print("Took alphabet " + alphabet + ": " + str(round(e-s,2)) + "seconds.")

Took alphabet A: 10.49seconds.
Took alphabet B: 9.08seconds.
Took alphabet C: 10.27seconds.
Took alphabet D: 9.49seconds.
Took alphabet E: 9.45seconds.
Took alphabet F: 10.25seconds.
Took alphabet G: 8.91seconds.
Took alphabet H: 10.67seconds.
Took alphabet I: 10.39seconds.
Took alphabet J: 9.33seconds.
Took alphabet K: 9.92seconds.
Took alphabet L: 9.68seconds.
Took alphabet M: 9.27seconds.
Took alphabet N: 9.21seconds.
Took alphabet O: 9.0seconds.
Took alphabet P: 8.94seconds.
Took alphabet Q: 9.55seconds.
Took alphabet R: 9.52seconds.
Took alphabet S: 10.08seconds.
Took alphabet T: 9.54seconds.
Took alphabet U: 9.52seconds.
Took alphabet V: 9.84seconds.
Took alphabet W: 8.53seconds.
Took alphabet X: 7.91seconds.
Took alphabet Y: 8.61seconds.
Took alphabet Z: 9.22seconds.


## Get space and delete symbols

In [18]:
# Folder names of the two non-letter classes under space_delete_dir.
symbols = ['space','del']

# Maps each symbol to the list of image paths sampled for it.
symbol_file_dict = defaultdict(list)

for symbol in symbols:
    t_start = time.time()
    sampled_paths = choose_random_images(space_delete_dir + symbol, sub_num)
    symbol_file_dict[symbol].extend(sampled_paths)
    elapsed = round(time.time() - t_start, 2)
    print(f"Took symbol {symbol}: {elapsed}seconds.")


Took symbol space: 0.01seconds.
Took symbol del: 0.0seconds.


In [23]:
# Spot check: number of paths sampled for class 'space' (should equal sub_num).
len(symbol_file_dict['space'])

350

In [25]:
# Copy the sampled images of every symbol into its own folder under subset_dir.
for symbol in symbols:
    s = time.time()
    # os.path.join works whether or not subset_dir ends with a separator.
    new_folder = os.path.join(subset_dir, symbol)
    # makedirs also creates subset_dir itself if it is missing. exist_ok is left at
    # its default on purpose: a folder left over from an earlier run raises
    # FileExistsError instead of being topped up with a second random sample.
    os.makedirs(new_folder)
    for image_path in symbol_file_dict[symbol]:
        shutil.copy(image_path,new_folder)
    e = time.time()
    print("Took symbol " + symbol + ": " + str(round(e-s,2)) + "seconds.")

Took symbol space: 0.0seconds.
Took symbol del: 0.0seconds.


## Hagrid Call

In [40]:
# Draw sub_num random image paths from hagrid_call_dir and report how long it took.
# random_ls is read by the following cells.
t_start = time.time()
random_ls = choose_random_images(hagrid_call_dir, sub_num)
elapsed = round(time.time() - t_start, 2)
print(f"Took symbol call: {elapsed}seconds.")

Took symbol call: 0.01seconds.


In [41]:
# Evaluate the sampled list; the trailing semicolon suppresses the cell output.
random_ls;

In [42]:
# Copy the sampled "call" images into <subset_dir>/call.
# s and e are read by the next cell to report the elapsed time.
s = time.time()
# os.path.join works whether or not subset_dir ends with a separator.
new_folder = os.path.join(subset_dir, "call")
# makedirs also creates subset_dir itself if it is missing. exist_ok is left at its
# default on purpose: a folder left over from an earlier run raises FileExistsError
# instead of being topped up with a second random sample.
os.makedirs(new_folder)
for image_path in random_ls:
    shutil.copy(image_path,new_folder)
e = time.time()

In [43]:
# Elapsed time of the "call" copy, from the s/e timestamps taken in the previous cell.
print(f"Took symbol call: {round(e - s, 2)}seconds.")

Took symbol call: 4.17seconds.


## Hagrid OK

In [44]:
# Draw sub_num random image paths from hagrid_ok_dir and report how long it took.
# random_ls is read by the following copy cell.
s = time.time()
folder_path = hagrid_ok_dir
random_ls = choose_random_images(folder_path,sub_num)
e = time.time()
# Label corrected: this cell samples the "ok" class, but the message said "call".
print("Took symbol ok: " + str(round(e-s,2)) + "seconds.")

Took symbol call: 0.01seconds.


In [45]:
# Copy the sampled "ok" images into <subset_dir>/ok.
# s and e are read by the next cell to report the elapsed time.
s = time.time()
# os.path.join works whether or not subset_dir ends with a separator.
new_folder = os.path.join(subset_dir, "ok")
# makedirs also creates subset_dir itself if it is missing. exist_ok is left at its
# default on purpose: a folder left over from an earlier run raises FileExistsError
# instead of being topped up with a second random sample.
os.makedirs(new_folder)
for image_path in random_ls:
    shutil.copy(image_path,new_folder)
e = time.time()

In [46]:
# Elapsed time of the "ok" copy, from the s/e timestamps taken in the previous cell.
print(f"Took symbol ok: {round(e - s, 2)}seconds.")

Took symbol ok: 6.32seconds.


## Looking at the final subset

In [49]:
# Number of entries directly under subset_dir.
# Expected: 26 letters + space + del + call + ok = 30 class folders.
len(os.listdir(subset_dir))

30