In [1]:
#!pip install tqdm
from tqdm import tqdm
import pandas as pd
import numpy as np
import cv2 as cv
import datetime

### Organize the original dataset


In the original dataset, we are given about 3500 images. In each image, there are multiple characters, a rough average of 180 per image. The original train.csv file has one row for each image. The labels are separated by spaces and include the unicode character, x position start, y position start, width, and height of the area where the character is. 

First, we're going to turn this into a new dataframe where each character from each image has it's own row.

In [2]:
df = pd.read_csv("train.csv")
new_rows = []

for index, row in df.iterrows():
    label_array = row["labels"].split()
    length = int(len(label_array) / 5)
    for i in range(0,length):
        new_row = {}
        new_row["image_id"] = row["image_id"]
        new_row["x_start"] = label_array[(i*5)+1]
        new_row["y_start"] = label_array[(i*5)+2]
        new_row["width"] = label_array[(i*5)+3]
        new_row["height"] = label_array[(i*5)+4]
        new_row["character"] = label_array[(i*5)+0]
        new_rows.append(new_row)    


labels_to_features = pd.DataFrame(new_rows)
labels_to_features.head()

Unnamed: 0,character,height,image_id,width,x_start,y_start
0,U+306F,53,100241706_00004_2,133,1231,3465
1,U+304C,69,100241706_00004_2,84,275,1652
2,U+3044,69,100241706_00004_2,143,1495,1218
3,U+3051,91,100241706_00004_2,53,220,3331
4,U+306B,92,100241706_00004_2,61,911,1452


In [3]:
labels_to_features.describe()

Unnamed: 0,character,height,image_id,width,x_start,y_start
count,683464,683464,683464,683464,683464,683464
unique,4212,455,3605,333,2826,3927
top,U+306B,84,200014685-00010_1,84,683,1328
freq,24685,10061,614,9813,554,349


In [4]:
char_counts = pd.DataFrame(labels_to_features.groupby(['character']).size(), columns=["count"])
print("lowest counts: \n", char_counts.sort_values(by=["count"])[0:5])
print("highest counts: \n", char_counts.sort_values(by=["count"])[-6:-1])

lowest counts: 
            count
character       
U+003F         1
U+717D         1
U+717B         1
U+716C         1
U+7165         1
highest counts: 
            count
character       
U+3092     15743
U+3068     16588
U+3066     20569
U+3057     22209
U+306E     24136


In [5]:
# char_counts_1 = set(char_counts.index[char_counts['count'] <= 1].tolist())
# print("Dropping", len(char_counts_1), "rows where there was only one instance of the character.")
# labels_to_features_over_1 = labels_to_features[~labels_to_features['character'].isin(char_counts_1)]

In [6]:
# import zipfile as zf

# files = zf.ZipFile("train_images.zip", 'r')
# files.extractall('train')
# files.close()

Firstly, let's cut down the "unique" character count of a whopping 4212. We can do this by separating the data into three data sets based on the character type and handle them separately for now.

This would be a good step for a layer in a deep learning model.

In [5]:
df_hiragana = pd.read_csv("hiragana.csv")
df_katakana = pd.read_csv("katakana.csv")
set_hiragana = set(df_hiragana["unicode"])
set_katakana = set(df_katakana["unicode"])
print(len(set_hiragana))
print(len(set_katakana))

82
80


In [6]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

%matplotlib inline

def writeToCSV(file, img_size, labels_to_features,alphabet = "all", limit = float("inf")):
    # Write column headers to the file
    for i in range(img_size * img_size):
        file.write(str(i) + ",")
    file.write("character")
    file.write("\n")
    
    curr_file_name = ""
    curr_img = []
    for row in labels_to_features.iterrows():
        # set a limit for testing purposes
        if row[0] > limit:
            break
            
        row = row[1]
        # skip anything not in the alphabet we specified
        label = row["character"]
        if alphabet == "hiragana" and label not in set_hiragana:
            continue
        if alphabet == "katakana" and label not in set_katakana:
            continue
        if alphabet == "kanji" and (label in set_hiragana or label in set_katakana):
            continue
        
        # for each image, we want to convert it to black n white, with a threshhold for easy reading
        # they're in order based on file name, so we can do this just once per image and save it until
        #    another image appears in the loop
        filename = "train/" + row["image_id"] + ".jpg"
        if filename != curr_file_name: 
            orig_img = cv.imread(filename)
            bw_img = cv.cvtColor(orig_img, cv.COLOR_RGB2GRAY )
            thresh_img = cv.adaptiveThreshold(bw_img,255,cv.ADAPTIVE_THRESH_MEAN_C,cv.THRESH_BINARY,101,17)
            curr_file_name = filename
            curr_img = thresh_img        

        width = int(row["width"])
        height = int(row["height"])
        start_row = int(row["x_start"])
        start_col = int(row["y_start"])
        end_row = start_row + width
        end_col = start_col + height
        
        char_img = curr_img[start_col:end_col,start_row:end_row]
        
        # resize image to the specified size and pad it to make it a square
        padding = []
        if height > width:
            new_height = img_size
            new_width = int((width*new_height)/(height))
            if new_width % 2 == 1:
                new_width += 1
            pad = int((img_size-new_width)/2)
            padding = [0,0,pad,pad]
        else:
            new_width = img_size
            new_height = int((height*new_width)/(width))
            if new_height % 2 == 1:
                new_height += 1
            pad = int((img_size-new_height)/2)
            padding = [pad,pad,0,0]
        sm_char_img = cv.resize(char_img, (new_width,new_height), interpolation = cv.INTER_AREA)     
        norm_char_img = cv.copyMakeBorder(sm_char_img, padding[0], padding[1], padding[2], padding[3], cv.BORDER_CONSTANT,value=[255,255,255])

#         plt.figure(figsize=(1,1))
#         plt.imshow(norm_char_img, cmap=plt.cm.gray, interpolation='nearest')
        
        for row in norm_char_img:
            for i in range(0,len(row)):
                f.write(str(row[i])+",")
        file.write(label)
        file.write("\n")


In [9]:
# # SAMPLE
# f = open("all_hiragana2.csv", "w")
# writeToCSV(f,30,labels_to_features,"hiragana",100)
# f.close()

In [10]:
# now = datetime.datetime.now()

# f = open("all_hiragana.csv", "w")
# writeToCSV(f,30,labels_to_features,"hiragana")
# f.close()

# print("Time to complete:", datetime.datetime.now() - now)

In [11]:
# now = datetime.datetime.now()

# f = open("all_katakana.csv", "w")
# writeToCSV(f,30,labels_to_features,"katakana")
# f.close()

# print("Time to complete:", datetime.datetime.now() - now)

In [12]:
now = datetime.datetime.now()

f = open("all_kanji.csv", "w")
writeToCSV(f,30,labels_to_features,"kanji")
f.close()

print("Time to complete:", datetime.datetime.now() - now)

Time to complete: 0:16:02.647281


In [7]:
now = datetime.datetime.now()

f = open("all_chars.csv", "w")
writeToCSV(f,30,labels_to_features)
f.close()

print("Time to complete:", datetime.datetime.now() - now)

Time to complete: 0:30:07.293539
