In [1]:
# Text Detection
# Forrester Welch
# The goal of this project to recognize where an instance of text appears in an image

# Imports for convolutional neural networks, data management, and image processing
import tensorflow as tf
import pandas as pd
import json
import os
import csv
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import tensorflow.keras.layers as layer

In [2]:
# Load the data from the cocotext json file 
# Download the cocotext annotations json here: https://bgshih.github.io/cocotext/#h2-download
# Download cocotext.v2.zip [12 MB] and unzip for cocotext.v2.json, then rename to cocotext.json
data = pd.read_json('cocotext.json')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236291 entries, 45346 to 390310
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   cats       0 non-null       float64
 1   anns       201126 non-null  object 
 2   imgs       53686 non-null   object 
 3   imgToAnns  53686 non-null   object 
 4   info       0 non-null       float64
dtypes: float64(2), object(3)
memory usage: 10.8+ MB


In [4]:
data.columns

Index(['cats', 'anns', 'imgs', 'imgToAnns', 'info'], dtype='object')

In [5]:
#example of element in data['anns']
data['anns'].iloc[1000]

{'area': 67.21,
 'bbox': [262.6, 218.4, 9.8, 8.1],
 'class': 'machine printed',
 'id': 102540,
 'image_id': 353906,
 'language': 'english',
 'legibility': 'illegible',
 'mask': [263.5, 219.3, 262.6, 225.9, 272.4, 226.5, 272.0, 218.4],
 'utf8_string': ''}

In [6]:
# cycle through annotations
# only add elements if machine printed, english, and legible
# create dataset of image ids (we will later convert id to image filename)
# create bbox dataset
annotations = data['anns']
image = []
bbox = []

# This step may take a couple minutes
for i in range(len(data['anns'])):
    current = annotations.iloc[i]
    if(pd.isna(current)):
        continue
    if(current['class'] == 'machine printed' and current['language'] == 'english'
      and current['legibility'] == 'legible' and current['image_id'] not in list(image)):
            image.append(annotations.iloc[i]['image_id'])
            bbox.append(annotations.iloc[i]['bbox'])
        


In [7]:
# example element of data['imgs']
data['imgs'].iloc[1000]['file_name']

'COCO_train2014_000000102540.jpg'

In [8]:
# To change the image_id value to the filename of the image, I need a hashmap of key-value pairs
# The dict object in python is supposed to operate like a hash map, but I could not figure out how
# to make it work for our purposes. I found this implementation of a hash table at the link listed
# below. This HashTable implementation made it simple and easy to convert image_id to image name.
# https://www.geeksforgeeks.org/hash-map-in-python/

class HashTable: 
  
    # Create empty bucket list of given size 
    def __init__(self, size): 
        self.size = size 
        self.hash_table = self.create_buckets() 
  
    def create_buckets(self): 
        return [[] for _ in range(self.size)] 
  
    # Insert values into hash map 
    def set_val(self, key, val): 
        
        # Get the index from the key 
        # using hash function 
        hashed_key = hash(key) % self.size 
          
        # Get the bucket corresponding to index 
        bucket = self.hash_table[hashed_key] 
  
        found_key = False
        for index, record in enumerate(bucket): 
            record_key, record_val = record 
              
            # check if the bucket has same key as 
            # the key to be inserted 
            if record_key == key: 
                found_key = True
                break
  
        # If the bucket has same key as the key to be inserted, 
        # Update the key value 
        # Otherwise append the new key-value pair to the bucket 
        if found_key: 
            bucket[index] = (key, val) 
        else: 
            bucket.append((key, val)) 
  
    # Return searched value with specific key 
    def get_val(self, key): 
        
        # Get the index from the key using 
        # hash function 
        hashed_key = hash(key) % self.size 
          
        # Get the bucket corresponding to index 
        bucket = self.hash_table[hashed_key] 
  
        found_key = False
        for index, record in enumerate(bucket): 
            record_key, record_val = record 
              
            # check if the bucket has same key as  
            # the key being searched 
            if record_key == key: 
                found_key = True
                break
  
        # If the bucket has same key as the key being searched, 
        # Return the value found 
        # Otherwise indicate there was no record found 
        if found_key: 
            return record_val 
        else: 
            return "No record found"
  
    # Remove a value with specific key 
    def delete_val(self, key): 
        
        # Get the index from the key using 
        # hash function 
        hashed_key = hash(key) % self.size 
          
        # Get the bucket corresponding to index 
        bucket = self.hash_table[hashed_key] 
  
        found_key = False
        for index, record in enumerate(bucket): 
            record_key, record_val = record 
              
            # check if the bucket has same key as 
            # the key to be deleted 
            if record_key == key: 
                found_key = True
                break
        if found_key: 
            bucket.pop(index) 
        return
  
    # To print the items of hash map 
    def __str__(self): 
        return "".join(str(item) for item in self.hash_table) 
  
  
hash_table = HashTable(50) 
  
# insert some values 
hash_table.set_val('gfg@example.com', 'some value') 
print(hash_table) 
print() 
  
hash_table.set_val('portal@example.com', 'some other value') 
print(hash_table) 
print() 
  
# search/access a record with key 
print(hash_table.get_val('portal@example.com')) 
print() 
  
# delete or remove a value 
hash_table.delete_val('portal@example.com') 
print(hash_table) 

[][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][('gfg@example.com', 'some value')]

[('portal@example.com', 'some other value')][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][('gfg@example.com', 'some value')]

some other value

[][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][][('gfg@example.com', 'some value')]


In [9]:
# Intialize HashTable with key-value pair of image_id <-> file_name

# Our hashtable that will store key-value pairs of id-filename
image_tree = HashTable(10000)

# Cycle through imgs to gather data
for i in range(len(data['imgs'])):
    if(pd.isna(data['imgs'].iloc[i])):
        continue
    current = data['imgs'].iloc[i]
    image_tree.set_val(current['id'], current['file_name'])

In [10]:
# Convert the image vector to contain file names instead of id numbers
for i in range(len(image)):
    image[i] = image_tree.get_val(image[i])

In [11]:
# This block of code can be skipped in the future. It is now commented out for final submission
# The purpose of this block is to put the image file names into a text file
# The reason for this has to do with how the images for this project were collected.
# The coco-text.json annotations were released as an addendum to the original COCO2014 image dataset.
# Every instance of text in that datset was recorded in coco-text.json. However, not every image
# has an instance of text. It is not posssible to download just the text images, only the entire
# 2014 COCO image dataset can be downloaded. To save storage space, I moved the necessary images out
# of the folder so I could delete the unnecessary images all at once. For reference, the terminal
# command to move a list of files is as follows: 
# for i in $(cat all_text_image_names.txt); do mv "$i" /temp_dest/; done
# The set of necessary images can be found in my github at: 


#unique_filenames = list(set(image))
#import numpy as np
#name_file = open("all_text_image_names.txt", "w")
#np.savetxt(name_file, unique_filenames, fmt="%s")

#name_file.close()

In [12]:
# Convert the list of image file_names to a list of 2d-arrays of pixels
# Converts the bbox into a scale of [0,1]
# bbox is originally annotated: x,y,width,height
    # We convert to xmin, ymin, xmax, ymax on scale of 0-1
# This step may take five minutes
for i in range(len(image)):
    image_name = "train2014/" + image[i]
    width, height = Image.open(image_name).size
    xmax = (bbox[i][0] + bbox[i][2]) / width
    ymax = (bbox[i][1] + bbox[i][3]) / height
    xmin = bbox[i][0] / width
    ymin = bbox[i][1] / height
    bbox[i] = [xmin, ymin, xmax, ymax]
    # The images are resized to (100,100) because the kernel could not handle a larger size
    # With limitless computational resources, a full size 600x600 image may yield more accurate results
    # Interestingly, when images were resized to 128x128 or 164x164, they had slightly less accuracy
    # than the 100x100 option.
    file = tf.keras.preprocessing.image.load_img(image_name, target_size=(100,100), color_mode='grayscale')
    image[i] = tf.keras.preprocessing.image.img_to_array(file)

In [13]:
# The image pixel values are rescaled from [0-1]
image = np.array(image, dtype="float32") / 255
bbox = np.array(bbox, dtype="float32")


In [14]:
# Split the train and test data with split size .2 and random seed = 400
image_train, image_test, bbox_train, bbox_test = train_test_split(image, bbox, test_size=0.2, random_state=400)

In [15]:
# use imgs to create key value map of id to image name
# get list of all image names from imgs
# cycle through anns and keep every image that contains an annotation of text

In [16]:
# Create a keras Model
# Our model has 3 convolutional layers with 32,32, and 64 filters 
# The final layer of our model returns four neurons, each representing a coordinate of the bounding box
# This architecture differs from traditional neural networks doing classification
# Instead of recognizing what an object is, we aim to find where an object is
# This is done using regression to calculate the best fitting bounding box.
# The outline for how to make a regression layer the final layer of the model was found at the 
# following link: https://medium.com/analytics-vidhya/object-localization-with-keras-2f272f79e03c
# We trained the model using a different number of filters on each layer, as well as a different number
# of layers. The results of these experiments are noted in the final writeup. Ultimately, having 
# more filters lead to overfitting which lowered accuracy on the validation datset.
def get_model():
    inputs = tf.keras.Input(shape=(100,100,1))
    x = layer.Conv2D(32, (3,3), activation='relu')(inputs)
    x = layer.MaxPooling2D((3,3))(x)
    x = layer.Conv2D(32, (3,3), activation='relu')(x)
    x = layer.MaxPooling2D((3,3))(x)
    x = layer.Conv2D(64, (3,3), activation='relu')(x)
    x = layer.GlobalAveragePooling2D()(x)
    
    
    reg_head = layer.Dense(128, activation='relu')(x)
    reg_head = layer.Dense(64, activation='relu')(x)
    reg_head = layer.Dense(32, activation='relu')(reg_head)
    # Notice the name of the layer.
    reg_head = layer.Dense(4, activation='sigmoid', name='bbox')(reg_head)
    return tf.keras.Model(inputs=[inputs], outputs=[reg_head])

In [17]:
# Initalize the model
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100, 100, 1)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 98, 98, 32)        320       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 30, 30, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 10, 10, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 8, 8, 64)          18496     
_________________________________________________________________
global_average_pooling2d (Gl (None, 64)                0     

In [18]:
batch_size = 128
# We experimented with more epochs, but this led to overfitting the data and lowered the accuracy
# on the validation set. Around 30 epochs, the loss function comes close to convergence.
epochs = 30

losses = "mean_squared_error"

model.compile(loss=losses, optimizer="adam", metrics=["accuracy"])



In [19]:
# Train the model.
# This step may take 20-30 minutes. It may be easier to change epochs to 10 to save time.
model.fit(image_train, bbox_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fb29ef1a7b8>

In [20]:
# Measure accuracy against validation set.
model.evaluate(image_test, bbox_test)



[0.06526488810777664, 0.5688604712486267]