In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

The dataset contains 1040 captcha files as PNG images. The label for each sample is a string: the name of the file without its extension. We will map each character in the string to an integer for training the model. Similarly, we will need to map the predictions of the model back to strings. For this purpose we will maintain two lookups, mapping characters to integers and integers to characters, respectively.

In [2]:
# Path to the data directory
data_dir = Path("./data/captcha_images_v2/")

# Collect every .png file as a string path, sorted alphabetically so that
# images and labels are always produced in the same, reproducible order
images = sorted(str(path) for path in data_dir.glob("*.png"))
if not images:
    # Fail early with an actionable message instead of the opaque ValueError
    # that max() raises on an empty sequence further down.
    raise FileNotFoundError(f"No .png images found in {data_dir.resolve()}")

# The label of a sample is its file name without the extension
labels = [Path(img).stem for img in images]
# Sorted list of the distinct characters that occur in any label
characters = sorted(set("".join(labels)))

print(f"Number of images found : {len(images)}")
print(f"Number of labels found : {len(labels)}")
print(f"Number of unique characters: {len(characters)}")
print(f"characters present: {characters}")

# Batch size for training and validation
batch_size = 16

# Desired image dimensions
img_width = 200
img_height = 50

# Factor by which the image is going to be downsampled
# by the convolutional blocks. We will be using two
# convolution blocks and each block will have
# a pooling layer which downsamples the features by a factor of 2.
# Hence the total downsampling factor is 4.
downsample_factor = 4

# Maximum length of any captcha in the dataset
max_length = max(len(label) for label in labels)



Number of images found : 1040
Number of labels found : 1040
Number of unique characters: 19
characters present: ['2', '3', '4', '5', '6', '7', '8', 'b', 'c', 'd', 'e', 'f', 'g', 'm', 'n', 'p', 'w', 'x', 'y']


Preprocessing

In [23]:
# Mapping characters to integers. StringLookup(...) returns a Keras layer
# instance (not a class); calling it on a string tensor yields integer indices.
char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None)
# Mapping integers back to the original characters. `invert=True` is required:
# without it this layer is just a second string -> integer lookup and cannot
# decode model predictions back into text.
num_to_char = layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

# Function to split the dataset between training set and validation set
def split_data(images, labels, train_size=0.9, shuffle=True, seed=None):
    """Split parallel image/label sequences into training and validation sets.

    Args:
        images: Sequence (list or NumPy array) of samples, e.g. image paths.
        labels: Sequence of labels aligned index-for-index with ``images``.
        train_size: Fraction of the samples assigned to the training set; the
            sample count is truncated via ``int(size * train_size)``.
        shuffle: If True, the samples are randomly permuted before splitting.
        seed: Optional seed for a reproducible shuffle. With ``None`` (the
            default) the global NumPy random state is used.

    Returns:
        Tuple ``(x_train, x_valid, y_train, y_valid)`` of NumPy arrays.

    Raises:
        ValueError: If ``images`` and ``labels`` have different lengths.
    """
    # Fancy indexing below needs ndarrays; this also lets callers pass lists.
    images = np.asarray(images)
    labels = np.asarray(labels)

    # 1. Get the total size of the dataset
    size = len(images)
    if len(labels) != size:
        raise ValueError(
            f"images and labels must have the same length, "
            f"got {size} and {len(labels)}"
        )

    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        if seed is None:
            # Keep using the global RNG so np.random.seed(...) still applies.
            np.random.shuffle(indices)
        else:
            np.random.default_rng(seed).shuffle(indices)

    # 3. Get the size of the training sample
    train_samples = int(size * train_size)

    # 4. Split data into training and validation sets; the same index arrays
    #    are applied to images and labels so pairs stay aligned.
    train_idx, valid_idx = indices[:train_samples], indices[train_samples:]
    x_train, y_train = images[train_idx], labels[train_idx]
    x_valid, y_valid = images[valid_idx], labels[valid_idx]
    return x_train, x_valid, y_train, y_valid

# Split the data into a training set and a validation set
# (split_data needs NumPy arrays because it uses index-array selection)
x_train, x_valid, y_train, y_valid = split_data(
    np.array(images),
    np.array(labels),
)

def encode_single_sample(img_path, label):
    """Load one captcha image and encode its label as integer indices.

    Args:
        img_path: Path of a PNG file, as a string tensor.
        label: The captcha text for that image, as a string tensor.

    Returns:
        A dict with two entries:
        ``"image"``: float32 tensor laid out as (img_width, img_height, 1),
        i.e. width first so that it can act as the time dimension.
        ``"label"``: the label's characters mapped through ``char_to_num``.
    """
    # Read the raw file and decode it as a single-channel (grayscale) PNG
    raw_bytes = tf.io.read_file(img_path)
    gray = tf.io.decode_png(raw_bytes, channels=1)
    # Convert to float32; convert_image_dtype rescales integer pixels to [0, 1]
    gray = tf.image.convert_image_dtype(gray, tf.float32)
    # Bring every image to the common (height, width) size
    resized = tf.image.resize(gray, [img_height, img_width])
    # Swap height and width so the width becomes the leading (time) axis
    width_major = tf.transpose(resized, perm=[1, 0, 2])
    # Split the label into characters and map each one to its integer index
    chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    # NOTE(review): the dict keys presumably match the model's two named
    # inputs ("image" and "label") - confirm against the model definition.
    return {"image": width_major, "label": char_to_num(chars)}


Create Dataset object

In [26]:
# Pair each training image path with its label: one dataset element per sample
train_dataset = tf.data.Dataset.from_tensor_slices(
    (x_train, y_train)
)
# Printing a dataset shows its element spec (dtypes and shapes), not the data
print(train_dataset.take(1))

<TakeDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>
