In [1]:
# https://www.tensorflow.org/tutorials/text/image_captioning
import sys
import tensorflow as tf
import tensorflow_text as tf_text
sys.path.append('..')
tf.get_logger().setLevel('ERROR')

# Check GPU working

In [2]:
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0': raise SystemError('GPU device not found')
print('Found GPU at:', device_name)
!nvcc -V

Found GPU at: /device:GPU:0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Nov_30_19:15:10_Pacific_Standard_Time_2020
Cuda compilation tools, release 11.2, V11.2.67
Build cuda_11.2.r11.2/compiler.29373293_0


# Import the dataset

In [5]:
from dataset_handler import create_dataset, remove_rare_chars
DATASET_DIR = r'../../Dataset/trdg'
LABELS_PATH = r'../../Dataset/trdg/labels.txt'
FONT_PATH = r'../../Dataset/NomNaTong-Regular.ttf'
HEIGHT = 148
WIDTH = 32

## Load and remove records with rare characters

In [7]:
img_paths, labels, vocabs = create_dataset(DATASET_DIR, LABELS_PATH, sim2tra=True)
img_paths, labels, vocabs = remove_rare_chars(img_paths, labels, vocabs, threshold=3)
print('Number of images found:', len(img_paths))
print('Number of labels found:', len(labels))
print('Number of unique characters:', len(vocabs))
print('Characters present:', vocabs)

Number of images found: 4437
Number of labels found: 4437
Number of unique characters: 3202
Characters present:{'㐌': 316, '浪': 302, '朱': 247, '\U000f047c': 214, '\U000f070b': 213, '尼': 211, '吏': 200, '𦋦': 197, '𢚸': 197, '娘': 197, '𠊛': 192, '固': 189, '拱': 172, '之': 167, '情': 161, '埃': 161, '買': 151, '芇': 146, '𠬠': 145, '衝': 143, '仙': 139, '欺': 138, '羣': 136, '𠳒': 136, '別': 135, '麻': 130, '𠓨': 127, '𧗱': 124, '爫': 122, '塘': 117, '花': 114, '𠫾': 109, '𠄩': 108, '払': 106, '渚': 106, '𡗶': 106, '浽': 104, '傷': 103, '雲': 102, '強': 102, '吶': 101, '𦖑': 99, '渃': 99, '饒': 97, '庒': 97, '如': 96, '些': 95, '分': 94, '畧': 93, '咍': 92, '昆': 90, '鐄': 90, '低': 90, '時': 90, '𢬣': 89, '𡢐': 88, '於': 88, '身': 87, '兜': 87, '𧡊': 86, '細': 85, '生': 81, '月': 80, '皮': 78, '戈': 78, '公': 78, '調': 76, '﨤': 74, '𣈜': 74, '油': 73, '沛': 73, '牢': 73, '爲': 73, '仍': 73, '茹': 73, '命': 72, '排': 70, '包': 66, '娥': 66, '典': 66, '碎': 65, '㗂': 64, '\U000f061a': 63, '𠁀': 61, '\U000f02c7': 61, '恩': 61, '耒': 59, '迻': 59, '茄': 57, '共': 57, '

## Visualize the data

In [None]:
from visualizer import visualize_images_labels
visualize_images_labels(img_paths, labels, font_path=FONT_PATH, text_x=WIDTH + 3)