In [1]:
import tensorflow as tf

In [2]:
# Enumerate the GPUs TensorFlow can see and switch each one to on-demand
# memory growth, then show the device list as a quick sanity check.
physical_devices = tf.config.experimental.list_physical_devices(device_type='GPU')
for physical_device in physical_devices:
    tf.config.experimental.set_memory_growth(device=physical_device, enable=True)

print(physical_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import numpy as np
import json
from tqdm import tqdm

In [4]:
!pwd

/home/sweet/1-workdir/nlp_attention/en_vi_attention_nlp


In [5]:
# Location of the preprocessed corpus and of the vocabulary file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the data.
db_dir = "/home/sweet/1-workdir/nlp_attention/en_vi_data_preprocess/src/"
db_file = "train-test.json"
dict_file = "dictionary.json"

# JSON is defined as UTF-8 text, so decode it explicitly instead of relying on
# the platform's locale encoding.
with open(db_dir + db_file, 'r', encoding='utf-8') as f_db, \
        open(db_dir + dict_file, 'r', encoding='utf-8') as f_dict:
    db = json.load(f_db)
    dictionary = json.load(f_dict)

# Sentence lists for the two splits (X = source side, Y = target side).
train_X = db['train_X']
train_Y = db['train_Y']
test_X = db['test_X']
test_Y = db['test_Y']

# Lookup tables for the source ('from') vocabulary.
dictionary_from = dictionary['from']['dictionary']
rev_dictionary_from = dictionary['from']['rev_dictionary']

# Lookup tables for the target ('to') vocabulary.
dictionary_to = dictionary['to']['dictionary']
rev_dictionary_to = dictionary['to']['rev_dictionary']

In [6]:
# Ids of the four special tokens, looked up in the source-side vocabulary.
# NOTE(review): the same ids are later applied to target-side data as well
# (PAD for padding, UNK inside str_idx); this is only right if both
# vocabularies assign these tokens identical ids - TODO confirm.
GO, PAD, EOS, UNK = (
    dictionary_from[token] for token in ('GO', 'PAD', 'EOS', 'UNK')
)

In [7]:
# Terminate every source training sentence with the EOS marker.
# The list is rebuilt from the sentences held in `db` rather than edited in
# place, so running this cell again gives the same result instead of appending
# a second ' EOS' (or corrupting the data once train_X holds token ids).
train_X = [sentence + ' EOS' for sentence in tqdm(db['train_X'])]

100%|██████████| 133317/133317 [00:00<00:00, 2354676.22it/s]


In [8]:
# Terminate every source test sentence with the EOS marker.
# The list is rebuilt from the sentences held in `db` rather than edited in
# place, so running this cell again gives the same result instead of appending
# a second ' EOS' (or corrupting the data once test_X holds token ids).
test_X = [sentence + ' EOS' for sentence in tqdm(db['test_X'])]

100%|██████████| 2821/2821 [00:00<00:00, 1161151.28it/s]


In [9]:
# Configuration constants for the notebook.
# NOTE(review): apart from the two vocabulary sizes printed below, none of
# these names are read by the cells visible in this part of the notebook;
# presumably they parameterise a model defined elsewhere - TODO confirm.
num_encoders=6
num_multi_heads=8
d_k=64
d_v=64
# Observation: d_model equals num_multi_heads * d_k (8 * 64 = 512).
d_model=512
optimizer="adam"
# NOTE(review): the visible padding code pads with PAD, not with this value;
# verify that both refer to the same token id.
null_token_value=0
# Vocabulary sizes are the number of entries in each dictionary.
source_vocab_size = len(dictionary_from)
target_vocab_size = len(dictionary_to)
share_word_embedding=False
# NOTE(review): this limit is not applied by the visible preprocessing; the
# recorded outputs show sequences being padded to 865 tokens.
MAXIMUM_TEXT_LENGTH = 250

print(source_vocab_size, target_vocab_size)

48114 22468


In [10]:
def str_idx(corpus, dic, unk=None):
    """Convert whitespace-tokenised sentences into lists of vocabulary ids.

    Args:
        corpus: iterable of strings; each string is split on whitespace.
        dic: mapping from token to integer id.
        unk: id substituted for tokens missing from ``dic``. When omitted,
            the id stored under ``dic['UNK']`` is used, so each vocabulary
            supplies its own unknown-token id; if ``dic`` has no 'UNK' entry
            the module-level ``UNK`` is used, as before.

    Returns:
        A list with one list of ids per sentence, in corpus order.
    """
    if unk is None:
        # Prefer the vocabulary's own UNK id over the global one, which is
        # taken from the source dictionary only.
        unk = dic['UNK'] if 'UNK' in dic else UNK
    return [[dic.get(token, unk) for token in sentence.split()]
            for sentence in corpus]

def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sentence in a batch to the length of the longest one.

    Args:
        sentence_batch: sequence of sentences, each a sequence of integer ids.
        pad_int: id appended to sentences shorter than the longest one.

    Returns:
        A tuple ``(padded, lengths)``: ``padded`` is an int32 array of shape
        ``(len(sentence_batch), longest_length)`` and ``lengths`` holds the
        original, unpadded length of each sentence. An empty batch yields a
        ``(0, 0)`` int32 array and an empty integer array instead of raising.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    if not seq_lens:
        # max() of an empty sequence raises; return well-formed empty arrays.
        return np.zeros((0, 0), dtype=np.int32), np.array(seq_lens, dtype=int)

    max_sentence_len = max(seq_lens)
    padded_seqs = [
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return np.array(padded_seqs, dtype=np.int32), np.array(seq_lens)

def pad_along_axis(array: np.ndarray, target_length: int, axis: int = 0,
                   pad_value=0) -> np.ndarray:
    """Extend ``array`` at the end of one axis up to ``target_length``.

    Args:
        array: array to pad.
        target_length: desired size of ``axis`` after padding.
        axis: axis along which to append the padding.
        pad_value: constant used for the new entries. Defaults to 0, the
            value that was previously hard-coded; pass the padding token id
            when it differs from 0.

    Returns:
        The padded array, or ``array`` itself (not a copy) when it is already
        at least ``target_length`` long along ``axis``.
    """
    pad_size = target_length - array.shape[axis]
    if pad_size <= 0:
        return array
    # Pad nothing on every axis except the requested one, and only at its end.
    npad = [(0, 0)] * array.ndim
    npad[axis] = (0, pad_size)
    return np.pad(array, pad_width=npad, mode='constant',
                  constant_values=pad_value)

In [11]:
# Replace the sentence strings with lists of token ids: source sentences use
# the source vocabulary, target sentences the target vocabulary.
# NOTE: the names are rebound from strings to ids, so this cell cannot be run
# a second time without reloading the data first.
train_X, test_X = (
    str_idx(corpus, dictionary_from) for corpus in (train_X, test_X)
)
train_Y, test_Y = (
    str_idx(corpus, dictionary_to) for corpus in (train_Y, test_Y)
)

In [12]:
# Pad the encoded training sentences into rectangular int32 arrays; the
# per-sentence lengths returned alongside are not needed here.
padded_train_X, _ = pad_sentence_batch(sentence_batch=train_X, pad_int=PAD)
padded_train_Y, _ = pad_sentence_batch(sentence_batch=train_Y, pad_int=PAD)
print(padded_train_X.shape, padded_train_Y.shape)

(133317, 685) (133317, 865)


In [13]:
# Pad the encoded test sentences into rectangular int32 arrays; the
# per-sentence lengths returned alongside are not needed here.
padded_test_X, _ = pad_sentence_batch(sentence_batch=test_X, pad_int=PAD)
padded_test_Y, _ = pad_sentence_batch(sentence_batch=test_Y, pad_int=PAD)
print(padded_test_X.shape, padded_test_Y.shape)

(2821, 104) (2821, 123)


In [14]:
# Bring source and target arrays of each split to a common width (the wider
# of the two), so both sides of a split share one sequence length.
# NOTE(review): despite its name, max_char_length counts token ids, not
# characters. pad_along_axis fills with 0 whereas the earlier padding used
# PAD; if PAD is not 0 the two fill values differ - TODO confirm.
max_char_length = max(arr.shape[1] for arr in (padded_train_X, padded_train_Y))
padded_train_X, padded_train_Y = (
    pad_along_axis(arr, max_char_length, axis=1)
    for arr in (padded_train_X, padded_train_Y)
)
print(padded_train_X.shape, padded_train_Y.shape)

max_char_length = max(arr.shape[1] for arr in (padded_test_X, padded_test_Y))
padded_test_X, padded_test_Y = (
    pad_along_axis(arr, max_char_length, axis=1)
    for arr in (padded_test_X, padded_test_Y)
)
print(padded_test_X.shape, padded_test_Y.shape)

(133317, 865) (133317, 865)
(2821, 123) (2821, 123)


In [15]:
def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``.

    Args:
        value: a bytes/str value, or an eager tensor holding one.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains exactly ``value``.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList cannot take an EagerTensor; pull out the raw value first.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)

def image_example(input_arr, target_arr):
    """Build a ``tf.train.Example`` holding one input/target pair.

    Despite the name, nothing here is image-specific: each argument is stored
    as a bytes feature, under the keys 'input' and 'target' respectively.

    Args:
        input_arr: value accepted by ``_bytes_feature`` for the 'input' key.
        target_arr: value accepted by ``_bytes_feature`` for the 'target' key.

    Returns:
        A ``tf.train.Example`` with the two features.
    """
    features = tf.train.Features(feature={
        'input': _bytes_feature(input_arr),
        'target': _bytes_feature(target_arr),
    })
    return tf.train.Example(features=features)

In [16]:
# Write one serialized Example per training pair to a TFRecord file in the
# current working directory.
record_file = 'train.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    for i in tqdm(range(len(padded_train_X))):
        serialized_input = serialize_array(padded_train_X[i])
        serialized_target = serialize_array(padded_train_Y[i])
        tf_example = image_example(serialized_input, serialized_target)
        writer.write(tf_example.SerializeToString())

100%|██████████| 133317/133317 [00:16<00:00, 8159.88it/s]


In [17]:
# Write one serialized Example per test pair to a TFRecord file in the
# current working directory.
record_file = 'test.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    for i in tqdm(range(len(padded_test_X))):
        serialized_input = serialize_array(padded_test_X[i])
        serialized_target = serialize_array(padded_test_Y[i])
        tf_example = image_example(serialized_input, serialized_target)
        writer.write(tf_example.SerializeToString())

100%|██████████| 2821/2821 [00:00<00:00, 9347.89it/s]
