In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import json
import re
import string
import os

### Loading Data...

In [2]:
train_data_path = "data/en.train.json"
validation_data_path = "data/en.dev.json"
test_data_path = "data/en.trial.complete.json"

assert os.path.exists(train_data_path)
assert os.path.exists(validation_data_path)
assert os.path.exists(test_data_path)

In [3]:
with open(train_data_path) as f : 
    train_data = json.load(f)
    
with open(validation_data_path) as f : 
    validation_data = json.load(f)
    
with open(test_data_path) as f : 
    test_data = json.load(f)
    

In [4]:
print("Total Train Data Samples : " , len(train_data))
print("Total Val Data Samples : " , len(validation_data))
print("Total Test Data Samples : " , len(test_data))

Total Train Data Samples :  43608
Total Val Data Samples :  6375
Total Test Data Samples :  200


In [5]:
print('Sanity Checks...')
print('\nSample Train Data : ')
print('Available keys : ' , train_data[0].keys())
print('ID : ' , train_data[0]['id'])
print('GLOSS : ' , train_data[0]['gloss'])


print('\nSample Train Data : ')
print('Available keys : ' , validation_data[0].keys())
print('ID : ' , validation_data[0]['id'])
print('GLOSS : ' , validation_data[0]['gloss'])

print('\nSample Train Data : ')
print('Available keys : ' , test_data[0].keys())
print('ID : ' , test_data[0]['id'])
print('GLOSS : ' , test_data[0]['gloss'])
print('WORD : ' , test_data[0]['word'])
print('POS : ' , test_data[0]['pos'])
print('EXAMPLE : ' , test_data[0]['example'])
print('TYPE : ' , test_data[0]['type'])
print('COUNTS : ' , test_data[0]['counts'])
print('F_RNK : ' , test_data[0]['f_rnk'])
print('CONCRETE : ' , test_data[0]['concrete'])
print('POLYSEMOUS : ' , test_data[0]['polysemous'])

print('\nEmbedding sizes : ')
print('SGNS SIZE : ', len(test_data[0]['sgns']))
print('CHAR SIZE : ', len(test_data[0]['char']))
print('ELECTRA SIZE : ', len(test_data[0]['electra']))

print('\nMaximum gloss length : ')

char_level = max(train_data+validation_data+test_data, key=lambda x: len(x['gloss']))
word_level = max(train_data+validation_data+test_data, key=lambda x: len(x['gloss'].split()))

print('Character level : ', len(char_level['gloss']))
print('Word level : ', len(word_level['gloss'].split()))

Sanity Checks...

Sample Train Data : 
Available keys :  dict_keys(['id', 'gloss', 'sgns', 'char', 'electra'])
ID :  en.train.1
GLOSS :  A blemish .

Sample Train Data : 
Available keys :  dict_keys(['id', 'gloss', 'sgns', 'char', 'electra'])
ID :  en.dev.1
GLOSS :  A meal consisting of food normally eaten in the morning , which may typically include eggs , sausages , toast , bacon , etc .

Sample Train Data : 
Available keys :  dict_keys(['id', 'word', 'pos', 'gloss', 'example', 'type', 'counts', 'f_rnk', 'concrete', 'polysemous', 'sgns', 'char', 'electra'])
ID :  en.trial.1
GLOSS :  Pleasant ; clear .
WORD :  beautiful
POS :  adjective
EXAMPLE :  It 's beautiful outside , let 's go for a walk .
TYPE :  synonym/antonym-based
COUNTS :  124908
F_RNK :  706
CONCRETE :  0
POLYSEMOUS :  0

Embedding sizes : 
SGNS SIZE :  256
CHAR SIZE :  256
ELECTRA SIZE :  256

Maximum gloss length : 
Character level :  643
Word level :  129


### Tokenizing gloss...

In [6]:
def process_gloss(gloss) : 
    
    '''
    1. Removes puntuations and numerals from the gloss. 
    2. Lowercases the words. 
    3. Returns list of words. 
    '''
    
    processed = re.sub(r'[^\w\s]','',gloss)
    processed = processed.lower()
    return processed.split()

In [7]:
def get_word_tokenizer(data, unk_ratio=0.0) : 
    
    '''
    Returns list of all words in the gloss. Index serves as the 
    tokenization index. 
    
    unk_ratio is used to set a percentage of singletons in 
    the gloss to "UNK" token. 
    '''
    
    assert unk_ratio <= 1.00, 'unk_ratio should be between 0 and 1.'
    tokens = []
    token_counts = {}
    
    # preparing token list
    
    for word in test_data + train_data + validation_data : 
        processed_gloss = process_gloss(word['gloss'])

        for token in processed_gloss :

            if token not in tokens : 
                tokens.append(token)
                token_counts[token] = 1
            else : 
                token_counts[token] += 1
                
    
    if unk_ratio > 0.0 :
         
        '''
        If UNK tokens are required - 
        1. Find all the singleton tokens
        2. unk_ratio of all the singleton tokens are bucketed in unk_tokens 
        3. Remove these unk_tokens from the tokens and token_counts 
        '''
        
        singleton_tokens = [token for token, count in token_counts.items() if count==1]
        
        num_unk_tokens = int(len(singleton_tokens) * unk_ratio)
        unk_tokens = singleton_tokens[:num_unk_tokens]
        
        unk_token_count = 0
        for token in unk_tokens : unk_token_count += token_counts[token] 
        
        tokens = [token for token in tokens if token not in unk_tokens]
        token_counts = {token:count for token, count in token_counts.items() if token not in unk_tokens}
        
        
    '''
    1. Adding UNK token
    2. Adding PAD token
    '''
    tokens.append("UNK")
    token_counts["UNK"] = unk_token_count
    tokens.append("PAD")
        
    return tokens, token_counts

def get_char_tokenizer(data) : 
    
    tokens = list(string.ascii_lowercase + " " + ".")
    tokens.append("UNK")
    tokens.append("PAD")
    return tokens

In [8]:
data = train_data + validation_data + test_data

word_tokenizer, word_counts = get_word_tokenizer(data, unk_ratio=0.25)
char_tokenizer = get_char_tokenizer(data)

word_vocab_size = len(word_tokenizer)
char_vocab_size = len(char_tokenizer)

In [9]:
def tokenize(gloss, tokenizer, tokenize_level="WORD", pad=False, target_len=100) : 
    
    processed_gloss = process_gloss(gloss)
    tokens = []
    
    if tokenize_level=="WORD" :       
        words = processed_gloss
    else : 
        words = " ".join(processed_gloss)
        
        
    for word in words : 
        try : 
            tokens.append(tokenizer.index(word))
        except : 
            tokens.append(tokenizer.index("UNK"))
    
    if pad : 
        
        pad_token = tokenizer.index("PAD")
        
        if len(tokens)>=target_len : 
            return tokens[:target_len]
        
        while len(tokens) < target_len : 
            tokens.append(pad_token)
    
        
    return tokens

In [10]:
print('Word Level Tokenizer : ')
print('Token0 is UNK : ' , tokenize("arthur lay in the mud", word_tokenizer, tokenize_level="WORD"))
print('Token1 is UNK : ' , tokenize("ford prefect saw the vogons", word_tokenizer, tokenize_level="WORD"))
print('Token0 is UNK : ' , tokenize("slartibaarfast", word_tokenizer, tokenize_level="WORD"))
print('\nChar Level Tokenizer : ')
print('Padded : ' , tokenize("arthur lay in the mud", char_tokenizer, tokenize_level="CHAR", pad=True, target_len=30))
print('Unpadded : ' , tokenize("ford prefect saw the vogons", char_tokenizer, tokenize_level="CHAR"))
print('Token13 is UNK : ' , tokenize("slartibaarfasΩt", char_tokenizer, tokenize_level="CHAR"))

Word Level Tokenizer : 
Token0 is UNK :  [25162, 3545, 25, 42, 9361]
Token1 is UNK :  [18605, 25162, 18048, 42, 25162]
Token0 is UNK :  [25162]

Char Level Tokenizer : 
Padded :  [0, 17, 19, 7, 20, 17, 26, 11, 0, 24, 26, 8, 13, 26, 19, 7, 4, 26, 12, 20, 3, 29, 29, 29, 29, 29, 29, 29, 29, 29]
Unpadded :  [5, 14, 17, 3, 26, 15, 17, 4, 5, 4, 2, 19, 26, 18, 0, 22, 26, 19, 7, 4, 26, 21, 14, 6, 14, 13, 18]
Token13 is UNK :  [18, 11, 0, 17, 19, 8, 1, 0, 0, 17, 5, 0, 18, 28, 19]


In [11]:
def train_generator(batch_size) :
    
    start = 0 
    train_data_size = len(train_data)

    while True : 
        
        char_level_tokens = []
        word_level_tokens = []

        sgns_embeddings = []
        char_embeddings = []
        electra_embeddings = []
        
        end = start + batch_size
        
        if end > train_data_size : 
            end = train_data_size 
        elif end == train_data_size : 
            start = 0
            end = start + batch_size
        else : 
            pass
        
        curr_batch = train_data[start:end]
        
        for sample in curr_batch : 
            
            gloss = sample['gloss']
            
            char_level_tokens.append(tokenize(gloss, char_tokenizer, tokenize_level="CHAR", pad=True, target_len=char_seq_len))
            word_level_tokens.append(tokenize(gloss, word_tokenizer, tokenize_level="WORD", pad=True, target_len=word_seq_len))
            
            sgns_embeddings.append(sample['sgns'])
            char_embeddings.append(sample['char'])
            electra_embeddings.append(sample['electra'])
            
        yield (np.array(char_level_tokens), np.array(word_level_tokens)),\
        (np.array(sgns_embeddings), np.array(char_embeddings), np.array(electra_embeddings))

### Model definition

In [12]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout, BatchNormalization, Embedding, Concatenate, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras import optimizers, losses, metrics

In [13]:
batch_size = 64
char_seq_len = 640
word_seq_len = 128
embedding_size = 256

learning_rate = 0.001
epochs = 1000
steps_per_epoch = int(len(train_data)/epochs)
batch_size = 256

checkpoint_dir = 'models/vanilla_lstm2.0'
csv_log_path = 'logs/vanilla_lstm2.0_training.log'
history_log_path = 'logs/vanilla_lstm2.0_history.log'

In [14]:
(char_level_tokens, word_level_tokens), (sgns_embeddings, char_embeddings, electra_embeddings) = next(train_generator(batch_size))

assert char_level_tokens.shape == (batch_size, char_seq_len)
assert word_level_tokens.shape == (batch_size, word_seq_len)
assert sgns_embeddings.shape == (batch_size, embedding_size)
assert char_embeddings.shape == (batch_size, embedding_size)
assert electra_embeddings.shape == (batch_size, embedding_size)

In [15]:
char_inputs = Input(shape=(char_seq_len, ))
char_embedding = Embedding(char_vocab_size, embedding_size)(char_inputs)
char_lstm = LSTM(128, recurrent_dropout=0.1, return_sequences=True)(char_embedding)
char_lstm = LSTM(128, recurrent_dropout=0.1, return_sequences=True)(char_lstm)
char_output = Flatten()(char_lstm)


word_inputs = Input(shape=(word_seq_len, ))
word_embedding = Embedding(word_vocab_size, embedding_size)(word_inputs)
word_lstm = LSTM(128, recurrent_dropout=0.1, return_sequences=True)(word_embedding)
word_lstm = LSTM(128, recurrent_dropout=0.1, return_sequences=True)(word_lstm)
word_output = Flatten()(word_lstm)

concatenated = Concatenate(axis=-1)([char_output, word_output])
normalized = BatchNormalization()(concatenated)

dense = Dense(2048, activation='relu')(normalized)
dense = BatchNormalization()(dense)
dense = Dense(1024, activation='relu')(dense)
dense = Dropout(0.2)(dense)
dense = Dense(512, activation='relu')(dense)
dense = Dropout(0.2)(dense)
dense = BatchNormalization()(dense)
dense = Dense(512, activation='relu')(dense)
dense = Dense(256, activation='relu')(dense)

sgns_out = Dense(embedding_size, activation='tanh')(dense)
chars_out = Dense(embedding_size, activation='tanh')(dense)
electra_out = Dense(embedding_size, activation='tanh')(dense)


model = Model(inputs=(char_inputs, word_inputs), 
             outputs=(sgns_out, chars_out, electra_out))



2021-12-06 14:39:14.138757: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-06 14:39:14.149483: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-06 14:39:14.150272: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-06 14:39:14.151593: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags



In [16]:
model.compile(optimizer=optimizers.Adam(), 
              loss=losses.MeanSquaredError())

In [18]:
if not os.path.isdir(checkpoint_dir) : 
    os.mkdir(checkpoint_dir)

model_checkpoint = ModelCheckpoint(filepath=checkpoint_dir, save_freq=5*steps_per_epoch)

csv_log_path = CSVLogger(csv_log_path)

In [None]:
history = model.fit(train_generator(batch_size), 
                   batch_size=batch_size,
                   epochs=epochs, 
                   steps_per_epoch=steps_per_epoch, 
                   callbacks=[model_checkpoint, csv_log_path])

2021-12-06 14:40:16.311287: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000

2021-12-06 15:00:03.562640: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2021-12-06 15:00:10.708990: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.
2021-12-06 15:00:11.019468: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.
2021-12-06 15:00:11.331179: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.


INFO:tensorflow:Assets written to: models/vanilla_lstm2.0/assets
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000

2021-12-06 15:19:05.125287: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.
2021-12-06 15:19:05.437572: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.


INFO:tensorflow:Assets written to: models/vanilla_lstm2.0/assets
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000

In [49]:
print(history.params)
print(history.history)

NameError: name 'history' is not defined

In [50]:
X , Y = next(train_generator(1))

model(X)

(<tf.Tensor: shape=(1, 256), dtype=float32, numpy=
 array([[-5.49655378e-01,  2.97194690e-01,  6.95654303e-02,
         -8.11954066e-02,  5.77607214e-01, -3.92344892e-01,
          1.21077761e-01, -1.22862101e-01,  8.06132674e-01,
          4.18311864e-01, -3.53744537e-01, -7.54541218e-01,
          3.82973224e-01, -3.80358666e-01,  3.59223634e-01,
          5.42707980e-01,  7.37505034e-02,  3.34000178e-02,
         -4.82165277e-01, -1.91515803e-01,  2.92198751e-02,
         -5.04274905e-01,  2.36272603e-01, -1.15609974e-01,
         -4.06451553e-01,  5.23873381e-02,  2.89239585e-01,
          1.40692845e-01,  1.59932554e-01,  3.68848979e-01,
         -2.48135790e-01,  4.65482831e-01, -1.03306912e-01,
          7.01117814e-01,  3.12040746e-01, -6.93565488e-01,
         -9.64240968e-01,  5.19052386e-01, -8.97340775e-01,
          6.09122753e-01,  4.48237568e-01,  1.94943532e-01,
         -1.02290325e-01, -1.53341323e-01, -4.47221808e-02,
          5.21485172e-02,  2.69740939e-01, -2.368

In [62]:
def val_generator(batch_size) :
    
    start = 0 
    train_data_size = len(validation_data)

    while True : 
        
        char_level_tokens = []
        word_level_tokens = []

        sgns_embeddings = []
        char_embeddings = []
        electra_embeddings = []
        
        end = start + batch_size
        
        if end > train_data_size : 
            end = train_data_size 
        elif end == train_data_size : 
            start = 0
            end = start + batch_size
        else : 
            pass
        
        curr_batch = validation_data[start:end]
        
        for sample in curr_batch : 
            
            gloss = sample['gloss']
            char_level_tokens.append(tokenize(gloss, char_tokenizer, tokenize_level="CHAR", pad=True, target_len=char_seq_len))
            word_level_tokens.append(tokenize(gloss, char_tokenizer, tokenize_level="WORD", pad=True, target_len=word_seq_len))
            
            sgns_embeddings.append(sample['sgns'])
            char_embeddings.append(sample['char'])
            electra_embeddings.append(sample['electra'])
            
        yield (np.array(char_level_tokens), np.array(word_level_tokens)),\
        (np.array(sgns_embeddings), np.array(char_embeddings), np.array(electra_embeddings))

In [68]:
val_X = []
val_Y = []


for sample in validation_data : 
    
    char_level_tokens = []
    word_level_tokens = []

    sgns_embeddings = []
    char_embeddings = []
    electra_embeddings = []
    
    gloss = sample['gloss']
    
    char_level_tokens.append(tokenize(gloss, char_tokenizer, tokenize_level="CHAR", pad=True, target_len=char_seq_len))
    word_level_tokens.append(tokenize(gloss, char_tokenizer, tokenize_level="WORD", pad=True, target_len=word_seq_len))

    sgns_embeddings.append(sample['sgns'])
    char_embeddings.append(sample['char'])
    electra_embeddings.append(sample['electra'])
    
    val_X.append((np.array(char_level_tokens), np.array(word_level_tokens)))
    val_Y.append((np.array(sgns_embeddings), np.array(char_embeddings), np.array(electra_embeddings)))

    # val_X.append(tuple(X))
    # val_Y.append(tuple(Y))
    
    
print("Number of validation samples : ", len(val_X))
assert len(val_X)==len(val_Y)
    



model.evaluate(val_generator(batch_size), batch_size=batch_size, steps=int(len(validation_data)/batch_size))

Number of validation samples :  6375


[4.239328861236572,
 1.1200318336486816,
 0.24299781024456024,
 2.8762998580932617]

In [69]:
model.evaluate([val_X[0]], [val_Y[0]], batch_size=1)



[2.993809938430786, 2.200071096420288, 0.2920176088809967, 0.5017211437225342]

In [70]:
model.save('models/vanilla_lstm')

2021-12-06 14:37:23.633559: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
2021-12-06 14:37:30.605374: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.
2021-12-06 14:37:31.235069: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.
2021-12-06 14:37:31.859863: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 805306368 exceeds 10% of free system memory.


INFO:tensorflow:Assets written to: models/vanilla_lstm/assets
