In [1]:
from utils.model_pipeline import *

In [2]:
# Lookup tables built from the word list and the tag list.
vocab, tag_map = get_vocab('dataset/words.txt', 'dataset/tags.txt')

# Load the three splits with the same vocab / tag_map.
# The third value returned by get_params is not needed in this notebook.
t_sentences, t_labels, _ = get_params(
    vocab, tag_map,
    'dataset/train/sentences.txt',
    'dataset/train/labels.txt',
)
v_sentences, v_labels, _ = get_params(
    vocab, tag_map,
    'dataset/val/sentences.txt',
    'dataset/val/labels.txt',
)
test_sentences, test_labels, _ = get_params(
    vocab, tag_map,
    'dataset/test/sentences.txt',
    'dataset/test/labels.txt',
)

# Useful dataset sizes
# Number of entries in the vocabulary (first argument of the embedding layer).
vocab_size = len(vocab)
# Length of the first training sentence; the model cells reuse it as the
# embedding / LSTM size.
# NOTE(review): this ties the model width to the data -- confirm it is intended.
embedded_size = len(t_sentences[0])

In [3]:
# Build the tagger: embedding -> LSTM -> dense projection -> log-softmax.
model = tl.Serial(
    # Embedding layer: looks up one vector of size embedded_size per token id
    tl.Embedding(vocab_size, embedded_size),
    # LSTM layer with embedded_size units
    tl.LSTM(embedded_size),
    # Dense layer with one output unit per entry in tag_map
    tl.Dense(len(tag_map)),
    # LogSoftmax layer: turns the scores into log-probabilities
    tl.LogSoftmax(),
)
# Uncomment to display the model structure:
# print(model)

In [4]:
# Training hyperparameters:
#   batch_size  -- first argument given to data_generator for the train/val generators
#   train_steps -- last argument given to train_model
batch_size, train_steps = 64, 1000

In [5]:
# Start from a clean slate: delete any previous 'model' directory.
if os.path.exists('model'):
    shutil.rmtree('model')

# Training batches: build the raw generator first, then wrap it so that
# positions equal to the <PAD> id are masked via add_loss_weights.
train_generator = data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], True)
train_generator = trax.data.inputs.add_loss_weights(
    train_generator,
    id_to_mask=vocab['<PAD>'],
)

# Validation batches, wrapped the same way.
eval_generator = data_generator(batch_size, v_sentences, v_labels, vocab['<PAD>'], True)
eval_generator = trax.data.inputs.add_loss_weights(
    eval_generator,
    id_to_mask=vocab['<PAD>'],
)

# Hand the model, both generators and the step count to train_model.
training_loop = train_model(model, train_generator, eval_generator, train_steps)

  "jax.host_count has been renamed to jax.process_count. This alias "



Step      1: Total number of trainable weights: 255056
Step      1: Ran 1 train steps in 2.61 secs
Step      1: train CrossEntropyLoss |  2.02989888
Step      1: eval  CrossEntropyLoss |  0.89070380
Step      1: eval          Accuracy |  0.93600001

Step    100: Ran 99 train steps in 2.19 secs
Step    100: train CrossEntropyLoss |  0.09123459
Step    100: eval  CrossEntropyLoss |  0.12652491
Step    100: eval          Accuracy |  0.98200001

Step    200: Ran 100 train steps in 2.09 secs
Step    200: train CrossEntropyLoss |  0.00685188
Step    200: eval  CrossEntropyLoss |  0.14044195
Step    200: eval          Accuracy |  0.98196875

Step    300: Ran 100 train steps in 2.11 secs
Step    300: train CrossEntropyLoss |  0.00282311
Step    300: eval  CrossEntropyLoss |  0.15059000
Step    300: eval          Accuracy |  0.98203125

Step    400: Ran 100 train steps in 2.08 secs
Step    400: train CrossEntropyLoss |  0.00138004
Step    400: eval  CrossEntropyLoss |  0.15800631
Step    400: 

In [6]:
# load pretrained model
# Disabled by default. Uncomment the lines below to rebuild the network and
# restore weights from 'model/model.pkl.gz'.
# The snippet is kept as `#` comments rather than a bare triple-quoted string,
# so the cell no longer echoes its own source as output.
# The dense layer is sized with len(tag_map): `tags` is not defined in this
# notebook, while `tag_map` is what the model-definition cell uses.
#
# model = tl.Serial(
#     tl.Embedding(vocab_size, embedded_size),    # Embedding layer
#     tl.LSTM(embedded_size),                     # LSTM layer
#     tl.Dense(len(tag_map)),                     # Dense layer with len(tag_map) units
#     tl.LogSoftmax()                             # LogSoftmax layer
# )
# model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))
# model.init_from_file('model/model.pkl.gz', weights_only=True)

"\nmodel = tl.Serial(\n    tl.Embedding(vocab_size, embedded_size),    # Embedding layer\n    tl.LSTM(embedded_size),                     # LSTM layer\n    tl.Dense(len(tags)),                        # Dense layer with len(tags) units\n    tl.LogSoftmax()                             # LogSoftmax layer\n)\nmodel.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))\nmodel.init_from_file('model/model.pkl.gz', weights_only=True)\n"

In [7]:
# Draw the test set from a generator whose batch size is len(test_sentences).
eval_model_gen = data_generator(
    len(test_sentences),
    test_sentences,
    test_labels,
    vocab['<PAD>'],
    shuffle=True,
)
x, y = next(eval_model_gen)
# print("input shapes", x.shape, y.shape)

# Evaluate model accuracy (uncomment to run)
#print(f"accuracy: {evaluate_prediction(model(x), y, vocab['<PAD>'])}")

In [58]:
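# Test your own data (optional): uncomment the three lines below to replace
# eval_model_gen with a generator over a single custom sentence.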
#txts = "today we are in spain for the big final match between the cristiano ronaldo team real madrid and barcelona and we hope to see a really good football match"
#txts_token = [vocab[token] if token in vocab else vocab['UNK'] for token in txts.split(' ')]
#eval_model_gen = data_generator(1, [txts_token], [[0 for _ in range(len(txts_token))]], vocab['<PAD>'])

In [59]:
# create the evaluation inputs
# NOTE(review): this pulls the next batch from whichever `eval_model_gen` was
# defined last (the test-set generator unless the custom-sentence cell is
# uncommented and run first).
x, y = next(eval_model_gen)
# print("input shapes", x.shape, y.shape, x[0])

# sample prediction
tmp_pred = model(x)
# print(type(tmp_pred))
# print(f"tmp_pred has shape: {tmp_pred.shape}")

# Reverse lookup tables (id -> token, id -> tag name), built once so each
# token is decoded with a dict lookup instead of a scan over the whole table.
inv_vocab = {v: k for k, v in vocab.items()}
inv_tag_map = {v: k for k, v in tag_map.items()}

# Predicted tag id at every position (argmax over axis 2), computed once
# rather than once per displayed sentence.
pred_ids = np.argmax(tmp_pred, axis=2)

# Only the first sentence of the batch is displayed.
for idx, p in enumerate(x[:1]):
    # Decode this sentence's token ids back to words; rebuilt per sentence so
    # tokens never accumulate across iterations.
    x_aux = [inv_vocab[t] for t in p]

    # One tuple per token: (word, true id, predicted id, true tag, predicted tag).
    # Tokens whose predicted id is 0 are skipped; in this notebook's saved
    # output id 0 is shown as 'O' -- verify against tag_map if the tag file changes.
    # int(...) makes the ids plain Python ints so they are valid dict keys
    # whatever array type they come from.
    preds = [
        (t, p1, p2, inv_tag_map[int(p1)], inv_tag_map[int(p2)])
        for t, p1, p2 in zip(x_aux, y[idx], pred_ids[idx])
        if p2 != 0
    ]

    print(f"Sentence: {' '.join(x_aux)}\n")
    print(f'Preds: {preds}')

Sentence: today we are in spain for the big final match between the cristiano ronaldo team real madrid and barcelona and we hope to see a really good football match

Preds: [('spain', 0, 4, 'O', 'TEA'), ('cristiano', 0, 1, 'O', 'PLA'), ('ronaldo', 0, 1, 'O', 'PLA'), ('madrid', 0, 4, 'O', 'TEA'), ('barcelona', 0, 4, 'O', 'TEA')]
