In [134]:
#C3W3

In [135]:
import trax 
from trax import layers as tl
import os 
import numpy as np
import pandas as pd


from utilsc3w3 import get_params, get_vocab
import random as rnd

# set random seeds to make this notebook easier to replicate
trax.supervised.training.Loop

trax.supervised.training.Loop

In [136]:
# Exploring data

In [137]:
# display original kaggle data
data = pd.read_csv("ner_dataset.csv", encoding = "ISO-8859-1") 
train_sents = open('datac3w3/small/train/sentences.txt', 'r').readline()
train_labels = open('datac3w3/small/train/labels.txt', 'r').readline()
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head(5))
del(data, train_sents, train_labels)

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [138]:
# 1.1.Importing Data

In [139]:
vocab, tag_map = get_vocab('datac3w3/large/words.txt', 'datac3w3/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'datac3w3/large/train/sentences.txt', 'datac3w3/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'datac3w3/large/val/sentences.txt', 'datac3w3/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'datac3w3/large/test/sentences.txt', 'datac3w3/large/test/labels.txt')

In [140]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["the"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 9
padded token: 35180


In [141]:
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [142]:
print('The number of outputs is tag_map', len(tag_map))
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


In [143]:
# 1.2. Data Generator

In [144]:
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):

    num_lines = len(x)
    
    lines_index = [*range(num_lines)]
    
    if shuffle:
        rnd.shuffle(lines_index)
    
    index = 0
    while True:
        buffer_x = [0] * batch_size
        buffer_y = [0] * batch_size

        max_len = 0
        for i in range(batch_size):
            if index >= num_lines:
                index = 0
                if shuffle:
                    rnd.shuffle(lines_index)

            buffer_x[i] = x[lines_index[index]]            
            buffer_y[i] = y[lines_index[index]]
            
            lenx = len(x[lines_index[index]])
            if lenx > max_len:
                max_len = lenx
            
            index += 1


        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)

        for i in range(batch_size):

            x_i = buffer_x[i]
            
            y_i = buffer_y[i]
            
            for j in range(len(x_i)):
                X[i, j] = x_i[j]                
                Y[i, j] = y_i[j]

        if verbose: print("index=", index)
        yield((X,Y))

In [145]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
dg = data_generator(batch_size, mini_sentences, mini_labels, vocab["<PAD>"], shuffle=False, verbose=True)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

index= 5
index= 2
(5, 30) (5, 30) (5, 30) (5, 30)
[    0     1     2     3     4     5     6     7     8     9    10    11
    12    13    14     9    15     1    16    17    18    19    20    21
 35180 35180 35180 35180 35180 35180] 
 [    0     0     0     0     0     0     1     0     0     0     0     0
     1     0     0     0     0     0     2     0     0     0     0     0
 35180 35180 35180 35180 35180 35180]


In [146]:
# Building the model

In [147]:

def NER(vocab_size=35181, d_model=50, tags=tag_map):

    model = tl.Serial(
      tl.Embedding(vocab_size, d_model),
      tl.LSTM(d_model),
      tl.Dense(len(tags)),
      tl.LogSoftmax()
      )
    return model

In [148]:
model = NER()

print(model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]


In [149]:
# 3. Train Model

In [150]:
from trax.supervised import training

rnd.seed(33)

batch_size = 64

train_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

eval_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, v_sentences, v_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

In [151]:
# 3.1. Training the model

In [152]:

def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):

    train_task = training.TrainTask(
      train_generator,
      loss_layer = tl.CrossEntropyLoss(),
      optimizer = trax.optimizers.Adam(0.01),
    )

    eval_task = training.EvalTask(
      labeled_data = eval_generator,
      metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
      n_eval_batches = 10
    )

    training_loop = training.Loop(
        NER,
        train_task,
        eval_tasks = eval_task,
        output_dir = output_dir) 

    training_loop.run(n_steps = train_steps)

    return training_loop

In [153]:
train_steps = 100            # In coursera we can only train 100 steps
!rm -f 'model/model.pkl.gz'  # Remove old model.pkl if it exists

# Train the model
training_loop = train_model(NER(), train_generator, eval_generator, train_steps)


Step      1: Total number of trainable weights: 1780117
Step      1: Ran 1 train steps in 3.91 secs
Step      1: train CrossEntropyLoss |  3.29953766
Step      1: eval  CrossEntropyLoss |  2.28489747
Step      1: eval          Accuracy |  0.00013775

Step    100: Ran 99 train steps in 78.46 secs
Step    100: train CrossEntropyLoss |  0.53927511
Step    100: eval  CrossEntropyLoss |  0.26379415
Step    100: eval          Accuracy |  0.93500191


In [154]:
# loading in a pretrained model..
model = NER()
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))

# Load the pretrained model
model.init_from_file('modelc3w3.pkl.gz', weights_only=True)

((array([[ 1.6746343 , -1.2386736 , -1.2588062 , ..., -0.00851805,
          -0.58789647,  0.21786927],
         [-0.49910122,  0.7476655 , -1.341309  , ...,  1.2629304 ,
           1.0195953 ,  0.9969896 ],
         [ 0.27949372,  0.5704571 ,  0.6879833 , ..., -1.8033195 ,
           0.10972298, -0.8717324 ],
         ...,
         [ 0.26727983, -0.4827838 ,  1.8206602 , ..., -1.4759233 ,
          -0.7623981 , -2.1510365 ],
         [ 0.09075634, -0.3017605 ,  0.97404975, ...,  0.9246912 ,
           0.73218006, -1.1800029 ],
         [ 0.20993397,  0.7626188 , -2.4542587 , ...,  1.3712136 ,
           0.07566787, -1.3855839 ]], dtype=float32),
  (((), ((), ())),
   ((array([[ 0.05901118, -0.1423949 , -0.20627049, ..., -0.32257217,
              0.27084   ,  0.06706806],
            [ 0.05068694, -0.15523194, -0.08237975, ...,  0.31102452,
              0.2882233 , -0.06509141],
            [ 0.10575256, -0.00132665, -0.0978074 , ..., -0.02731692,
              0.37720057,  0.2022040

In [155]:
#4. Compute Accuracy

In [156]:
#Example of a comparision on a matrix 
a = np.array([1, 2, 3, 4])
a == 2

array([False,  True, False, False])

In [157]:
# create the evaluation inputs
x, y = next(data_generator(len(test_sentences), test_sentences, test_labels, vocab['<PAD>']))
print("input shapes", x.shape, y.shape)

input shapes (7194, 70) (7194, 70)


In [158]:
# sample prediction
tmp_pred = model(x)
print(type(tmp_pred))
print(f"tmp_pred has shape: {tmp_pred.shape}")

<class 'jax.interpreters.xla._DeviceArray'>
tmp_pred has shape: (7194, 70, 17)


In [159]:
def evaluate_prediction(pred, labels, pad):
    outputs = np.argmax(pred, axis=2)
    print("outputs shape:", outputs.shape)

    mask = labels != pad
    print("mask shape:", mask.shape, "mask[0][20:30]:", mask[0][20:30])

    accuracy = np.sum(outputs == labels) / float(np.sum(mask))
    return accuracy


In [160]:
accuracy = evaluate_prediction(model(x), y, vocab['<PAD>'])
print("accuracy: ", accuracy)

outputs shape: (7194, 70)
mask shape: (7194, 70) mask[0][20:30]: [ True  True  True False False False False False False False]
accuracy:  0.9100196


In [161]:
# 5. Testing with your own sentence

In [162]:
def predict(sentence, model, vocab, tag_map):
    s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
    batch_data = np.ones((1, len(s)))
    batch_data[0][:] = s
    sentence = np.array(batch_data).astype(int)
    output = model(sentence)
    outputs = np.argmax(output, axis=2)
    labels = list(tag_map.keys())
    pred = []
    for i in range(len(outputs[0])):
        idx = outputs[0][i] 
        pred_label = labels[idx]
        pred.append(pred_label)
    return pred

In [163]:
sentence = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
s = [vocab[token] if token in vocab else vocab['UNK'] for token in sentence.split(' ')]
predictions = predict(sentence, model, vocab, tag_map)
for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x,y)

Peter I-per
White B-geo
House I-org
Sunday B-tim
White B-geo
House I-org
