# Named Entity Recognition
- Design the Neural Network Architecture
- Process features and represent them
- Understand word padding
- Implement LSTMs
- Test 

### Many French citizens are going to Morocco for Christmas  
- French: Geopolitical Entity
- Morocco: Geographic Entity
- Christmas: Time Indicator

In [1]:
import trax
from trax import layers as tl
import os
import numpy as np
import pandas as pd

from utils import get_params, get_vocab
import random as rnd

# Set random seeds to make this notebook easier to replicate.
# The call is the last expression of the cell, so its return value is what the cell displays.
trax.supervised.trainer_lib.init_random_number_generators(33)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/shankar/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shankar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


DeviceArray([ 0, 33], dtype=uint32)

### Exploring the Data
* geo: geographical entity
* org: organization
* per: person 
* gpe: geopolitical entity
* tim: time indicator
* art: artifact
* eve: event
* nat: natural phenomenon
* O: filler word


In [2]:
# Peek at the raw CSV and at the first pre-processed sentence/label pair.
data = pd.read_csv("ner_dataset.csv", encoding="ISO-8859-1")

# Use context managers so both file handles are closed as soon as the first
# line has been read (the original left them open for the kernel's lifetime).
with open('data/small/train/sentences.txt', 'r') as sentences_file:
    train_sents = sentences_file.readline()
with open('data/small/train/labels.txt', 'r') as labels_file:
    train_labels = labels_file.readline()

# readline() keeps the trailing newline, hence the blank line after each print.
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head(5))

# These objects were only needed for the preview; drop them from the namespace.
del data, train_sents, train_labels, sentences_file, labels_file

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [3]:
# Build the word -> id and tag -> id lookups from the large data set.
vocab, tag_map = get_vocab(
    'data/large/words.txt',
    'data/large/tags.txt',
)

# Load each split with the same lookups; every call returns (sentences, labels, size).
t_sentences, t_labels, t_size = get_params(
    vocab,
    tag_map,
    'data/large/train/sentences.txt',
    'data/large/train/labels.txt',
)
v_sentences, v_labels, v_size = get_params(
    vocab,
    tag_map,
    'data/large/val/sentences.txt',
    'data/large/val/labels.txt',
)
test_sentences, test_labels, test_size = get_params(
    vocab,
    tag_map,
    'data/large/test/sentences.txt',
    'data/large/test/labels.txt',
)

In [4]:
# `vocab` maps a word to its unique integer id.
print(f'vocab["the"]: {vocab["the"]}')
# The id stored under the '<PAD>' key is the value used to pad batches.
print(f"padded token: {vocab['<PAD>']}")

vocab["the"]: 9
padded token: 35180


In [5]:
# Show the full tag -> integer id mapping returned by get_vocab.
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [6]:
# Summary statistics for the loaded data.
print(f'The number of outputs is tag_map {len(tag_map)}')

# Vocabulary size, counting the <PAD> entry; kept in a global for later cells.
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print(f'The vocab size is {g_vocab_size}')

# Number of examples in the training and validation splits.
print(f'The training size is {t_size}')
print(f'The validation size is {v_size}')

# First training example, already converted to integer ids.
first_sentence, first_labels = t_sentences[0], t_labels[0]
print('An example of the first sentence is', first_sentence)
print('An example of its corresponding label is', first_labels)
del first_sentence, first_labels

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


### Data Generator

In [7]:
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    """Yield padded (X, Y) batches forever, cycling through the data.

    Args:
        batch_size: number of examples per batch.
        x: sequence of sentences, each a sequence of integer token ids.
        y: sequence of label sequences, aligned with `x` (same order; each
            label sequence is expected to be as long as its sentence).
        pad: value used to fill both X and Y beyond the end of each example.
        shuffle: if True, visit the examples in a random order that is
            re-drawn (via `rnd.shuffle`) every time the data set wraps around.
        verbose: if True, print the running position after each batch is built.

    Yields:
        Tuple (X, Y) of NumPy arrays of shape (batch_size, max_len), where
        max_len is the longest example in that batch.
    """
    # Number of examples available.
    num_lines = len(x)
    # Indirection table so the visiting order can be shuffled without touching x / y.
    lines_index = [*range(num_lines)]
    if shuffle:
        rnd.shuffle(lines_index)

    # Current position inside lines_index.
    index = 0
    while True:
        # Temporary buffers holding the raw (unpadded) examples of this batch.
        buffer_x = [0] * batch_size
        buffer_y = [0] * batch_size

        max_len = 0
        for i in range(batch_size):
            # Wrap around at the end of the data set, reshuffling if requested.
            if index >= num_lines:
                index = 0
                if shuffle:
                    rnd.shuffle(lines_index)

            position = lines_index[index]
            buffer_x[i] = x[position]
            buffer_y[i] = y[position]

            # Track the longest example so the batch arrays are wide enough.
            max_len = max(max_len, len(buffer_x[i]), len(buffer_y[i]))
            index += 1

        # Batch arrays start out entirely filled with the pad value.
        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)

        # Overwrite the leading positions of each row with the real values.
        for i in range(batch_size):
            x_i = buffer_x[i]
            y_i = buffer_y[i]
            X[i, :len(x_i)] = x_i
            Y[i, :len(y_i)] = y_i

        if verbose:
            print(f'index={index}')

        # The yield must run for every batch. It used to be nested under
        # `if(verbose)`, so a non-verbose generator looped forever without
        # ever producing a batch.
        yield (X, Y)

In [8]:
# Smoke-test the generator on the first eight training examples.
# With eight examples and batches of five, the second batch wraps around to the start.
batch_size = 5
mini_sentences = t_sentences[:8]
mini_labels = t_labels[:8]
dg = data_generator(
    batch_size,
    mini_sentences,
    mini_labels,
    pad=vocab["<PAD>"],
    shuffle=False,
    verbose=True,
)

# Pull two consecutive batches.
X1, Y1 = next(dg)
X2, Y2 = next(dg)

# Shapes of both batches, then the first padded sentence with its labels.
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0], "\n", Y1[0])

index=5
index=2
(5, 30) (5, 30) (5, 30) (5, 30)
[    0     1     2     3     4     5     6     7     8     9    10    11
    12    13    14     9    15     1    16    17    18    19    20    21
 35180 35180 35180 35180 35180 35180] 
 [    0     0     0     0     0     0     1     0     0     0     0     0
     1     0     0     0     0     0     2     0     0     0     0     0
 35180 35180 35180 35180 35180 35180]


### Building the Model
- Use input tensors you built in your data generator
- Feed it into an Embedding layer, to produce more semantic entries
- Feed it into an LSTM layer
- Run the output through a linear layer
- Run the result through a log softmax layer to get the predicted class for each word

In [9]:
def NER(vocab_size=35181, d_model=50, tags=tag_map):
    """Assemble the tagging network as a serial stack of Trax layers.

    Args:
        vocab_size: first argument of the Embedding layer (vocabulary size).
        d_model: embedding dimension, also passed to the LSTM layer.
        tags: mapping of tags; only its length is used, as the width of the
            Dense output layer. The default is bound to `tag_map` when this
            function is defined.

    Returns:
        tl.Serial combining Embedding -> LSTM -> Dense -> LogSoftmax.
    """
    n_tags = len(tags)

    embedding = tl.Embedding(vocab_size, d_model)
    recurrent = tl.LSTM(d_model)
    projection = tl.Dense(n_tags)
    log_probs = tl.LogSoftmax()

    return tl.Serial(embedding, recurrent, projection, log_probs)

In [10]:
# Instantiate the model with the default arguments of NER().
model = NER()
# Print the model to show its layer stack.
print(model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]


### Train the Model

In [11]:
from trax.supervised import training
# Seed Python's RNG so the shuffling done by the generators is repeatable.
rnd.seed(33)
batch_size = 64

# Training stream: shuffled batches, with the <PAD> id (35180) passed as the id to mask.
train_generator = trax.supervised.inputs.add_loss_weights(
    data_generator(batch_size, t_sentences, t_labels, pad=vocab['<PAD>'], shuffle=True),
    id_to_mask=vocab['<PAD>'],
)

# Validation stream: built the same way from the validation split.
eval_generator = trax.supervised.inputs.add_loss_weights(
    data_generator(batch_size, v_sentences, v_labels, pad=vocab['<PAD>'], shuffle=True),
    id_to_mask=vocab['<PAD>'],
)

In [12]:
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model',
                learning_rate=0.5, n_eval_batches=None):
    """Train a model with a Trax training loop and return the loop.

    Args:
        NER: the model instance to train (note: this parameter shadows the
            `NER` builder function inside this body).
        train_generator: stream of training batches for the TrainTask.
        eval_generator: stream of validation batches for the EvalTask.
        train_steps: number of steps to run the loop for.
        output_dir: directory handed to the training loop as `output_dir`.
        learning_rate: value passed to the Adam optimizer. Defaults to the
            previously hard-coded 0.5.
            NOTE(review): 0.5 is unusually large for Adam - confirm it is intended.
        n_eval_batches: number of batches per evaluation. When None, falls
            back to the notebook-level `batch_size`, which is what the
            original code used here.

    Returns:
        The `training.Loop` instance after `run` has completed.
    """
    if n_eval_batches is None:
        # Backward-compatible default: read the global, exactly as before.
        n_eval_batches = batch_size

    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(learning_rate)
    )
    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=n_eval_batches
    )

    training_loop = training.Loop(
        NER,
        train_task,
        eval_task=eval_task,
        output_dir=output_dir
    )
    training_loop.run(n_steps=train_steps)

    return training_loop

In [None]:
train_steps = 100            # In coursera we can only train 100 steps
# !rm -f 'model/model.pkl.gz'  # Remove old model.pkl if it exists

# Train a freshly built model on the generators created above.
training_loop = train_model(
    NER(),
    train_generator,
    eval_generator,
    train_steps=train_steps,
)

In [None]:
# Load a pretrained model: build a fresh network, then overwrite its weights from disk.
model = NER()
# Initialise the layers from an input signature of shape (1, 1) and dtype int32.
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))

# Load the pretrained weights only (weights_only=True) from 'model.pkl.gz'.
# NOTE(review): train_model defaults to output_dir='model', while this path is
# relative to the working directory - confirm which checkpoint is meant.
model.init_from_file('model.pkl.gz', weights_only=True)

### Compute Accuracy

In [None]:
# Example of a comparison on an array: `==` is applied element-wise,
# producing a boolean array that marks the positions where `a` equals 2.
a = np.array([1, 2, 3, 4])
a == 2

In [None]:
# Create the evaluation inputs: request a single batch whose size is the number of
# test sentences, so x and y cover the whole (unshuffled) test split, padded with <PAD>.
x, y = next(data_generator(len(test_sentences), test_sentences, test_labels, vocab['<PAD>']))
print("input shapes", x.shape, y.shape)

In [None]:
def evaluate_prediction(pred, labels, pad):
    """Compute token-level accuracy while ignoring padded positions.

    Args:
        pred: array of per-class scores; the class axis is axis 2
            (the predicted class is taken with argmax over that axis).
        labels: array of true class ids, with padded positions set to `pad`.
        pad: label value that marks padding.

    Returns:
        Fraction of non-padded positions whose predicted class equals the
        label, or 0.0 when there are no non-padded positions at all.
    """
    # Predicted class id for every position.
    outputs = np.argmax(pred, axis=2)

    # True where the label is a real token, False on padding.
    mask = labels != pad
    n_real = np.sum(mask)
    if n_real == 0:
        # Nothing to score; avoid a division by zero.
        return 0.0

    # Count hits on real positions only. Without the mask a padded position
    # is counted as correct whenever the pad value coincides with a class id.
    n_correct = np.sum((outputs == labels) & mask)
    accuracy = n_correct / float(n_real)

    return accuracy

In [None]:
# Score the model's output for the evaluation inputs, ignoring <PAD> positions.
accuracy = evaluate_prediction(pred=model(x), labels=y, pad=vocab['<PAD>'])
print("accuracy: ", accuracy)

### Testing with your own Sentence

In [None]:
def predict(sentence, model, vocab, tag_map):
    """Predict one tag per space-separated token of `sentence`.

    Args:
        sentence: input text; tokens are obtained with `sentence.split(' ')`.
        model: callable taking an integer array of shape (1, n_tokens) and
            returning per-class scores with the class axis at axis 2.
        vocab: mapping token -> integer id; must contain the key 'UNK',
            which is used for tokens that are not in the vocabulary.
        tag_map: mapping tag name -> class index.

    Returns:
        List of tag names, one for each token, in order.
    """
    # Map each token to its id, falling back to the UNK id for unknown words.
    unk_id = vocab['UNK']
    token_ids = [vocab.get(token, unk_id) for token in sentence.split(' ')]

    # The model expects a batch dimension: shape (1, n_tokens), integer dtype.
    batch = np.array([token_ids]).astype(int)

    output = model(batch)
    outputs = np.argmax(output, axis=2)

    # Look tags up by class index instead of relying on dict insertion order.
    index_to_tag = {index: tag for tag, index in tag_map.items()}

    # The original loop referenced the misspelled name `ouputs`, raising NameError.
    pred = [index_to_tag[int(idx)] for idx in outputs[0]]

    return pred