# Make LSTMs Great Again

# Named Entity Recognition on Twitter Data

## Reading the Data

The corpus contains tweets and named-entity tags. Each line of the corpus is a token followed by its tag, separated by a space.

Different tweets are separated by an empty line.

Usernames (tokens that start with `@`) are replaced with `<USR>`, and URLs (tokens that start with `http://` or `https://`) are replaced with `<URL>`.

In [1]:
def read_data(file_path):
    """Read a token-per-line corpus of tweets into parallel token/tag lists.

    Every non-empty line holds a token and its tag separated by whitespace;
    an empty line ends the current tweet. Tokens starting with "@" are
    replaced with "<USR>", and tokens starting with "http://" or "https://"
    are replaced with "<URL>".

    Args:
        file_path: Path of the UTF-8 encoded corpus file.

    Returns:
        A pair ``(tokens, tags)``. ``tokens[i]`` is the list of tokens of
        tweet ``i`` and ``tags[i]`` is the list of its tags, aligned
        position by position.

    Raises:
        ValueError: If a non-empty line does not split into exactly two
            whitespace-separated fields.
    """
    tokens = []  # List of list of words in a tweet, for all tweets
    tags = []  # List of list of tags in a tweet, aligned with `tokens`

    tweet_tokens = []
    tweet_tags = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_path, encoding='utf-8') as corpus:
        for line in corpus:
            line = line.strip()  # remove leading and trailing space
            if not line:
                # Blank line: the current tweet (if any) is complete.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                tweet_tokens = []
                tweet_tags = []
                continue

            token, tag = line.split()
            if token.startswith("@"):
                token = "<USR>"  # Replace username with <USR>
            elif token.startswith(("http://", "https://")):
                token = "<URL>"  # Replace links with <URL>
            tweet_tokens.append(token)
            tweet_tags.append(tag)

    # A file that does not end with a blank line still has a pending tweet;
    # without this flush the last tweet would be silently dropped.
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)

    return tokens, tags

### Loading the Train, Validation and Test Data

In [3]:
# Load the three corpus splits; each call returns (list of tweets, list of tag sequences).
train_tokens, train_tags = read_data(file_path='Data/train.txt')
validation_tokens, validation_tags = read_data(file_path='Data/validation.txt')
test_tokens, test_tags = read_data(file_path='Data/test.txt')

### Exploring the Data

In [9]:
# Show the first training tweet on one line, each token followed by a space.
for word in train_tokens[0]:
    print(word, end=" ")

RT <USR> : Online ticket sales for Ghostland Observatory extended until 6 PM EST due to high demand . Get them before they sell out ... 

In [10]:
# Show the tag sequence of the first training tweet, each tag followed by a space.
for tag in train_tags[0]:
    print(tag, end=" ")

O O O O O O O B-musicartist I-musicartist O O O O O O O O O O O O O O O O O 

Each element loaded to train tokens is a tweet, which in turn is a list of words.

In [18]:
# Report how many tweets the training split contains.
print(f"We have {len(train_tokens)} tweets")

We have 5795 tweets


Checking for missing tags/tweet in the test, train and validation sets

In [22]:
# Sanity check: every split must hold exactly one tag sequence per tweet.
# `all_right` is initialised up front so that a mismatch fails the check
# below instead of dying with a NameError on an unbound variable.
all_right = 0
if len(train_tokens) != len(train_tags):
    print("train mismatch")
elif len(validation_tokens) != len(validation_tags):
    print("validation mismatch")
elif len(test_tokens) != len(test_tags):
    print("test mismatch")
else:
    all_right = 1
    print("Data all set")

# Raised explicitly (rather than via `assert`) so the check still runs
# when Python is started with -O.
if all_right != 1:
    raise AssertionError("number of tweets and tag sequences differ in at least one split")

Data all set


## Preparing the Dictionaries

We need 2 mappings for training the NN

1. token --> tokenID
2. tag --> tagID

tokenID addresses the row in the embedding matrix

tagID is the ID of the tag, used to build its dummy (one-hot) encoding

In [25]:
from collections import defaultdict

def build_dict(tokens_or_tags, special_tokens):
    """Build the forward and inverse index mappings for tokens or tags.

    Args:
        tokens_or_tags: List of lists of tokens (or tags).
        special_tokens: Entries that receive the first indices, in the
            order given.

    Returns:
        A pair ``(tok2idx, idx2tok)``. ``tok2idx`` is a defaultdict mapping
        each entry to its index, with missing keys defaulting to 0;
        ``idx2tok`` is the list of entries ordered by index.
    """
    # Create a dict with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []

    def register(entry):
        # The next free index always equals the number of entries stored so far.
        tok2idx[entry] = len(idx2tok)
        idx2tok.append(entry)

    for special in special_tokens:
        register(special)

    for sequence in tokens_or_tags:
        for entry in sequence:
            if entry in tok2idx:
                continue
            register(entry)

    return tok2idx, idx2tok

Special Tokens:

UNK : Unknown tokens - the ones found outside of the vocabulary

PAD : Padding token - used to pad sentences to the same length so that they can be grouped into batches

In [27]:
# Special entries are registered first, so they occupy the lowest indices.
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Token vocabulary is built from the train and validation tweets.
token2idx, idx2token = build_dict(
    tokens_or_tags=train_tokens + validation_tokens,
    special_tokens=special_tokens,
)
# Tag vocabulary is built from the train tags only.
tag2idx, idx2tag = build_dict(
    tokens_or_tags=train_tags,
    special_tokens=special_tags,
)

In [34]:
tag2idx

defaultdict(<function __main__.build_dict.<locals>.<lambda>()>,
            {'O': 0,
             'B-musicartist': 1,
             'I-musicartist': 2,
             'B-product': 3,
             'I-product': 4,
             'B-company': 5,
             'B-person': 6,
             'B-other': 7,
             'I-other': 8,
             'B-facility': 9,
             'I-facility': 10,
             'B-sportsteam': 11,
             'B-geo-loc': 12,
             'I-geo-loc': 13,
             'I-company': 14,
             'I-person': 15,
             'B-movie': 16,
             'I-movie': 17,
             'B-tvshow': 18,
             'I-tvshow': 19,
             'I-sportsteam': 20})

We have 20 named-entity tags (the B- and I- variants of 10 entity types) along with a non-entity tag denoted by O

Function to create the mapping between tokens and IDs for a sentence

In [35]:
def words2idxs(tokens_list):
    """Map a list of tokens to their indices in ``token2idx``.

    Tokens that are not in the vocabulary map to the index of '<UNK>'.
    The lookup goes through ``dict.get`` because indexing a defaultdict
    with a missing key inserts that key, which would silently grow the
    vocabulary with every out-of-vocabulary token seen.

    Args:
        tokens_list: List of tokens of one tweet.

    Returns:
        List of integer token indices, one per input token.
    """
    unk_idx = token2idx['<UNK>']
    return [token2idx.get(word, unk_idx) for word in tokens_list]

def tags2idxs(tags_list):
    """Map a list of tags to their indices in ``tag2idx``.

    Tags that are not in the mapping fall back to the index of 'O'.
    The lookup goes through ``dict.get`` because indexing a defaultdict
    with a missing key inserts that key, which would silently add
    unknown tags to the mapping.

    Args:
        tags_list: List of tags of one tweet.

    Returns:
        List of integer tag indices, one per input tag.
    """
    outside_idx = tag2idx['O']
    return [tag2idx.get(tag, outside_idx) for tag in tags_list]

def idxs2words(idxs):
    """Translate token indices back into tokens using ``idx2token``."""
    return list(map(idx2token.__getitem__, idxs))

def idxs2tags(idxs):
    """Translate tag indices back into tags using ``idx2tag``."""
    return list(map(idx2tag.__getitem__, idxs))

## Generating Batches

In [36]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Yield padded batches of token indices and tag indices.

    Args:
        batch_size: Number of samples per batch.
        tokens: List of tweets, each a list of tokens.
        tags: List of tag sequences aligned with ``tokens``.
        shuffle: When true, samples are visited in a random permutation;
            otherwise in their original order.
        allow_smaller_last_batch: When true, leftover samples that do not
            fill a whole batch are yielded as a final, smaller batch.

    Yields:
        Tuples ``(x, y, lengths)``: ``x`` and ``y`` are int32 arrays of
        shape [current_batch_size, longest sequence in the batch], padded
        with ``token2idx['<PAD>']`` and ``tag2idx['O']`` respectively;
        ``lengths`` is an int32 array with the unpadded length of each row.
    """
    total = len(tokens)
    order = np.random.permutation(total) if shuffle else np.arange(total)

    n_batches, leftover = divmod(total, batch_size)
    if allow_smaller_last_batch and leftover:
        n_batches += 1

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        stop = min(start + batch_size, total)
        rows = stop - start
        picked = order[start:stop]

        # Convert each selected sample to (token ids, tag ids).
        encoded = [(words2idxs(tokens[i]), tags2idxs(tags[i])) for i in picked]
        width = max((len(tags[i]) for i in picked), default=0)

        # Start from arrays filled with the padding indices, then overwrite
        # the leading part of each row with the real sequence.
        x = np.full((rows, width), token2idx['<PAD>'], dtype=np.int32)
        y = np.full((rows, width), tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(rows, dtype=np.int32)
        for row, (token_ids, tag_ids) in enumerate(encoded):
            seq_len = len(token_ids)
            x[row, :seq_len] = token_ids
            lengths[row] = seq_len
            y[row, :seq_len] = tag_ids
        yield x, y, lengths

## Building a Bidirectional RNN with Tensorflow

We need both right and left context of a token. Hence I am using Bidirectional RNN.

In [37]:
import tensorflow as tf
import numpy as np

class BiLSTMModel:
    """Bidirectional LSTM model; the class body is intentionally empty here."""

### Creating the Placeholders

Placeholders are created for the following data that we need to feed into the RNN:

input_batch — sequences of words (the shape equals to [batch_size, sequence_len]);

ground_truth_tags — sequences of tags (the shape equals to [batch_size, sequence_len]);

lengths — lengths of not padded sequences (the shape equals to [batch_size]);

dropout_ph — dropout keep probability; this placeholder has a predefined value 1;

learning_rate_ph — learning rate; we need this placeholder because we want to change the value during training.


In [38]:
# Function that declares the placeholders to be fed into the model
def declare_placeholders(self) -> None:
    """Create the model's input placeholders and store them on ``self``.

    Attributes set on ``self``:
        input_batch: int32 placeholder of shape [None, None].
        ground_truth_tags: int32 placeholder of shape [None, None].
        lengths: int32 placeholder of shape [None].
        dropout_ph: float32 scalar placeholder that evaluates to 1.0 when
            no value is fed.
        learning_rate_ph: float32 scalar placeholder.
    """
    # Placeholders for input and ground truth output.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') 
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags')

    # Placeholder for lengths of the sequences.
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths') 

    # Placeholder for a dropout keep probability; defaults to 1.0 when not fed.
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])

    # Placeholder for a learning rate (tf.float32).
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_placeholder')

In [None]:
# add the declare pllaceholder function t