In [1]:
import numpy as np
import numpy.random as random

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Softmax, Dropout, SimpleRNN, Embedding, TimeDistributed

In [2]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100
UNK_TOKEN = '__unk__'

# Load Data
In this exercise, we will be doing Part-of-Speech tag prediction for a sequence of words

In [3]:
POS_TAGS = {
	'NOTAG': 0,
	'#': 1,
	'$': 2,
	'&apos;&apos;': 3,
	',': 4,
	'-RRB-': 5,
	'.': 6,
	':': 7,
	'CC': 8,
	'CD': 9,
	'DT': 10,
	'EX': 11,
	'FW': 12,
	'IN': 13,
	'JJ': 14,
	'JJR': 15,
	'JJS': 16,
	'LS': 17,
	'MD': 18,
	'NN': 19,
	'NNP': 20,
	'NNPS': 21,
	'NNS': 22,
	'PDT': 23,
	'POS': 24,
	'PRP': 25,
	'PRP$': 26,
	'RB': 27,
	'RBR': 28,
	'RBS': 29,
	'RP': 30,
	'TO': 31,
	'UH': 32,
	'VB': 33,
	'VBD': 34,
	'VBG': 35,
	'VBN': 36,
	'VBP': 37,
	'VBZ': 38,
	'WDT': 39,
	'WP': 40,
	'WP$': 41,
	'WRB': 42,
	'``': 43
} 

text = []
labels = []
with open('data/text.en.txt', encoding='utf-8') as fp:
    for line in fp:
        text.append(line.strip().split(' '))

with open('data/labels.en.txt') as fp:
    for line in fp:
        labels.append([POS_TAGS[p] for p in line.strip().split(' ')])
        
assert(len(text) == len(labels))
for d, l in zip(text, labels):
    assert(len(d) == len(l))
    
data = [(d, l) for d,l in zip(text, labels)]

print("Loaded %d samples"%(len(data)))

Loaded 209772 samples


In [4]:
print(data[5]) # contains the tokens followed by their corresponding POS tags

(['&lt;', 'description', '&gt;', 'TED', 'Talk', 'Subtitles', 'and', 'Transcript', ':', 'With', 'vibrant', 'video', 'clips', 'captured', 'by', 'submarines', ',', 'David', 'Gallo', 'takes', 'us', 'to', 'some', 'of', 'Earth', '&apos;s', 'darkest', ',', 'most', 'violent', ',', 'toxic', 'and', 'beautiful', 'habitats', ',', 'the', 'valleys', 'and', 'volcanic', 'ridges', 'of', 'the', 'oceans', '&apos;', 'depths', ',', 'where', 'life', 'is', 'bizarre', ',', 'resilient', 'and', 'shockingly', 'abundant', '.', '&lt;', '/', 'description', '&gt;'], [10, 19, 13, 36, 19, 22, 8, 19, 7, 13, 14, 19, 22, 36, 13, 22, 4, 20, 20, 38, 25, 31, 10, 13, 20, 24, 16, 4, 29, 14, 4, 14, 8, 14, 22, 4, 10, 22, 8, 14, 22, 13, 10, 22, 24, 22, 4, 42, 19, 38, 14, 4, 14, 8, 27, 14, 6, 3, 20, 19, 6])


In [5]:
# Send random seed for reproducible results
random.seed(5)
random.shuffle(data)

total_instances = len(data)
num_train_instances = int(0.7 * total_instances)
num_dev_instances = int(0.1 * total_instances)
num_test_instances = int(0.2 * total_instances)

train = data[:num_train_instances]
dev = data[num_train_instances:num_train_instances + num_dev_instances]
test = data[num_train_instances + num_dev_instances:num_train_instances + num_dev_instances + num_test_instances]

print("Train Instances: %d"%(len(train)))
print("Dev Instances: %d"%(len(dev)))
print("Test Instances: %d"%(len(test)))

Train Instances: 146840
Dev Instances: 20977
Test Instances: 41954


In [6]:
train_data = [d for d,_ in train]
train_labels = [l for _,l in train]

dev_data = [d for d,_ in dev]
dev_labels = [l for _,l in dev]

test_data = [d for d,_ in test]
test_labels = [l for _,l in test]

In [7]:
# Prepare vocabulary
full_vocab = dict()
for instance in train_data:
    for token in instance:
        full_vocab[token] = 1 + full_vocab.get(token, 0)

# Sort vocabulary by occurence
sorted_vocab = sorted(full_vocab.keys(), key=lambda word: -full_vocab[word])

# Print some samples
print("Vocabulary size: %d"%(len(sorted_vocab)))
print("Most frequent tokens")
for i in range(10):
    print("\t%s: %d"%(sorted_vocab[i], full_vocab[sorted_vocab[i]]))
print("Least frequent tokens")
for i in range(1,11):
    print("\t%s: %d"%(sorted_vocab[-i], full_vocab[sorted_vocab[-i]]))

# We can choose to limit the vocab_size here to only a portion of the original vocab,
# i.e. ignore infrequent tokens to save on memory
vocab_size = VOCAB_SIZE
    
# Create final vocab
word2idx = {w: idx for idx, w in enumerate(sorted_vocab[:vocab_size])}
idx2word = {idx: w for idx, w in enumerate(sorted_vocab[:vocab_size])}


word2idx[UNK_TOKEN] = vocab_size
idx2word[vocab_size] = UNK_TOKEN
vocab_size = vocab_size + 1

Vocabulary size: 54562
Most frequent tokens
	,: 173469
	.: 138735
	the: 109915
	to: 68599
	of: 64398
	and: 59512
	a: 57597
	that: 48974
	I: 44784
	in: 40624
Least frequent tokens
	incapacitates: 1
	bankruptcies: 1
	IPOs: 1
	es: 1
	Dar: 1
	Separate: 1
	squashed: 1
	raking: 1
	Heroin: 1
	Anticipation: 1


## Filter text based on vocabulary
We will now have to replace words we do not have in the vocabulary with a special token, `__unk__` in this case

In [8]:
train_data = [[t if t in word2idx else UNK_TOKEN for t in instance] for instance in train_data]
dev_data = [[t if t in word2idx else UNK_TOKEN for t in instance] for instance in dev_data]
test_data = [[t if t in word2idx else UNK_TOKEN for t in instance] for instance in test_data]

print("Number of tokens filtered out as unknown:")
print("Train: %d/%d"%(len([1 for instance in train_data for t in instance if t == UNK_TOKEN]), sum([len(i) for i in train_data])))
print("Dev: %d/%d"%(len([1 for instance in dev_data for t in instance if t == UNK_TOKEN]), sum([len(i) for i in dev_data])))
print("Test: %d/%d"%(len([1 for instance in test_data for t in instance if t == UNK_TOKEN]), sum([len(i) for i in test_data])))

Number of tokens filtered out as unknown:
Train: 120360/2988546
Dev: 18232/426127
Test: 36205/854125


# Prepare data in tensor form
Our keras models finally take tensors as input and labels, so we need to modify our data to fit this form

In [9]:
## data_to_tensor
# Given a list of instances, where each instance is a list of tokens,
# this function does the following:
# 1: Replace each token with its corresponding index
# 2: Pad sequences to MAX_SEQUENCE_LENGTH (or truncate them if longer)
#       Padding is done with a unique element, in this case `vocab_size`
#       The network will learn that this unique element is padding and does not
#        mean anything semantically
# 3: Package everything nicely as a NUM_INSTANCES x MAX_SEQUENCE_LENGTH matrix
def data_to_tensor(data, pad_value=vocab_size):
    # First convert from words to indices
    idx_data = [[word2idx[t] for t in instance] for instance in data]
    
    # Create numpy representation
    return pad_sequences([np.array(d) for d in idx_data], maxlen=MAX_SEQUENCE_LENGTH, value=pad_value)

X_train = data_to_tensor(train_data)
y_train = to_categorical(pad_sequences(train_labels, maxlen=MAX_SEQUENCE_LENGTH, value=POS_TAGS['NOTAG']))

X_dev = data_to_tensor(dev_data)
y_dev = to_categorical(pad_sequences(dev_labels, maxlen=MAX_SEQUENCE_LENGTH, value=POS_TAGS['NOTAG']))

X_test = data_to_tensor(test_data)
y_test = to_categorical(pad_sequences(test_labels, maxlen=MAX_SEQUENCE_LENGTH, value=POS_TAGS['NOTAG']))

vocab_size = vocab_size + 1 # Add 1 for the padding token

In [10]:
print(X_train.shape)
print(y_train.shape)

(146840, 100)
(146840, 100, 44)


# Model Definition

In [11]:
model = Sequential()
model.add(Embedding(output_dim=25, input_dim=vocab_size, input_length=MAX_SEQUENCE_LENGTH))
model.add(SimpleRNN(30, return_sequences=True)) # Return output at every timestep
# Output of Simple RNN is of size 100x30
# We can't use a dense layer after this since that would take only a single output (summary or average)
model.add(TimeDistributed(Dense(len(POS_TAGS)))) # Apply dense layer for each timestep
# Output of TimeDistributed layer is 100x40 (40 being the # of POS tags)
model.add(TimeDistributed(Softmax()))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 25)           250050    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100, 30)           1680      
_________________________________________________________________
time_distributed (TimeDistri (None, 100, 44)           1364      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 44)           0         
Total params: 253,094
Trainable params: 253,094
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(X_train, y_train, epochs=10, validation_data=(X_dev, y_dev))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c107d8e448>

In [15]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Set Accuracy: %0.2f%%"%(test_acc*100))

Test Set Accuracy: 98.74%
