# Part-of-Speech (POS) Tagging

In [1]:
from collections import Counter, defaultdict
import nltk
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
nltk_data = os.getcwd()
nltk.data.path.append(nltk_data)
nltk.download("brown", download_dir=nltk_data)
nltk.download("universal_tagset", download_dir=nltk_data)

data = nltk.corpus.brown.tagged_sents(tagset="universal")
data = np.array([ [(word.lower(), tag) for word, tag in sentence] for sentence in data ])

EOS_TOK = "#EOS#"
UNK_TOK = "#UNK#"

all_tags = [EOS_TOK, UNK_TOK, "ADV", "NOUN", "ADP", "PRON", "DET", ".", "PRT", "VERB", "X", "NUM", "CONJ", "ADJ"]
TAG_PAD = 0
UNK_TAG = 1
n_tags = len(all_tags)

print("\nTotal number of sentences:", len(data))

[nltk_data] Downloading package brown to /home/dimitry/arch/jupyter...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/dimitry/arch/jupyter...
[nltk_data]   Package universal_tagset is already up-to-date!

Total number of sentences: 57340


In [3]:
train_data, test_data = train_test_split(data, test_size=0.25)

word_counts = Counter()
for sentence in data:
    words, tags = zip(*sentence)
    word_counts.update(words)

all_words = [EOS_TOK, UNK_TOK] + list(list(zip(*word_counts.most_common(30000)))[0])
WORD_PAD = 0
UNK_WORD = 1
n_words = len(all_words)

print("Coverage =", float(sum(word_counts[w] for w in all_words)) / sum(word_counts.values()))

Coverage = 0.9829356385507306


In [4]:
word_to_id = defaultdict(lambda: UNK_WORD, { word: i for i, word in enumerate(all_words) })
tag_to_id = { tag: i for i, tag in enumerate(all_tags) }

def to_dataset(data, batch_size):
    words, tags = zip(*[ zip(*sent) for sent in data ])

    # convert words and tags to ids
    words = [[ word_to_id[word] for word in words ] for words in words]
    tags = [[ tag_to_id[tag] for tag in tags ] for tags in tags]

    # create Dataset of varying-length sequences using RaggedTensors
    dataset = tf.data.Dataset.from_tensor_slices(( tf.ragged.constant(words), tf.ragged.constant(tags) ))
    # convert RaggedTensors to regular Tensors (needed for padded_batch() below)
    dataset = dataset.map(lambda x, y: (x, y))
    # shuffle entire dataset
    dataset = dataset.shuffle(len(words))
    # create padded batches of same length
    dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None]), padding_values=(WORD_PAD, TAG_PAD))
    # convert tags to 1-hot encoded values
    dataset = dataset.apply(lambda ds: ds.map( lambda x, y: (x, tf.one_hot(y, n_tags)) ))
    
    return dataset

In [5]:
BATCH_SIZE=64

train_set = to_dataset(train_data, batch_size=BATCH_SIZE)
test_set = to_dataset(test_data, batch_size=BATCH_SIZE)

In [6]:
def masked_categorical_crossentropy(y_true, y_pred):
    k = y_pred.shape[-1]
    y_true = tf.reshape(y_true, shape=(-1, k))
    y_pred = tf.reshape(y_pred, shape=(-1, k))
    
    mask = y_true[:, TAG_PAD] != 1
    y_true = tf.boolean_mask(y_true, mask)
    y_pred = tf.boolean_mask(y_pred, mask)
    
    return keras.losses.categorical_crossentropy(y_true, y_pred)

def masked_categorical_accuracy(y_true, y_pred):
    k = y_pred.shape[-1]
    y_true = tf.reshape(y_true, shape=(-1, k))
    y_pred = tf.reshape(y_pred, shape=(-1, k))
    
    mask = y_true[:, TAG_PAD] != 1
    y_true = tf.boolean_mask(y_true, mask)
    y_pred = tf.boolean_mask(y_pred, mask)

    return keras.metrics.categorical_accuracy(y_true, y_pred)

In [7]:
keras.backend.clear_session()

In [8]:
input = layers.Input(shape=(None,))
inter_0 = input

inter_0 = layers.Embedding(n_words, 128)(inter_0)
inter_0 = layers.Dropout(.7)(inter_0)

filters = 256
dropout = .3
inter_1 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate= 1, activation="relu")(inter_0)
inter_1 = layers.Dropout(dropout)(inter_1)
inter_2 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate= 2, activation="relu")(inter_1)
inter_2 = layers.Dropout(dropout)(inter_2)
inter_3 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate= 4, activation="relu")(inter_2)
inter_3 = layers.Dropout(dropout)(inter_3)
inter_4 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate= 8, activation="relu")(inter_3)
inter_4 = layers.Dropout(dropout)(inter_4)
inter_5 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate=16, activation="relu")(inter_4)
inter_5 = layers.Dropout(dropout)(inter_5)
inter_6 = layers.Conv1D (filters, kernel_size=2, padding="same", dilation_rate=32, activation="relu")(inter_5)
inter_6 = layers.Dropout(dropout)(inter_6)

inter_0 = layers.Concatenate()([inter_0, inter_1, inter_2, inter_3, inter_4, inter_5])
inter_0 = layers.Bidirectional(layers.LSTM(256, return_sequences=True, dropout=.3))(inter_0)
inter_0 = layers.TimeDistributed(layers.Dense(n_tags, activation="softmax"))(inter_0)

output = inter_0
model = keras.models.Model(input, output,
    name="model_emb30k128_d7_6xconv1d256k2di_6xd3_lstm256d3_b64"
)

model.compile(optimizer="adam",
    loss=masked_categorical_crossentropy,
    metrics=[masked_categorical_accuracy]
)
model.summary()

Model: "model_emb30k128_d7_6xconv1d256k2di_6xd3_lstm256d3_b64"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    3840256     input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 128)    0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, None, 256)    65792       dropout[0][0]                    
______________________________________________

In [9]:
hist = model.fit(train_set,
    epochs=20,
    validation_data=test_set,
    verbose=1,
    callbacks=[keras.callbacks.TensorBoard(log_dir="logs/" + model.name, profile_batch=0)],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


![tensorboard](tensorboard.png)