# <u>Part of Speech Tagger</u>
This notebook contains a model for tagging the parts of speech (POS) of an English sentence. There are many possible POS tags; the model converts each sentence into a sequence of the following tags:<br>
    
**ADJ - Adjective<br>
ADP - Adposition<br>
ADV - Adverb<br>
PRT - Particle<br>
PRON - Pronoun<br>
. - Punctuation marks<br>
X - Other<br>
VERB - Verb<br>
CONJ - Conjunction<br>
DET - Determiner / Article<br>
NOUN - Noun<br>
NUM - Numeral**

In [0]:
import nltk
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [7]:
# Fetch the Brown corpus and the universal tagset mapping through nltk.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Label inventory: the two special entries come first, followed by the
# part-of-speech tags of the universal tagset.
all_tags = ['<EOS>', '<UNK>'] + [
    'ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.',
    'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ',
]

# Tagged sentences of the Brown corpus, expressed in the universal tagset.
data = nltk.corpus.brown.tagged_sents(tagset='universal')
print(len(data))

[nltk_data] Downloading package brown to /content/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /content/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
57340


In [8]:
# Display the tagged sentences returned by the corpus reader.
data

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [9]:
# Lower-case every word while keeping its tag, and materialise the corpus
# into plain nested lists of (word, tag) pairs - one inner list per sentence.
data = [
    [(token.lower(), pos) for token, pos in tagged_sentence]
    for tagged_sentence in data
]
data[1]

[('the', 'DET'),
 ('jury', 'NOUN'),
 ('further', 'ADV'),
 ('said', 'VERB'),
 ('in', 'ADP'),
 ('term-end', 'NOUN'),
 ('presentments', 'NOUN'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('city', 'NOUN'),
 ('executive', 'ADJ'),
 ('committee', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('had', 'VERB'),
 ('over-all', 'ADJ'),
 ('charge', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('election', 'NOUN'),
 (',', '.'),
 ('``', '.'),
 ('deserves', 'VERB'),
 ('the', 'DET'),
 ('praise', 'NOUN'),
 ('and', 'CONJ'),
 ('thanks', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('city', 'NOUN'),
 ('of', 'ADP'),
 ('atlanta', 'NOUN'),
 ("''", '.'),
 ('for', 'ADP'),
 ('the', 'DET'),
 ('manner', 'NOUN'),
 ('in', 'ADP'),
 ('which', 'DET'),
 ('the', 'DET'),
 ('election', 'NOUN'),
 ('was', 'VERB'),
 ('conducted', 'VERB'),
 ('.', '.')]

In [30]:
# Hold out 20% of the sentences for evaluation. A fixed random_state makes the
# split - and with it the vocabulary and the reported accuracy - reproducible
# across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.20, random_state=42)
print(len(train_data))
print(len(test_data))

45872
11468


### Building Vocabulary Mappings
We will now build the vocabulary from the training data, together with the mappings from words to indices and vice versa.

In [0]:
from collections import Counter, defaultdict

#### Create Vocabulary Dictionary

In [31]:
# Count how often each word occurs in the training sentences.
word_counts = Counter()
for sentence in train_data:
    # Iterate over the (word, tag) pairs directly: unpacking zip(*sentence)
    # raises for a sentence without tokens.
    word_counts.update(word for word, _ in sentence)

# Keep only the 11000 most frequent words for our dictionary. Building the
# list with a comprehension also works when no word was counted at all.
top_words = [word for word, _ in word_counts.most_common(11000)]
# The two special entries occupy indices 0 (<EOS>) and 1 (<UNK>).
vocab = ['<EOS>', '<UNK>'] + top_words
# Show the size and a short preview instead of dumping every entry.
print(len(vocab))
print(vocab[:20])



#### Create vocabulary mappings

In [32]:
# Word -> index mapping. Every unknown word yields index 1, which is <UNK>.
# Note that a defaultdict also stores each missing key it is asked for.
# (No progress bar: building an in-memory dict of this size is instantaneous.)
word_to_idx = defaultdict(lambda: 1, {word: idx for idx, word in enumerate(vocab)})
# Index -> word mapping, built from vocab itself rather than from word_to_idx.
# Unknown words inserted by lookups all share index 1, so inverting
# word_to_idx after any lookup would overwrite the <UNK> entry.
idx_to_word = dict(enumerate(vocab))

11002it [00:00, 705818.88it/s]


#### Create tag mappings

In [33]:
# Tag -> index mapping: each tag is numbered by its position in all_tags.
tag_to_idx = dict((tag, idx) for idx, tag in tqdm(enumerate(all_tags)))
# Index -> tag mapping, obtained by swapping keys and values.
idx_to_tag = dict(zip(tag_to_idx.values(), tag_to_idx.keys()))

14it [00:00, 14884.73it/s]


### Prepare data for Keras model
We will convert each word to its numerical index and feed those indices to the model.

In [0]:
# converts the tokens to its numerical representation
# output: (m, max sequence length)
def convert_to_num(sentences, token_to_idx, pad=0, dtype='int32', time_major=False):
    """Convert token sequences into a padded matrix of integer ids.

    Args:
        sentences: sized collection of token sequences (one per sentence).
        token_to_idx: mapping looked up as ``token_to_idx[token]``.
        pad: value written to the positions past the end of a sentence.
        dtype: numpy dtype of the returned matrix.
        time_major: if True, return the transposed matrix.

    Returns:
        Array of shape ``(len(sentences), longest sentence length)``, or its
        transpose when ``time_major`` is True. An empty ``sentences`` yields
        an array of shape ``(0, 0)``.

    Raises:
        KeyError: if a token is missing from a plain-dict ``token_to_idx``.
    """
    # default=0 keeps max() from raising ValueError on an empty batch.
    max_sent_len = max(map(len, sentences), default=0)
    # Allocate the matrix already filled with the padding value.
    mat = np.full((len(sentences), max_sent_len), pad, dtype=dtype)

    # Overwrite the leading part of each row with the sentence's ids; the
    # remainder of the row keeps the padding value.
    for i, sentence in enumerate(sentences):
        num_row = [token_to_idx[token] for token in sentence]
        mat[i, :len(num_row)] = num_row

    return mat.T if time_major else mat

In [0]:
# Split two training sentences (indices 1 and 2) into parallel word and tag sequences for a quick check.
words_batch, tags_batch = zip(*[zip(*sentence) for sentence in train_data[1:3]])

In [17]:
# Word ids for the sample batch: words missing from word_to_idx come out as 1, padding as 0.
print(convert_to_num(words_batch, word_to_idx))

[[  73    1   42 2591 2929    1    3   78    1    1    3    6    8  120
  3122    1    4    0]
 [  10   12 5359 3033    3   59 9158   19  178 4547   14    2 6048   24
  3340 2744    1    4]]


In [22]:
# Tag ids for the same sample batch (padding positions are 0 by default).
print(convert_to_num(tags_batch,tag_to_idx))

[[13  3  9  2 13  3  7  6  9  3  7 12  6 13 13  3  7  0]
 [ 6  9  3  3  7  5  9  4  3  3  4  6  3  4  3  3  3  7]]


## <u>Model</u>
We will use a Keras model for this.

In [0]:
import keras
import keras.layers as L
import sys
from keras.utils.np_utils import to_categorical
from keras.callbacks import LambdaCallback
import tensorflow as tf

We will use a generator to feed the model small batches at a time.

In [0]:
# for generating the batches
def generate_model_batches(sentences, batch_size=32, pad=0):
    """Endlessly yield shuffled training batches.

    Args:
        sentences: indexable, sized collection of sentences, each a non-empty
            sequence of (word, tag) pairs.
        batch_size: maximum number of sentences per batch; the last batch of
            each pass may be smaller.
        pad: id written to the positions past the end of a sentence, for both
            the word matrix and the tag matrix. Because padded tag positions
            are one-hot encoded too, it must be a valid tag index. The
            accuracy computation in this notebook masks on 0, so keep the
            default unless that is changed as well.

    Yields:
        Tuples ``(batch_words_num, batch_tags_ohe)``: the padded word-id
        matrix and the one-hot encoding of the padded tag-id matrix.

    Raises:
        ValueError: on the first ``next()`` if ``sentences`` is empty (the
            loop below would otherwise spin forever without yielding).
    """
    if len(sentences) == 0:
        raise ValueError('sentences must not be empty')

    # indices of all training examples
    m = np.arange(len(sentences))

    while True:
        # get a freshly shuffled index list for this pass over the data
        idx = np.random.permutation(m)

        # Step over the whole index range. Stopping at len(idx) - 1 silently
        # dropped the last sentence whenever len(idx) % batch_size == 1.
        for start in range(0, len(idx), batch_size):
            batch_idx = idx[start:start + batch_size]
            batch_words, batch_tags = [], []

            # split each selected sentence into its words and its tags
            for index in batch_idx:
                words, tags = zip(*sentences[index])
                batch_words.append(words)
                batch_tags.append(tags)

            # input x: padded word ids; labels: padded tag ids
            batch_words_num = convert_to_num(batch_words, word_to_idx, pad=pad)
            batch_tags_num = convert_to_num(batch_tags, tag_to_idx, pad=pad)

            # output labels, one-hot encoded over the tag set
            batch_tags_ohe = to_categorical(batch_tags_num, len(all_tags))
            yield batch_words_num, batch_tags_ohe

In [0]:
# for computing accuracy
def compute_accuracy(model):
    """Return the per-token tagging accuracy of `model` on `test_data`.

    Positions whose word id is 0 are treated as padding and excluded from
    both the numerator and the denominator.
    """
    # Separate every test sentence into its word sequence and tag sequence.
    word_seqs, tag_seqs = zip(*(zip(*pairs) for pairs in test_data))
    x_ids = convert_to_num(word_seqs, word_to_idx)
    y_ids = convert_to_num(tag_seqs, tag_to_idx)

    # Most probable tag index at every position.
    probabilities = model.predict(x_ids, batch_size=128, verbose=1)
    y_hat = probabilities.argmax(axis=-1)

    # Count correct predictions over the non-padding positions only.
    is_token = x_ids != 0
    hits = np.logical_and(is_token, y_hat == y_ids)
    return float(np.sum(hits)) / np.sum(is_token)
                

In [0]:
# for computing accuracy at the end of epoch
def on_epoch_end(epoch, logs):
    """Print the accuracy of the global `model` on the held-out data.

    Both arguments are accepted but not used. stdout is flushed before and
    after printing.
    """
    sys.stdout.flush()
    accuracy_pct = compute_accuracy(model) * 100
    print('\nValidation Accuracy: ' + str(accuracy_pct) + ' %')
    sys.stdout.flush()

In [0]:
# Wrap on_epoch_end in a Keras callback; it is registered as the callback's on_epoch_end hook.
acc_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [54]:
# Output layer: a softmax over the tag set applied independently at every
# time step. Kept in its own variable, as before.
stepwise_dense = L.TimeDistributed(L.Dense(len(all_tags), activation='softmax'))

# Embedding -> bidirectional LSTM -> dropout -> batch norm -> per-step softmax.
model = keras.models.Sequential([
    # variable-length sequences of integer word ids
    L.InputLayer([None], dtype='int32'),
    # 50-dimensional embedding for each vocabulary entry
    L.Embedding(len(vocab), 50),
    # return_sequences=True keeps one output vector per token
    L.Bidirectional(L.LSTM(64, return_sequences=True, activation='tanh')),
    L.Dropout(0.35),
    L.BatchNormalization(),
    stepwise_dense,
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_9 (Embedding)      (None, None, 50)          550100    
_________________________________________________________________
bidirectional_7 (Bidirection (None, None, 128)         58880     
_________________________________________________________________
dropout_7 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
batch_normalization_7 (Batch (None, None, 128)         512       
_________________________________________________________________
time_distributed_9 (TimeDist (None, None, 14)          1806      
Total params: 611,298
Trainable params: 611,042
Non-trainable params: 256
_________________________________________________________________


In [0]:
# Adam optimizer with gradient clipping enabled (clipvalue=1.5).
adam = keras.optimizers.Adam(clipvalue=1.5)
# Track accuracy during training so the returned history contains an 'acc'
# series next to 'loss'. Unlike compute_accuracy, this metric is averaged
# over every position of the padded batch.
model.compile(adam, loss='categorical_crossentropy', metrics=['acc'])

batch_size = 256
# steps_per_epoch must be a whole number of batches; round up so the final,
# smaller batch of each pass over the data is included.
steps_per_epoch = int(np.ceil(len(train_data) / batch_size))

with tf.device('/gpu:0'):
  hist = model.fit_generator(generate_model_batches(train_data, batch_size=batch_size),
                             steps_per_epoch=steps_per_epoch,
                             callbacks=[acc_callback], epochs=10)

Epoch 1/10

Validation Accuracy: 93.15848757135909 %
Epoch 2/10


Validation Accuracy: 94.89670532217188 %
Epoch 3/10

In [0]:
# Save the weights on their own, then the complete model.
# NOTE(review): both paths point into the `drive` directory that the
# google-drive-ocamlfuse cell mounts; that cell appears further down in this
# notebook, so it has to be run before this one.
model.save_weights('drive/Colab Notebooks/weights.h5')
model.save('drive/Colab Notebooks/model.h5')

## Plots

In [0]:
import matplotlib.pyplot as plt

In [0]:
# The per-epoch values live on the lowercase `history` attribute (a dict) of
# the History object returned by training; `hist.History` does not exist.
tr_loss = hist.history['loss']
# 'acc' is only present when the model was compiled with an accuracy metric;
# .get() yields None instead of raising KeyError when it was not recorded.
tr_acc = hist.history.get('acc')



In [2]:
# Install google-drive-ocamlfuse and fuse from the alessandro-strada PPA.
# NOTE(review): `2>&1 > /dev/null` discards stdout only - stderr still reaches
# the cell output. If both streams were meant to be silenced the order would
# have to be `> /dev/null 2>&1`; confirm the intent.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()
# Fetch the application-default credentials; their client id and secret are
# handed to google-drive-ocamlfuse below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless with no stdin and keep only the output line that
# contains the authorisation URL.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it.
vcode = getpass.getpass()
# Second pass: pipe the verification code into google-drive-ocamlfuse.
# NOTE(review): vcode and the client secret are interpolated into shell
# command lines here; treat this like os.system with untrusted input.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [0]:
# Create the mount point (no error if it already exists) and mount Google Drive on it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [4]:
# Install tqdm into the runtime; the import cell at the top of the notebook needs it.
!pip install tqdm

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/93/24/6ab1df969db228aed36a648a8959d1027099ce45fad67532b9673d533318/tqdm-4.23.4-py2.py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 2.1MB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.23.4


In [5]:
# List the current working directory.
!ls

datalab  drive
