<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/POS_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
# Fetch the 'treebank' corpus package; download_and_read() below reads it
# through nltk.corpus.treebank when the cached text files are missing.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
import os 
import tensorflow as tf
import numpy as np

In [3]:
# Output directory; the training cell joins the best-model checkpoint
# filename onto this path.
DATA_DIR = './data'

def _read_stripped_lines(filename, limit = None):
  """Return the stripped lines of `filename`, reading at most `limit` lines."""
  lines = []
  with open(filename, 'r') as f:
    for line in f:
      # Test the cap BEFORE appending so exactly `limit` lines are returned
      # (checking afterwards would let one extra line through).
      if limit is not None and len(lines) >= limit:
        break
      lines.append(line.strip())
  return lines


def download_and_read(dataset_dir, num_pairs = None):
  """Load parallel (sentence, POS-tag) lines, creating the cache files if needed.

  Two text files are kept in `dataset_dir`: 'treebank-sentences.txt' holds one
  space-joined sentence per line and 'treebank-poss.txt' holds the matching
  space-joined tag sequence. If either file is missing, both are regenerated
  from nltk.corpus.treebank.tagged_sents().

  Args:
    dataset_dir: directory holding (or receiving) the two cache files.
    num_pairs: optional cap on the number of pairs returned; None reads all.

  Returns:
    (sents, poss): two lists of stripped lines, index-aligned by line number.
  """
  sent_filename = os.path.join(dataset_dir, 'treebank-sentences.txt')
  poss_filename = os.path.join(dataset_dir, 'treebank-poss.txt')
  if not (os.path.exists(sent_filename) and os.path.exists(poss_filename)):
    if not os.path.exists(dataset_dir):
      os.mkdir(dataset_dir)
    # Context managers guarantee both files are flushed and closed even if
    # reading the corpus raises part-way through.
    with open(sent_filename, 'w') as fsents, open(poss_filename, 'w') as fposs:
      for sent in nltk.corpus.treebank.tagged_sents():
        fsents.write(' '.join([w for w, p in sent])+'\n')
        fposs.write(' '.join([p for w, p in sent])+'\n')
  sents = _read_stripped_lines(sent_filename, num_pairs)
  poss = _read_stripped_lines(poss_filename, num_pairs)
  return sents, poss

In [4]:
# Read the cached sentence / tag files (they are generated on first use) and
# make sure both sides of the parallel corpus have the same number of lines.
sents, poss = download_and_read(dataset_dir='./datasets')
assert len(sents) == len(poss)
print('#Number of records: ', len(sents))

#Number of records:  3914


In [5]:
def tokenize_and_build_vocab(texts, vocab_size = None, lower=True):
  """Fit a Keras Tokenizer on `texts` and return its vocabulary mappings.

  Args:
    texts: the strings handed to Tokenizer.fit_on_texts.
    vocab_size: when given, the tokenizer is created with
      num_words=vocab_size+1 and an 'UNK' out-of-vocabulary token, and the
      returned vocabulary is truncated to the ids the tokenizer can emit.
      When None the full vocabulary is kept and no OOV token is set.
    lower: forwarded to the Tokenizer (lower-case the text before fitting).

  Returns:
    (word2idx, idx2word, tokenizer) where idx2word is the inverse of word2idx.
  """
  # NOTE(review): the Tokenizer's default `filters` are left in place; if they
  # strip punctuation, word and tag sequences may not stay token-aligned —
  # confirm against the data.
  if vocab_size is None:
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=lower)
  else:
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size+1, oov_token='UNK',lower=lower)
  tokenizer.fit_on_texts(texts)
  if vocab_size is not None:
    # fit_on_texts keeps every word in word_index, but with
    # num_words=vocab_size+1 only ids strictly below vocab_size+1 are emitted
    # when encoding (larger ids become the OOV id). Keep exactly ids
    # 1..vocab_size so the returned vocabulary matches what can appear in the
    # encoded sequences.
    tokenizer.word_index = {e:i for e, i in tokenizer.word_index.items() if i <= vocab_size}
  word2idx = tokenizer.word_index
  idx2word = {v:k for k,v in word2idx.items()}
  return word2idx, idx2word, tokenizer

In [6]:
# Source side: word vocabulary capped at 9000 entries (default lower-casing).
word2idx_s, idx2word_s, tokenizer_s = tokenize_and_build_vocab(
    sents,
    vocab_size=9000,
)
# Target side: POS-tag vocabulary capped at 38 entries, case preserved.
word2idx_t, idx2word_t, tokenizer_t = tokenize_and_build_vocab(
    poss,
    vocab_size=38,
    lower=False,
)

In [7]:
# Sentence lengths measured in whitespace-separated tokens, followed by a few
# upper percentiles of that distribution (used to pick a padding length).
sequence_lengths = np.array([len(sentence.split()) for sentence in sents])
print([
    (pct, np.percentile(sequence_lengths, pct))
    for pct in [75, 80, 90, 95, 99, 100]
])

[(75, 33.0), (80, 35.0), (90, 41.0), (95, 47.0), (99, 58.0), (100, 271.0)]


In [8]:
# Pad/truncate every sequence to the longest whitespace-split sentence length
# (the 100th percentile printed by the previous cell).
max_seqlen = 271

# Words -> integer ids, post-padded to max_seqlen.
sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    sents_as_ints, maxlen=max_seqlen, padding='post')

# Tags -> integer ids, post-padded the same way.
poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(
    poss_as_ints, maxlen=max_seqlen, padding='post')

# One-hot encode all tag ids in a single vectorized call. poss_as_ints is
# already a rectangular (num_sentences, max_seqlen) array, so neither a
# per-sentence loop nor a second padding pass is needed; the result has shape
# (num_sentences, max_seqlen, 39). 39 classes = tag ids 1..38 plus id 0, which
# marks the padded positions.
poss_as_catigorical = tf.keras.utils.to_categorical(
    poss_as_ints, num_classes=39, dtype='int32')

dataset = tf.data.Dataset.from_tensor_slices((sents_as_ints, poss_as_catigorical))

# Id 0 is never assigned to a real token, so label it for readable decoding.
idx2word_s[0], idx2word_t[0] = 'PAD', 'PAD'

In [9]:
# Shuffle ONCE with a fixed order. By default Dataset.shuffle reshuffles every
# time the dataset is iterated, which would re-draw the take()/skip() splits on
# every epoch and leak test/validation examples into training.
dataset = dataset.shuffle(10000, seed=42, reshuffle_each_iteration=False)
test_size = len(sents) // 5                 # 20% of all records
val_size = (len(sents)-test_size) // 10     # 10% of the remainder
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

batch_size = 64
# Only the training split is reshuffled each epoch; the split boundaries
# above stay fixed.
train_dataset = train_dataset.shuffle(10000).batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [10]:
from tensorflow.keras.layers import *

In [11]:
class POSTaggingModel(tf.keras.models.Model):
  """Sequence tagger: Embedding -> SpatialDropout1D -> GRU -> per-step softmax.

  Args:
    vocab_input_size: first argument of the Embedding layer (input vocabulary).
    vocab_output_size: number of units of the per-time-step Dense layer.
    embedding_dim: size of each embedding vector.
    time_stamps: passed to the Embedding layer as `input_length`.
    rnn_out: number of GRU units.
    **kwargs: forwarded to tf.keras.models.Model.
  """

  def __init__(self, vocab_input_size, vocab_output_size,
               embedding_dim, time_stamps, rnn_out, **kwargs):
    super().__init__(**kwargs)
    self.embedding = Embedding(
        vocab_input_size, embedding_dim, input_length=time_stamps)
    self.dropout = SpatialDropout1D(0.2)
    # return_sequences=True keeps one GRU output per time step so that every
    # token receives its own prediction.
    self.rnn = GRU(rnn_out, return_sequences=True)
    self.dense = TimeDistributed(Dense(vocab_output_size))
    self.activation = Activation('softmax')

  def call(self, x):
    """Run the layers in order and return the softmax output per time step."""
    embedded = self.embedding(x)
    dropped = self.dropout(embedded)
    hidden_states = self.rnn(dropped)
    logits = self.dense(hidden_states)
    return self.activation(logits)

In [12]:
def build_graph(vocab_input_size, vocab_output_size, embedding_dim, rnn_out):
  """Create, summarize and compile a POSTaggingModel.

  Reads the notebook-level `max_seqlen` and `batch_size` for the model's
  sequence length and build shape.

  Args:
    vocab_input_size: input vocabulary size for the embedding layer.
    vocab_output_size: number of output classes per time step.
    embedding_dim: embedding vector size.
    rnn_out: number of GRU units.

  Returns:
    The model compiled with categorical cross-entropy, the Adam optimizer, and
    two metrics: plain 'accuracy' and `masked_accuracy_fn`.
  """
  model = POSTaggingModel(vocab_input_size, vocab_output_size, embedding_dim, max_seqlen, rnn_out)
  model.build(input_shape=(batch_size, max_seqlen))
  model.summary()

  def masked_accuracy_fn(ytrue, ypred):
    """Accuracy over positions whose TRUE label is not class 0 (padding)."""
    ytrue = tf.keras.backend.argmax(ytrue, axis=-1)
    ypred = tf.keras.backend.argmax(ypred, axis=-1)

    # The mask must come from the labels, not the predictions: masking on
    # ypred would drop real tokens that the model wrongly tags as class 0
    # from the denominator and so overstate the accuracy.
    mask = tf.keras.backend.cast(
        tf.keras.backend.not_equal(ytrue, 0), tf.int32)
    matches = tf.keras.backend.cast(
        tf.keras.backend.equal(ytrue, ypred), tf.int32) * mask
    numer = tf.keras.backend.sum(matches)
    # max(..., 1) avoids dividing by zero for a batch with no unmasked position.
    denom = tf.keras.backend.maximum(tf.keras.backend.sum(mask), 1)
    accuracy =  numer / denom
    return accuracy

  model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy', masked_accuracy_fn])
  return model

In [13]:
# All training artifacts (best checkpoint, TensorBoard logs, final weights)
# are written under DATA_DIR.
if not os.path.exists(DATA_DIR):
  os.mkdir(DATA_DIR)
with tf.device('gpu:0'):
  num_epochs = 120
  best_model_file = os.path.join(DATA_DIR, 'POS_Tagging_Model.h5')
  # save_best_only compares the monitored validation loss, which is only
  # computed at the end of an epoch; an integer (batch-count) save_freq fires
  # mid-epoch where that value is unavailable, so check once per epoch.
  checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file,
                                                  save_weights_only = True,
                                                  save_best_only = True,
                                                  save_freq='epoch')
  # Keep the logs next to the other outputs instead of the absolute
  # filesystem-root path '/data/tensorboard'.
  tensorboard = tf.keras.callbacks.TensorBoard(
      log_dir=os.path.join(DATA_DIR, 'tensorboard'))
  # Arguments: input vocab size 9001, 39 output classes, embedding size 128,
  # 256 GRU units.
  model = build_graph(9001,39,128,256)
  history = model.fit(train_dataset,epochs=num_epochs,
                      validation_data=val_dataset,
                      callbacks=[checkpoint,tensorboard])
  model.save_weights(os.path.join(DATA_DIR, 'final_pos_tagging_model.h5'))

Model: "pos_tagging_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1152128   
_________________________________________________________________
spatial_dropout1d (SpatialDr multiple                  0         
_________________________________________________________________
gru (GRU)                    multiple                  296448    
_________________________________________________________________
time_distributed (TimeDistri multiple                  10023     
_________________________________________________________________
activation (Activation)      multiple                  0         
Total params: 1,458,599
Trainable params: 1,458,599
Non-trainable params: 0
_________________________________________________________________
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/1