In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk

In [15]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to D:\Program
[nltk_data]     Files\Anaconda3\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [16]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to D:\Program
[nltk_data]     Files\Anaconda3\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [17]:
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')
print("Number of Tagged Sentences ", len(tagged_sentence))

Number of Tagged Sentences  3914


In [18]:
print(tagged_sentence[1])

[('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]


In [19]:
# ix = index
def word_to_ix(word, ix):
    return torch.tensor(ix[word], dtype = torch.long)

def char_to_ix(char, ix):
    return torch.tensor(ix[char], dtype = torch.long)

def tag_to_ix(tag, ix):
    return torch.tensor(ix[word], dtype = torch.long)

def sequence_to_idx(sequence, ix):
    return torch.tensor([ix[s] for s in sequence])

In [20]:
word_to_idx = {}
tag_to_idx = {}
char_to_idx = {}

for sentence in tagged_sentence:
    for word, pos_tag in sentence:
        if word not in word_to_idx.keys():
            word_to_idx[word] = len(word_to_idx)
        if pos_tag not in tag_to_idx.keys():
            tag_to_idx[pos_tag] = len(tag_to_idx)
        for char in word:
            if char not in char_to_idx.keys():
                char_to_idx[char] = len(char_to_idx)

In [21]:
word_to_idx

{'Pierre': 0,
 'Vinken': 1,
 ',': 2,
 '61': 3,
 'years': 4,
 'old': 5,
 'will': 6,
 'join': 7,
 'the': 8,
 'board': 9,
 'as': 10,
 'a': 11,
 'nonexecutive': 12,
 'director': 13,
 'Nov.': 14,
 '29': 15,
 '.': 16,
 'Mr.': 17,
 'is': 18,
 'chairman': 19,
 'of': 20,
 'Elsevier': 21,
 'N.V.': 22,
 'Dutch': 23,
 'publishing': 24,
 'group': 25,
 'Rudolph': 26,
 'Agnew': 27,
 '55': 28,
 'and': 29,
 'former': 30,
 'Consolidated': 31,
 'Gold': 32,
 'Fields': 33,
 'PLC': 34,
 'was': 35,
 'named': 36,
 '*-1': 37,
 'this': 38,
 'British': 39,
 'industrial': 40,
 'conglomerate': 41,
 'A': 42,
 'form': 43,
 'asbestos': 44,
 'once': 45,
 'used': 46,
 '*': 47,
 'to': 48,
 'make': 49,
 'Kent': 50,
 'cigarette': 51,
 'filters': 52,
 'has': 53,
 'caused': 54,
 'high': 55,
 'percentage': 56,
 'cancer': 57,
 'deaths': 58,
 'among': 59,
 'workers': 60,
 'exposed': 61,
 'it': 62,
 'more': 63,
 'than': 64,
 '30': 65,
 'ago': 66,
 'researchers': 67,
 'reported': 68,
 '0': 69,
 '*T*-1': 70,
 'The': 71,
 'fiber':

In [22]:
tag_to_idx

{'NOUN': 0,
 '.': 1,
 'NUM': 2,
 'ADJ': 3,
 'VERB': 4,
 'DET': 5,
 'ADP': 6,
 'CONJ': 7,
 'X': 8,
 'ADV': 9,
 'PRT': 10,
 'PRON': 11}

In [23]:
char_to_idx

{'P': 0,
 'i': 1,
 'e': 2,
 'r': 3,
 'V': 4,
 'n': 5,
 'k': 6,
 ',': 7,
 '6': 8,
 '1': 9,
 'y': 10,
 'a': 11,
 's': 12,
 'o': 13,
 'l': 14,
 'd': 15,
 'w': 16,
 'j': 17,
 't': 18,
 'h': 19,
 'b': 20,
 'x': 21,
 'c': 22,
 'u': 23,
 'v': 24,
 'N': 25,
 '.': 26,
 '2': 27,
 '9': 28,
 'M': 29,
 'm': 30,
 'f': 31,
 'E': 32,
 'D': 33,
 'p': 34,
 'g': 35,
 'R': 36,
 'A': 37,
 '5': 38,
 'C': 39,
 'G': 40,
 'F': 41,
 'L': 42,
 '*': 43,
 '-': 44,
 'B': 45,
 'K': 46,
 '3': 47,
 '0': 48,
 'T': 49,
 'I': 50,
 'Y': 51,
 "'": 52,
 'J': 53,
 '`': 54,
 'W': 55,
 'q': 56,
 'H': 57,
 'U': 58,
 '8': 59,
 '4': 60,
 '?': 61,
 'z': 62,
 '&': 63,
 'S': 64,
 '7': 65,
 '%': 66,
 '$': 67,
 ';': 68,
 'O': 69,
 ':': 70,
 'Q': 71,
 'Z': 72,
 '\\': 73,
 '/': 74,
 'X': 75,
 '@': 76,
 '!': 77,
 '#': 78}

In [24]:
word_vocab_size = len(word_to_idx)
tag_vocab_size = len(tag_to_idx)
char_vocab_size = len(char_to_idx)

print("Unique words: {}".format(word_vocab_size))
print("Unique words: {}".format(tag_vocab_size))
print("Unique words: {}".format(char_vocab_size))

Unique words: 12408
Unique words: 12
Unique words: 79


In [31]:
WORD_EMBEDDING_DIM = 1024
CHAR_EMBEDDING_DIM = 128
WORD_HIDDEN_DIM = 1024
CHAR_HIDDEN_DIM = 1024
EPOCH = 2

In [32]:
class DualLSTMTagger(nn.Module):
    def __init__(self, word_embedding_dim, word_hidden_dim, char_embedding_dim, \
                char_hidden_dim, word_vocab_size, char_vocab_size, tag_vocab_size):
        super(DualLSTMTagger, self).__init__()
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)
        self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, word_hidden_dim)
        self.hidden2tag = nn.Linear(word_hidden_dim, tag_vocab_size)
        
    def forward(self, sentence, words):
        embeds = self.word_embedding(sentence)
        char_hidden_final = []
        for word in words:
            char_embeds = self.char_embedding(word)
            _, (char_hidden, char_cell_state) = self.char_lstm(char_embeds.view(len(word), 1, -1))
            word_char_hidden_state = char_hidden.view(-1)
            char_hidden_final.append(word_char_hidden_state)

        char_hidden_final = torch.stack(tuple(char_hidden_final))
        combined = torch.cat((embeds, char_hidden_final), 1)
        lstm_out, _ = self.lstm(combined.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)

        return tag_scores

In [33]:
train = tagged_sentence[:20]

model = DualLSTMTagger(WORD_EMBEDDING_DIM, WORD_HIDDEN_DIM, CHAR_EMBEDDING_DIM,
                      CHAR_HIDDEN_DIM, word_vocab_size, char_vocab_size, tag_vocab_size)

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

#test sentence
seq = 'everybody eat the food .  I kept looking out the window , \
trying to find the one I was waiting for .'.split()

print('Running a check on the model before training.\n Sentences: \n{}'.format(''.join(seq)))

with torch.no_grad():
    words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
                          dtype = torch.long) for s in seq ]
    sentence = torch.tensor(sequence_to_idx(seq, word_to_idx), \
                           dtype = torch.long)
    
    tag_scores = model(sentence, words)
    _, indices = torch.max(tag_scores, 1)
    ret = []
    
    for i in range(len(indices)):
        for key, value in tag_to_idx.items():
            if indices[i] == value:
                ret.append((seq[i], key))
    print(ret)

Running a check on the model before training.
 Sentences: 
everybodyeatthefood.Ikeptlookingoutthewindow,tryingtofindtheoneIwaswaitingfor.
[('everybody', 'NOUN'), ('eat', 'NOUN'), ('the', 'NOUN'), ('food', 'NUM'), ('.', 'ADJ'), ('I', 'NOUN'), ('kept', 'ADV'), ('looking', 'NUM'), ('out', 'VERB'), ('the', 'NOUN'), ('window', 'ADV'), (',', 'ADV'), ('trying', 'DET'), ('to', '.'), ('find', 'CONJ'), ('the', 'CONJ'), ('one', 'DET'), ('I', 'CONJ'), ('was', 'NOUN'), ('waiting', 'DET'), ('for', 'CONJ'), ('.', 'ADJ')]

  words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
  sentence = torch.tensor(sequence_to_idx(seq, word_to_idx), \





In [34]:
print('Training started')
accuracy_list = []
loss_list = []
interval = round(len(train) / 100)
e_interval = round(EPOCH/10)

for epoch in range(EPOCH):
    acc = 0
    loss = 0
    i = 0
    
    for sentence_tag in train:
        i += 1
        words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
                              dtype = torch.long) for s in sentence_tag]
        sentence = [s[0] for s in sentence_tag]
        sentence = torch.tensor(sequence_to_idx(sentence, word_to_idx), \
                               dtype=torch.long)
        targets = [s[1] for s in sentence_tag]
        targets = torch.tensor(sequence_to_idx(targets, tag_to_idx), \
                              dtype=torch.long)
        model.zero_grad()
        tag_scores = model(sentence, words)
        
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        loss += loss.item()
        _, indices = torch.max(tag_scores, 1)
        acc += torch.mean(torch.tensor(targets == indices, dtype=torch.float))
        # if i % interval == 0
        print("Epoch {} Running:\t Iteration {} Complete".format(epoch + 1, i), end = '\r', flush=True)
    loss = loss / len(train)
    acc = acc / len(train)
    loss_list.append(float(loss))
    accuracy_list.append(float(acc))
    print("Epoch {} completed, Loss : {}, \tAccuracy: {}".format(epoch + 1, np.mean(loss_list), np.mean(accuracy_list)))

Training started


  words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
  sentence = torch.tensor(sequence_to_idx(sentence, word_to_idx), \
  targets = torch.tensor(sequence_to_idx(targets, tag_to_idx), \


Epoch 1 Running:	 Iteration 1 Complete

  acc += torch.mean(torch.tensor(targets == indices, dtype=torch.float))


Epoch 1 completed, Loss : 0.09988049417734146, 	Accuracy: 0.5399650931358337
Epoch 2 completed, Loss : 0.07337447628378868, 	Accuracy: 0.6582813858985901


In [35]:
with torch.no_grad():
    words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
                          dtype = torch.long) for s in seq ]
    sentence = torch.tensor(sequence_to_idx(seq, word_to_idx), \
                           dtype = torch.long)
    
    tag_scores = model(sentence, words)
    _, indices = torch.max(tag_scores, 1)
    ret = []
    
    for i in range(len(indices)):
        for key, value in tag_to_idx.items():
            if indices[i] == value:
                ret.append((seq[i], key))
    print(ret)

[('everybody', 'PRON'), ('eat', 'PRON'), ('the', 'DET'), ('food', 'NOUN'), ('.', '.'), ('I', 'NUM'), ('kept', '.'), ('looking', 'VERB'), ('out', 'VERB'), ('the', 'DET'), ('window', 'NOUN'), (',', '.'), ('trying', 'VERB'), ('to', 'PRT'), ('find', 'DET'), ('the', 'DET'), ('one', 'DET'), ('I', 'NOUN'), ('was', 'VERB'), ('waiting', 'DET'), ('for', 'ADP'), ('.', '.')]


  words = [ torch.tensor(sequence_to_idx(s[0], char_to_idx), \
  sentence = torch.tensor(sequence_to_idx(seq, word_to_idx), \
