# PyTorch Tutorial: Linguistically-guided LSTM  

Unlike traditional machine learning methods, where features depend on specialized knowledge, deep learning can automatically extract features. In NLP, one of the most common deep learning architectures is the LSTM, which is designed to capture long-term dependencies between words. In this tutorial, we will show you how to code an LSTM model for a POS tagger from (almost) scratch using PyTorch. In addition, we will use PyTorch's ability to build a computational graph dynamically to create a graph that is guided by linguistic knowledge.
## Why PyTorch?
* PyTorch can build a computational graph dynamically.
![dynamic_graph](http://pytorch.org/static/img/dynamic_graph.gif)
* PyTorch is built to be deeply integrated into Python. You can use it naturally like you would use numpy / scipy / scikit-learn etc.

(Image reference:http://pytorch.org/about/)

In [1]:
%matplotlib inline

from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


print(torch.__version__)


0.3.0.post4


In [5]:
# Pick the tensor constructors once: CUDA variants when a GPU is available,
# plain CPU variants otherwise.
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor


## Brown Corpus
The Brown Corpus of Standard American English was the first million-word computer-readable corpus, created in 1961 at Brown University. We use the Universal Part-of-Speech tagset in this tutorial. Knowing which part of speech a word belongs to tells us about its grammatical function in the sentence and its likely neighboring words. Therefore, POS tagging is an important step in many NLP applications (e.g. Word Sense Disambiguation, Named Entity Recognition).



Universal Part-of-Speech Tagset 

|Tag | 	Meaning |	English Examples|
| --- | --- | --- |
|ADJ 	|adjective 	|new, good, high, special, big, local|
|ADP 	|adposition 	|on, of, at, with, by, into, under|
|ADV 	|adverb 	|really, already, still, early, now|
|CONJ 	|conjunction |and, or, but, if, while, although|
|DET 	|determiner, article |the, a, some, most, every, no, which|
|NOUN 	|noun 	|year, home, costs, time, Africa|
|NUM 	|numeral 	|twenty-four, fourth, 1991, 14:24|
|PRT 	|particle 	|at, on, out, over, per, that, up, with|
|PRON 	|pronoun 	|he, their, her, its, my, I, us|
|VERB 	|verb 	|is, say, told, given, playing, would|
|. 	|punctuation marks 	|. , ; !|
|X 	|other 	|ersatz, esprit, dunno, gr8, univeristy|


(Reference: http://www.nltk.org/book/ch05.html)



In [28]:
tagged_sents=nltk.corpus.brown.tagged_sents(tagset='universal') # Brown corpus from NLTK: sentences of (word, tag) pairs using the universal tagset

In [29]:
# Split the data into a training set (the first 90% of the sentences) and a
# test set (the remaining 10%). The split is positional: the sentences are
# sliced in corpus order and are not shuffled first.
train_size = int(len(tagged_sents) * 0.9)
train_sents = tagged_sents[:train_size]
test_sents = tagged_sents[train_size:]

In [30]:
#a sample sentence from the training set
train_sents[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

## Preparing the Data: Words, POS tags


Build index mappings for the words and POS tags seen in the training set.

In [31]:
# Collect the distinct words and POS tags of the training sentences in one pass.
vocab = {"UNK"}  # "UNK" is the special token used for unknown words
tagset = set()

for sent in train_sents:
    for pair in sent:
        vocab.add(pair[0])   # the word
        tagset.add(pair[1])  # its POS tag

# Sorting makes the word -> index and tag -> index assignment deterministic.
all_words = sorted(vocab)
all_pos = sorted(tagset)
del vocab, tagset
word_to_ix = {w: i for i, w in enumerate(all_words)}  # word -> index
pos_to_ix = {t: i for i, t in enumerate(all_pos)}  # POS tag -> index



In [32]:
print(pos_to_ix) # POS tags

{'.': 0, 'ADJ': 1, 'ADP': 2, 'ADV': 3, 'CONJ': 4, 'DET': 5, 'NOUN': 6, 'NUM': 7, 'PRON': 8, 'PRT': 9, 'VERB': 10, 'X': 11}


In [33]:
# Reverse lookups, used to turn model indices back into readable labels.
ix_to_word = {index: word for word, index in word_to_ix.items()}  # index -> word
ix_to_pos = {index: tag for tag, index in pos_to_ix.items()}  # index -> POS tag


## Preparing the Data for PyTorch

In [34]:
# Separate every tagged sentence into its words (model input) and its POS
# tags (target), for both the training and the test set.

input_sent = [[pair[0] for pair in tagged] for tagged in train_sents]  # words only
train_targets = [[pair[1] for pair in tagged] for tagged in train_sents]  # POS only

input_test_sent = [[pair[0] for pair in tagged] for tagged in test_sents]  # words only
test_targets = [[pair[1] for pair in tagged] for tagged in test_sents]  # POS only
print(input_sent[0], "\n", train_targets[0])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'] 
 ['DET', 'NOUN', 'NOUN', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'NOUN', 'NOUN', 'VERB', '.', 'DET', 'NOUN', '.', 'ADP', 'DET', 'NOUN', 'VERB', 'NOUN', '.']


In [67]:
#Functions for converting input/target text sequences to PyTorch-compatible sequences 

def prepare_sequence_word(input_text):
    """
    Encode a training sentence (a list of words) as model inputs.

    Returns a pair:
      * a Variable wrapping a LongTensor with the vocabulary index of every
        word (each word must already be a key of word_to_ix), and
      * a LongTensor of capitalization flags: 1 when the word's first
        character is uppercase, 0 otherwise.
    """
    word_ids = LongTensor([word_to_ix[token] for token in input_text])
    cap_flags = [1 if token[0].isupper() else 0 for token in input_text]
    return Variable(word_ids), LongTensor(cap_flags)

def prepare_sequence_word_test(input_text):
    """
    Encode a test sentence (a list of words) as model inputs.

    Words missing from word_to_ix are mapped to the index of the "UNK" token.
    Returns a pair: a volatile Variable (no gradient tracking) wrapping the
    LongTensor of word indices, and a LongTensor of capitalization flags
    (1 when the word's first character is uppercase, 0 otherwise).
    """
    word_ids = LongTensor([
        word_to_ix[token] if token in word_to_ix else word_to_ix["UNK"]
        for token in input_text
    ])
    cap_flags = [1 if token[0].isupper() else 0 for token in input_text]
    return Variable(word_ids, volatile=True), LongTensor(cap_flags)

def prepare_sequence_target_pos(input_label):
    """
    Encode a training target sequence (a list of POS tags) as a Variable
    wrapping the LongTensor of the tags' indices in pos_to_ix.
    """
    tag_ids = [pos_to_ix[tag] for tag in input_label]
    return Variable(LongTensor(tag_ids))



In [36]:
print(prepare_sequence_word(input_sent[0]))
print(input_sent[0])

(Variable containing:
 16844
  7869
  5699
  8350
 10010
 45101
  7822
 19580
 34929
 39323
  3196
 43450
 42137
 28331
 42267
 18506
 38787
 29130
   321
 49753
 19857
 35023
 50336
 41203
   400
[torch.cuda.LongTensor of size 25 (GPU 0)]
, 
 1
 1
 1
 1
 1
 0
 1
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.cuda.LongTensor of size 25 (GPU 0)]
)
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


### The linguistically guided LSTM Neural Network Architecture for POS tagger



![lstm](https://cdn-images-1.medium.com/max/800/1*Hil-MwFIs_6l4naBNbUQXg.png)

##### This diagram represents a typical LSTM cell. What if we want an LSTM with control flow, e.g. a different input gate depending on whether the current word is capitalized?



In [60]:
class LSTM(nn.Module):
    """
    Hand-written LSTM cell for POS tagging whose input gate is selected by a
    per-word feature.

    Each call to forward() processes ONE word. The caller is responsible for
    looping over the sentence and threading the hidden and cell state from
    one step to the next. Two separate sets of input-gate weights are kept:
    one for ordinary words (feat == 0) and one for capitalized words
    (feat == 1), so the computational graph differs from word to word.
    """

    def __init__(self, input_size, word_emb_dim , hidden_dim, output_size):
        """
        input_size   -- number of rows of the embedding table (vocabulary size)
        word_emb_dim -- size of each word embedding vector
        hidden_dim   -- size of the hidden state and of the cell state
        output_size  -- number of output classes (POS tags)
        """
        super(LSTM, self).__init__()

        self.hidden_dim = hidden_dim

        self.word_embedding = nn.Embedding(input_size, word_emb_dim)

        ################################################################
        self.i2ig = nn.Linear(word_emb_dim, hidden_dim) #normal input gate
        self.h2ig = nn.Linear(hidden_dim, hidden_dim) #normal input gate

        self.i2ig_cap = nn.Linear(word_emb_dim, hidden_dim) #input gate for capitalized words
        self.h2ig_cap = nn.Linear(hidden_dim, hidden_dim) #input gate for capitalized words
        #####################################################################
        self.i2ctil = nn.Linear(word_emb_dim, hidden_dim) #c_tilde (new memory cell)
        self.h2ctil = nn.Linear(hidden_dim, hidden_dim) #c_tilde (new memory cell)

        self.i2fg = nn.Linear(word_emb_dim, hidden_dim) #forget gate
        self.h2fg = nn.Linear(hidden_dim, hidden_dim) #forget gate

        self.i2og = nn.Linear(word_emb_dim, hidden_dim) #output gate
        self.h2og = nn.Linear(hidden_dim, hidden_dim) #output gate

        self.hidden2tag = nn.Linear(hidden_dim, output_size) #output
        # The states built by initHidden()/initCellState() are (1, hidden_dim),
        # so the tag scores are (1, output_size) and the classes lie on dim 1.
        # Naming the dimension explicitly avoids the deprecated implicit choice.
        self.softmax = nn.LogSoftmax(dim=1)

    def _inputGate(self, word_embed, hidden):
        """
        Input gate: how much the current input matters according to the past
        hidden state and the current input.
        """
        return torch.sigmoid(self.i2ig(word_embed) + self.h2ig(hidden))

    def _inputGateCap(self, word_embed, hidden):
        """
        Input gate used for a capitalized word (separate weights from
        _inputGate).
        """
        return torch.sigmoid(self.i2ig_cap(word_embed) + self.h2ig_cap(hidden))

    def _newMemoryCell(self, word_embed, hidden):
        """
        New memory cell (c_tilde): candidate content computed from the past
        hidden state and the current input.
        """
        return torch.tanh(self.i2ctil(word_embed) + self.h2ctil(hidden))

    def _forgetGate(self, word_embed, hidden):
        """
        Forget gate: how much the past memory cell matters according to the
        past hidden state and the current input.
        """
        return torch.sigmoid(self.i2fg(word_embed) + self.h2fg(hidden))

    def _cellState(self,c_prev,c_tilda,f_t,i_t):
        """
        Final memory cell (c_t): the past memory cell scaled by the forget
        gate plus the new memory cell scaled by the input gate.
        """
        return (f_t * c_prev) + (i_t * c_tilda)

    def _outputGate(self, word_embed, hidden, c_t):
        """
        Output gate: compute the final hidden state from the current input,
        the past hidden state, and the final memory cell.
        """
        o_t = torch.sigmoid(self.i2og(word_embed) + self.h2og(hidden))
        return o_t * torch.tanh(c_t)

    def forward(self, input_word, hidden, c_prev,feat):
        """
        Run one LSTM step for a single word.

        input_word -- index tensor of the current word (fed to the embedding)
        hidden     -- hidden state from the previous step
        c_prev     -- cell state from the previous step
        feat       -- 0 to use the normal input gate, 1 to use the input gate
                      for capitalized words

        Returns (output, h_t, c_t): the log-probabilities over the tags and
        the new hidden and cell states.
        Raises ValueError when feat is neither 0 nor 1.
        """
        word_embed = self.word_embedding(input_word)
        f_t = self._forgetGate(word_embed, hidden)

        # Control flow: the feature decides which input-gate weights are used.
        if feat == 0:
            i_t = self._inputGate(word_embed, hidden)
        elif feat == 1:
            i_t = self._inputGateCap(word_embed, hidden)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on i_t; fail early with a clear message.
            raise ValueError("feat must be 0 or 1, got {!r}".format(feat))

        c_tilde = self._newMemoryCell(word_embed, hidden)
        c_t = self._cellState(c_prev, c_tilde, f_t, i_t) #final memory cell
        h_t = self._outputGate(word_embed, hidden, c_t) #final hidden state
        lstm_out = self.hidden2tag(h_t)
        output = self.softmax(lstm_out)
        return output, h_t, c_t

    def _zeroState(self):
        """
        Build a (1, hidden_dim) all-zero state placed on the GPU when the
        model's own parameters are on the GPU, so the state always matches
        the model instead of depending on a module-level flag.
        """
        zeros = torch.zeros(1, self.hidden_dim)
        if next(self.parameters()).is_cuda:
            zeros = zeros.cuda()
        return Variable(zeros)

    def initHidden(self):
        """Return the all-zero initial hidden state."""
        return self._zeroState()

    def initCellState(self):
        """Return the all-zero initial cell state."""
        return self._zeroState()
# Model hyperparameters and instantiation
n_words = len(word_to_ix) # number of unique words (size of the word index, including "UNK")
n_hidden = 128 # number of hidden dimensions
n_emb_dim = 64 # number of word vector dimensions
n_categories = len(pos_to_ix) # number of POS categories
lstm = LSTM(n_words, n_emb_dim, n_hidden, n_categories) # initialize the model
if USE_CUDA:
    lstm = lstm.cuda() # move the model to the GPU when one is available


## Training

In [61]:
criterion = nn.NLLLoss() # the model's last layer is nn.LogSoftmax, so NLLLoss is the matching loss
learning_rate = 0.001
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate) # Adam optimizer over all model parameters

def train(category_tensor, sentence_tensor,feat_tensor):
    """
    Run one optimization step on a single sentence.

    category_tensor -- target POS indices, one per word
    sentence_tensor -- word indices of the sentence
    feat_tensor     -- capitalization flag (0 or 1) for every word

    The sentence is fed to the model word by word, the per-word outputs are
    stacked, and the loss over the whole sentence is back-propagated before
    the optimizer updates the parameters.

    Returns (all_outputs, loss_value): the stacked log-probabilities and the
    loss as a plain Python number.
    Raises ValueError for an empty sentence.
    """
    n_steps = sentence_tensor.size()[0]
    if n_steps == 0:
        # Without this guard the function failed later on an unbound variable.
        raise ValueError("cannot train on an empty sentence")

    lstm.train(True)
    hidden = lstm.initHidden() # initialize hidden state
    cell_state = lstm.initCellState() # initialize cell state
    lstm.zero_grad() # reset gradients to zero

    # Collect the per-word outputs and concatenate them once at the end;
    # concatenating inside the loop re-copies every earlier output each step.
    step_outputs = []
    for i in range(n_steps): # for each word in the sentence
        output, hidden, cell_state = lstm(sentence_tensor[i], hidden, cell_state, feat_tensor[i]) # forward step
        step_outputs.append(output)
    all_outputs = torch.cat(step_outputs, 0)

    loss = criterion(all_outputs, category_tensor) # loss over the whole sentence
    loss.backward() # backprop
    optimizer.step() # update parameters

    # PyTorch >= 0.4 exposes scalar losses through .item(); older releases
    # (such as the 0.3 this notebook was run on) need loss.data[0].
    loss_value = loss.item() if hasattr(loss, "item") else loss.data[0]
    return all_outputs, loss_value


In [64]:
import time
import math

n_iters = 20 # number of passes (epochs) over the training sentences
print_every = 1 # print a progress line every this many epochs

def timeSince(since):
    """
    Format the wall-clock time elapsed since `since` (a time.time() stamp)
    as whole minutes and whole seconds, e.g. '2m 5s'.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

# `epoch` (rather than `iter`) keeps the builtin iter() usable after this cell.
for epoch in range(1, n_iters + 1): # one epoch = one pass over all training sentences
    for sentence, pos_tags in zip(input_sent, train_targets): # one sentence at a time
        sentence_tensor, feat_tensor = prepare_sequence_word(sentence) # preprocess input
        category_tensor = prepare_sequence_target_pos(pos_tags) # preprocess tags
        output, loss = train(category_tensor, sentence_tensor, feat_tensor) # forward, backward and parameter update

    # Print epoch number, progress, elapsed time and loss.
    # NOTE: `loss` is the loss of the LAST sentence of the epoch, not an
    # average over the epoch, so the printed values fluctuate.
    if epoch % print_every == 0:
        print('%d %d%% (%s) %.4f ' % (epoch, epoch / n_iters * 100, timeSince(start), loss))



1 5% (23m 16s) 0.4894 
2 10% (46m 29s) 0.2441 
3 15% (69m 43s) 0.1441 
4 20% (92m 56s) 0.0420 
5 25% (116m 8s) 0.0231 
6 30% (139m 20s) 0.0393 
7 35% (162m 32s) 0.0332 
8 40% (185m 45s) 0.0759 
9 45% (208m 58s) 0.0330 
10 50% (232m 10s) 0.0934 
11 55% (255m 24s) 0.0481 
12 60% (278m 40s) 0.0194 
13 65% (301m 57s) 0.0499 
14 70% (325m 15s) 0.0640 
15 75% (348m 32s) 0.0402 
16 80% (371m 49s) 0.0822 
17 85% (395m 10s) 0.0658 
18 90% (418m 38s) 0.0286 
19 95% (442m 54s) 0.0345 
20 100% (466m 28s) 0.0118 


# Save/Load model

In [None]:
# save model
#torch.save(lstm.state_dict(), "mylstm.pt")



In [None]:
# load model
# lstm = LSTM(n_words, n_emb_dim, n_hidden, n_categories) #initilize the model
# if USE_CUDA:
#     lstm = lstm.cuda()
# lstm.load_state_dict(torch.load("mylstm.pt"))



# Evaluation

In [71]:
def predict(input_sent):
    """
    Predict the POS tag of every word in `input_sent` (a list of words).

    The model is switched to evaluation mode, the sentence is encoded with
    prepare_sequence_word_test, and the words are fed to the model one at a
    time while the hidden and cell state are carried forward. For each word
    the tag with the highest output score is chosen.

    Returns the list of predicted tag names, one per word.
    """
    lstm.train(False) # evaluation mode
    sentence_tensor, feat_tensor = prepare_sequence_word_test(input_sent)
    hidden = lstm.initHidden()
    cell_state = lstm.initCellState()

    tags = []
    for position in range(sentence_tensor.size()[0]):
        log_probs, hidden, cell_state = lstm(
            sentence_tensor[position], hidden, cell_state, feat_tensor[position]
        )
        scores = log_probs[0].data.tolist()
        # index of the first maximal score
        best_ix = max(range(len(scores)), key=scores.__getitem__)
        tags.append(ix_to_pos[best_ix])

    return tags


In [None]:
predict(input_test_sent[1])

In [None]:
test_sents[1]

In [72]:
# Predict the POS tags of every sentence in the test set
y_pred = [predict(test_sent) for test_sent in input_test_sent]



In [73]:
def pos_classification_report(y_true, y_pred):
    """
    Build a per-tag precision / recall / F1 report.

    y_true and y_pred are lists of sentences, each a list of tag names.
    Both are flattened to one tag per word and binarized with a
    LabelBinarizer fitted on the true tags; the report rows are the tags in
    sorted order, printed with four digits.
    """
    binarizer = LabelBinarizer()
    gold = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    guessed = binarizer.transform(list(chain.from_iterable(y_pred)))

    # Map every tag to its column in the binarized matrices.
    column_of = dict((tag, col) for col, tag in enumerate(binarizer.classes_))
    tags = sorted(set(binarizer.classes_))
    columns = [column_of[tag] for tag in tags]

    return classification_report(
        gold,
        guessed,
        labels=columns,
        target_names=tags,
        digits=4
    )

In [74]:
print(pos_classification_report(test_targets,y_pred))

             precision    recall  f1-score   support

          .     0.9999    0.9999    0.9999     15505
        ADJ     0.9004    0.8314    0.8645      5492
        ADP     0.9215    0.9270    0.9243      9630
        ADV     0.9014    0.8839    0.8926      5357
       CONJ     0.9952    0.9952    0.9952      3326
        DET     0.9668    0.9755    0.9711     10113
       NOUN     0.9395    0.9549    0.9472     17692
        NUM     0.8394    0.9548    0.8934       487
       PRON     0.9694    0.9740    0.9717      7353
        PRT     0.8595    0.8201    0.8393      3446
       VERB     0.9574    0.9712    0.9642     16917
          X     0.3750    0.0726    0.1216       124

avg / total     0.9493    0.9502    0.9495     95442

