# Structured Perceptron demo for deliverable 2

## Overview of the data

url = https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
!ls ../data/kaggle_ner

[1m[31mner.csv[m[m         [1m[31mner_dataset.csv[m[m


In [3]:
foldername = "kaggle_ner"
parent_path = "../data"

def stringbold(string):
    BOLD = '\033[1m'
    END = '\033[0m'
    return BOLD + string + END

def download_kaggle_ner(foldername, parent_path):

    url = "/https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/downloads/entity-annotated-corpus.zip"
    full_path = os.path.join(parent_path, foldername)
    
    if foldername not in os.listdir(parent_path):
        print("Folder {} not found in {}".format(stringbold(foldername),stringbold(parent_path)))
        print("Creating folder {}".format(stringbold(full_path)))
        print("\nDownload the data from \n{} \n Write 'ner_dataset.csv' and 'ner.csv' in {}  "\
              .format(stringbold(url),full_path))
        os.mkdir(full_path)


In [4]:
full_path = os.path.join(parent_path, foldername)
url = "/https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/downloads/entity-annotated-corpus.zip"
download_kaggle_ner(foldername, parent_path)

In [5]:
# This should print the head of the csv
! head ../data/kaggle_ner/ner_dataset.csv

Sentence #,Word,POS,Tag
Sentence: 1,Thousands,NNS,O
,of,IN,O
,demonstrators,NNS,O
,have,VBP,O
,marched,VBN,O
,through,IN,O
,London,NNP,B-geo
,to,TO,O
,protest,VB,O


## reading the data

In [6]:
data = pd.read_csv("../data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1")

sentence_formatter = "Sentence: {}"

last_n = 2000
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]
data = data[0:end]

n_sentences = len(list(set(data["Sentence #"])))
first_n = 1
last_n = last_n -1
print(n_sentences)

2000


In [7]:
%%time 
sentence_formatter = "Sentence: {}"

for s_id in  range(first_n, last_n):
    print("current {}/{}".format(s_id,last_n), end="\r")
    sentence_id = sentence_formatter.format(s_id)
    sentence_id_next = sentence_formatter.format(s_id + 1)
    start = data.index[data["Sentence #"] == sentence_id][0]
    end   = data.index[data["Sentence #"] == sentence_id_next][0]
    data["Sentence #"][start:end] = sentence_id
    
sentence_id = sentence_formatter.format(last_n)
start = data.index[data["Sentence #"] == sentence_id][0]
end   = data.shape[0]
data["Sentence #"][start:end] = sentence_id


CPU times: user 7.47 s, sys: 160 ms, total: 7.63 s
Wall time: 7.57 s


Getting the information of a single sentence

In [8]:
index_example = 19
n_w = 15
sentence_id = "Sentence: {}".format(index_example)

df_sentence = data[data["Sentence #"]==sentence_id]
x = list(df_sentence["Word"])
y = list(df_sentence["Tag"])

for w,t in zip(x,y):
    w = w.ljust(n_w)
    print("{0} {1}".format(w,t))

Suspected       O
Islamist        O
rebels          O
have            O
fired           O
mortar          O
shells          O
at              O
the             O
palace          O
used            O
by              O
Somalia         B-geo
's              O
interim         O
President       B-per
Abdullahi       I-per
Yusuf           I-per
Ahmad           I-per
.               O


## Building X and Y

In [9]:
n_sentences

2000

In [10]:
X = []
Y = []

sentence_formatter = "Sentence: {}"

for i in range(1,n_sentences):
    s = sentence_formatter.format(i)
    X.append(list(data[data["Sentence #"]==s]["Word"].values))
    Y.append(list(data[data["Sentence #"]==s]["Tag"].values))

In [11]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X[i],Y[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

In [12]:
X[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [13]:
def build_word_to_pos(X):

    word_to_pos = {}
    i = 0
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i +=1
                
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i +=1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}

    return tag_to_pos, pos_to_tag

In [14]:
word_to_pos, pos_to_word = build_word_to_pos(X)
tag_to_pos, pos_to_tag  = build_tag_to_pos(Y)

len(word_to_pos), len(tag_to_pos)

(7047, 17)

In [15]:
tag_to_pos

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [16]:
X_ids = [[word_to_pos[w] for w in s] for s in X]
Y_ids = [[tag_to_pos[t] for t in s] for s in Y]

In [17]:
len(X),len(Y)

(1999, 1999)

# Use Structured perceptron with the provided dada

Add new features to the structured perceptron to deal with particular classes found in your data


The class ExtendedFeatures can be expanded.

For example you can add inside the method add_emission_features the following:



```
        if word.istitle():
            # Generate feature name.
            feat_name = "uppercased::%s" % y_name
            # Get feature ID from name.
            feat_id = self.add_feature(feat_name)
            # Append feature.
            if feat_id != -1:
                features.append(feat_id)
```

In [19]:

import scipy
import numpy as np

import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import skseq

from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary

In [20]:
seq = Sequence(x=X[0], y=Y[0])
seq

Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O 

In [21]:
sequence_list = SequenceList(LabelDictionary(word_to_pos), LabelDictionary(tag_to_pos))

In [22]:
for x,y in zip(X,Y):
    sequence_list.add_sequence(x,y, LabelDictionary(word_to_pos), LabelDictionary(tag_to_pos))

In [23]:
sequence_list[0]

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [24]:
sequence_list[0].to_words(sequence_list=sequence_list)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

## building features form the data

In [25]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(sequence_list)
feature_mapper.build_features()

In [26]:
import pprint
pprint.pprint(list(feature_mapper.__dict__.keys()))

['feature_dict',
 'feature_list',
 'add_features',
 'dataset',
 'node_feature_cache',
 'initial_state_feature_cache',
 'final_state_feature_cache',
 'edge_feature_cache']


In [27]:
len(feature_mapper.feature_dict)

7891

In [28]:
len(feature_mapper.feature_list)

1999

In [29]:
list(feature_mapper.feature_dict)[0:5]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O']

In [30]:
list(feature_mapper.feature_dict)[0:10]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'id:have::O',
 'id:marched::O',
 'id:through::O',
 'id:London::B-geo',
 'prev_tag:O::B-geo']

In [31]:
set([x.split(":")[0] for x in feature_mapper.feature_dict.keys()])

{'final_prev_tag', 'id', 'init_tag', 'prev_tag'}

In [32]:
# Set of features activated for the first example
feature_mapper.feature_list[1]

[[[0]],
 [[3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [44],
  [46],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3],
  [3]],
 [[28]],
 [[29],
  [2],
  [30],
  [31],
  [15],
  [13],
  [32],
  [33],
  [13],
  [34],
  [35],
  [36],
  [37],
  [38],
  [39],
  [40],
  [41],
  [42],
  [43],
  [45],
  [47],
  [48],
  [42],
  [17],
  [42],
  [49],
  [13],
  [50],
  [27],
  [42]]]

## Training structured perceptron

In [33]:
import skseq.readers.pos_corpus
corpus = skseq.readers.pos_corpus.PostagCorpus()

In [34]:
import skseq.sequences.structured_perceptron as spc

sp = spc.StructuredPerceptron(word_to_pos, tag_to_pos, feature_mapper)
sp.num_epochs = 5

In [35]:
sp.get_num_states(), sp.get_num_observations()

(17, 7047)

In [36]:
%%time
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.797048
Epoch: 1 Accuracy: 0.855730
Epoch: 2 Accuracy: 0.886423
Epoch: 3 Accuracy: 0.906794
Epoch: 4 Accuracy: 0.918017
Epoch: 5 Accuracy: 0.928113
Epoch: 6 Accuracy: 0.939628
Epoch: 7 Accuracy: 0.935144
Epoch: 8 Accuracy: 0.951617
Epoch: 9 Accuracy: 0.950310
Epoch: 10 Accuracy: 0.956146
Epoch: 11 Accuracy: 0.956868
Epoch: 12 Accuracy: 0.962389
Epoch: 13 Accuracy: 0.959910
Epoch: 14 Accuracy: 0.964011
CPU times: user 3min, sys: 789 ms, total: 3min 1s
Wall time: 3min


### How to give a new phrase to the structured perceptron

- Remember that the structured perceptron `viterbi_decode` method needs a `Sequence` object


- If you want to predict a set of tags for a new phrase you can create a `Sequence` object and fill the object with 0 values.

Let's look at a couple of  examples

In [37]:
p = "Sara had been to London for years yet she wanted to go back to Barcelona ."
new_seq = skseq.sequences.sequence.Sequence(x=p.split(), y=[int(0) for w in p.split()])

In [38]:
sp.viterbi_decode(new_seq)

(Sara/0 had/0 been/0 to/0 London/1 for/0 years/0 yet/0 she/0 wanted/0 to/0 go/0 back/0 to/0 Barcelona/0 ./0 ,
 279.66666666666674)

In [39]:
new_seq = skseq.sequences.sequence.Sequence(x=X[0], y=[int(0) for w in p.split()])

In [40]:
xy_hat = sp.viterbi_decode(new_seq)[0]
xy_hat

Thousands/0 of/0 demonstrators/0 have/0 marched/0 through/0 London/1 to/0 protest/0 the/0 war/0 in/0 Iraq/1 and/0 demand/0 the/0 withdrawal/0 of/0 British/2 troops/0 from/0 that/0 country/0 ./0 

In [41]:
xy_hat.to_words(sequence_list=sequence_list, only_tag_translation=True)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '


## Torch introduction

Inside `torch.nn` there are several layer objects already build into `torch`. We can use them to build more complex neural networks.

- torch.nn.Linear
- torch.nn.Embedding
- torch.nn.LogSoftmax
- torch.nn.Dropout
- torch.nn.ReLU
- torch.nn.GRU


**It is important to notice that, in order to use  the forward method  already implemented in those layers we need to use as input data  formatted as `torch.Variable` type:**
```
n_input = 3
n_output = 2
sample = torch.autograd.Variable(torch.Tensor([5.,4.,3.]))
linear_layer = torch.nn.Linear(n_input, n_output)
linear_layer.forward(sample)
```


**Thefore if we define an example as a `torch.tensor` we will not be able to forward propagate it:**

```
n_input = 3
n_output = 2
sample = torch.Tensor([5.,4.,3.])
linear_layer = torch.nn.Linear(n_input, n_output)
linear_layer.forward(sample) # -------------------> this does not work
```

## About recurrent neural networks

- RNNCell does the forward pass for a single time step of a sequence (specially usefull if you want to do "custom" operatios at every time step).
- RNN applies the RNNCell forward pass to every time step of an input sequence (just like the traditional RNN)



In [43]:
import torch
x = torch.Tensor(np.random.rand(3,1))

### Linear layer in numpy



In [44]:

np.random.seed(1234)
W = np.random.rand(2, 3)
x = np.random.rand(3, 1)
W.shape, x.shape

((2, 3), (3, 1))

In [45]:
W

array([[0.19151945, 0.62210877, 0.43772774],
       [0.78535858, 0.77997581, 0.27259261]])

In [46]:
x

array([[0.27646426],
       [0.80187218],
       [0.95813935]])

In [47]:
np.matmul(W, x)

array([[0.97120417],
       [1.10374618]])

In [48]:
W @ x

array([[0.97120417],
       [1.10374618]])

### torch nn.Linear

In [49]:
n_input = 3
n_output = 2

torch.manual_seed(1000)
sample = torch.autograd.Variable(torch.Tensor([5.,4.,3.]))

linear_layer = torch.nn.Linear(n_input, n_output)
linear_layer

Linear(in_features=3, out_features=2, bias=True)

In [50]:
linear_layer.weight 

Parameter containing:
tensor([[-0.2091,  0.1311, -0.0672],
        [-0.2794, -0.2628,  0.1456]], requires_grad=True)

In [51]:
linear_layer.bias

Parameter containing:
tensor([-0.0681, -0.1556], requires_grad=True)

In [52]:
linear_layer.forward(sample)

tensor([-0.7907, -2.1668], grad_fn=<AddBackward0>)

In [53]:
# We can retrieve the weights and biases from the network to numpy as follows:
W_np = linear_layer.weight.data.numpy()
b_np = linear_layer.bias.data.numpy()
x_np = sample.data.numpy()
W_np @ x_np + b_np

array([-0.79068434, -2.166812  ], dtype=float32)

In [54]:
linear_layer.state_dict().keys()

odict_keys(['weight', 'bias'])

### nn.Embedding

In [55]:
n_input = 5000
n_output = 10
embedding = torch.nn.Embedding(n_input, n_output)
embedding

Embedding(5000, 10)

In [56]:
sample = torch.LongTensor([506])

In [57]:
embedding.forward(sample)

tensor([[ 0.0018,  1.7398,  0.0347, -0.1383, -0.0893, -0.4650, -0.1623,  0.1137,
         -0.3421, -0.2518]], grad_fn=<EmbeddingBackward>)

In [58]:
embedding.weight

Parameter containing:
tensor([[-0.3879,  1.2894, -0.9362,  ...,  0.2743, -0.8496,  0.3947],
        [ 0.0848,  0.1864,  0.0859,  ...,  1.0726,  1.0481,  1.0527],
        [-0.6424, -1.2234, -1.0794,  ..., -0.0482,  0.6610, -0.8908],
        ...,
        [-0.4186,  0.0305, -0.7265,  ...,  0.0622, -0.1281,  0.8795],
        [ 0.2722, -0.7068,  0.7342,  ...,  0.8290, -0.4435, -0.0754],
        [-0.4442,  0.8973, -1.2622,  ..., -1.2709, -1.1286,  0.7347]],
       requires_grad=True)

In [59]:
embedding.state_dict().keys()

odict_keys(['weight'])

### nn.GRU

In [60]:
np.random.seed(1234)
gru = torch.nn.GRU(6, 256)
sample = torch.autograd.Variable(torch.Tensor(np.random.rand(6).reshape(1,1,6)))
sample

tensor([[[0.1915, 0.6221, 0.4377, 0.7854, 0.7800, 0.2726]]])

In [61]:
type(gru.forward(sample)), len(gru.forward(sample))

(tuple, 2)

In [62]:
a,_ = gru.forward(sample)

In [63]:
a.size()

torch.Size([1, 1, 256])

In [64]:
import pprint
pprint.pprint(list(gru.state_dict().keys()))

['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0']


### nn.GRU bidirectional

If we want to generate a representation that takes into account a sequence read from left to righ and from right to left we can use the `bidirectional=true` argument. This will double the number of parameters in our `gru` network. In fact this generates two GRU networks generating in the forward pass the concatenation of the output of both GRUs.

In [65]:
np.random.seed(1234)
gru = torch.nn.GRU(6, 256, bidirectional=True)
sample = torch.Tensor(np.random.rand(6).reshape(1,1,6))
sample

tensor([[[0.1915, 0.6221, 0.4377, 0.7854, 0.7800, 0.2726]]])

In [66]:
type(gru.forward(sample)), len(gru.forward(sample))

(tuple, 2)

In [67]:
a,_ = gru.forward(sample)

In [68]:
# Notice that the forward pass returns a 512 vector instead of 256. 
# This is because what is returned is the concatenation of two vectors: 
# one from the "left_to_right" GRU  and the other from the "right_to_left" GRU.
a.size()

torch.Size([1, 1, 512])

In [69]:
pprint.pprint(list(gru.state_dict().keys()))

['weight_ih_l0',
 'weight_hh_l0',
 'bias_ih_l0',
 'bias_hh_l0',
 'weight_ih_l0_reverse',
 'weight_hh_l0_reverse',
 'bias_ih_l0_reverse',
 'bias_hh_l0_reverse']


### Understanding shapes in a GRU bidirectional

In [70]:
torch.manual_seed(1234)
random_input = torch.FloatTensor(5, 1, 1).normal_()
random_input[:, 0, 0]

tensor([ 0.0461,  0.4024, -1.0115,  0.2167, -0.6123])

In [71]:
bi_grus = torch.nn.GRU(input_size=1, hidden_size=1, num_layers=1, batch_first=False, bidirectional=True)

In [72]:
reverse_gru = torch.nn.GRU(input_size=1, hidden_size=1, num_layers=1, batch_first=False, bidirectional=False)
reverse_gru.weight_ih_l0 = bi_grus.weight_ih_l0_reverse
reverse_gru.weight_hh_l0 = bi_grus.weight_hh_l0_reverse
reverse_gru.bias_ih_l0 = bi_grus.bias_ih_l0_reverse
reverse_gru.bias_hh_l0 = bi_grus.bias_hh_l0_reverse

In [73]:
bi_output, bi_hidden = bi_grus(random_input)

In [74]:
reverse_output, reverse_hidden = reverse_gru(random_input[np.arange(4, -1, -1), :, :])

In [75]:
reverse_output[:, 0, 0]

tensor([0.4095, 0.4667, 0.5444, 0.5134, 0.5124], grad_fn=<SelectBackward>)

In [76]:
bi_output[:, 0, 1]

tensor([0.5124, 0.5134, 0.5444, 0.4667, 0.4095], grad_fn=<SelectBackward>)

In [77]:
reverse_hidden

tensor([[[0.5124]]], grad_fn=<StackBackward>)

In [78]:
bi_hidden

tensor([[[0.4491]],

        [[0.5124]]], grad_fn=<StackBackward>)

### Stacking GRU units 

In [79]:
rnn = torch.nn.GRU(input_size = 10, hidden_size=20, num_layers=2)
rnn.input_size, rnn.hidden_size, rnn.num_layers

(10, 20, 2)

In [80]:
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)
output.size(), hn.size()

(torch.Size([5, 3, 20]), torch.Size([2, 3, 20]))

## Pytorch NER demo for deliverable 2

This demo is UNFINISHED.

It is basically the code found  https://cs230-stanford.github.io/pytorch-nlp.html
and it's important you take time to finish it and understand it. It's your job to make it work in a way that you like how it is behaving.

In [81]:
import torch
torch.__version__

'1.1.0'

In [82]:
train_sentences = []        
train_labels = []

for x in X:
    # replace each token by its index if it is in vocab
    # else use index of UNK
    s = [word_to_pos[token] if token in word_to_pos 
         else vocab['UNK'] for token in x]
    train_sentences.append(s)
    
for y in Y:
    # replace each label by its index
    l = [tag_to_pos[label] for label in y]
    train_labels.append(l)  


In [83]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()

        self.batch_size      = params.batch_size
        self.batch_max_len   = params.batch_max_len
        self.lstm_hidden_dim = params.lstm_hidden_dim
        
        # maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params.vocab_size, params.embedding_dim)

        # the LSTM takens embedded sentence
        self.lstm = nn.LSTM(params.embedding_dim, params.lstm_hidden_dim, batch_first=True)

        # fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(params.lstm_hidden_dim, params.number_of_tags)

    def forward(self, s):
        # apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        # run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim                

        # reshape the Variable so that each row contains one token
        s = s.contiguous()
        s = s.view(self.batch_size, self.batch_max_len, self.lstm_hidden_dim)  # dim: batch_size*batch_max_len x lstm_hidden_dim

        # apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags

    def loss_fn(self, outputs, labels):
      # reshape labels to give a flat vector of length batch_size*seq_len
      labels = labels.view(-1)  

      # mask out 'PAD' tokens
      mask = (labels >= 0).float()

      # the number of tokens is the sum of elements in mask
      num_tokens = int(torch.sum(mask).data[0])

      # pick the values corresponding to labels and multiply by mask
      outputs = outputs[range(outputs.shape[0]), labels]*mask

      # cross entropy loss for all non 'PAD' tokens
      return -torch.sum(outputs)/num_tokens
    
    
    def optimize(self, batch):
        
        batch_cost = self.forward(batch)
        batch_cost.backward()

In [84]:
import collections

Params = collections.namedtuple("Params", 
                                ("vocab_size", 
                                 "embedding_dim",
                                 "lstm_hidden_dim",
                                 "number_of_tags",
                                 "batch_size",
                                 "batch_max_len"))

In [85]:
vocab_size       = len(word_to_pos)
embedding_dim    = 200
lstm_hidden_dim  = 100
number_of_tags   = len(tag_to_pos)
batch_size       = 5
natch_max_len    = 20

params = Params(vocab_size, embedding_dim, lstm_hidden_dim, number_of_tags, batch_size,natch_max_len)
params

Params(vocab_size=7047, embedding_dim=200, lstm_hidden_dim=100, number_of_tags=17, batch_size=5, batch_max_len=20)

In [86]:
net = Net(params)
net

Net(
  (embedding): Embedding(7047, 200)
  (lstm): LSTM(200, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=17, bias=True)
)

### Example in a minibatch

In [87]:
x = torch.autograd.Variable(torch.LongTensor([[10, 23, 506,123], 
                                              [10, 23, 0,0 ]]))
x

tensor([[ 10,  23, 506, 123],
        [ 10,  23,   0,   0]])

In [88]:
net.batch_max_len

20

In [89]:
# will not work because shapes are not correctly matched
net.forward(x).shape

RuntimeError: shape '[5, 20, 100]' is invalid for input of size 800

In [90]:
net.batch_size    = 2
net.batch_max_len = 4

In [91]:
net.forward(x).shape

torch.Size([2, 4, 17])

In [92]:
out = net.forward(x)

In [93]:
out.size()

torch.Size([2, 4, 17])

In [94]:
len(tag_to_pos)

17

Fantastic! We have now a model, but how do we pass data to the model?

In [95]:
# this does not work, not all elements in the minatch have the size number of words
x = torch.autograd.Variable(torch.LongTensor([[10, 23, 506,123], 
                                              [10, 23]]))

x.shape, net.embedding.forward(x).size()

ValueError: expected sequence of length 4 at dim 1 (got 2)

In [96]:
#this works though
x = torch.autograd.Variable(torch.LongTensor([[10, 23, 506,123], 
                                              [10, 23, 0,0 ]]))

x.shape, net.embedding.forward(x).size()

(torch.Size([2, 4]), torch.Size([2, 4, 200]))

In [97]:
net

Net(
  (embedding): Embedding(7047, 200)
  (lstm): LSTM(200, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=17, bias=True)
)

In [98]:
net.embedding.forward(x)

tensor([[[-1.0820, -0.3499, -0.2617,  ..., -1.8986, -0.6394, -1.3620],
         [-0.1558, -0.8640, -0.0093,  ...,  0.4334,  1.7331,  0.8474],
         [-1.5315,  0.5984,  0.3271,  ...,  0.0236, -0.6438,  1.7018],
         [ 1.6441,  1.1248,  0.5879,  ..., -0.3087,  0.9252, -0.1380]],

        [[-1.0820, -0.3499, -0.2617,  ..., -1.8986, -0.6394, -1.3620],
         [-0.1558, -0.8640, -0.0093,  ...,  0.4334,  1.7331,  0.8474],
         [-0.8533, -0.1075,  1.0569,  ..., -1.3179,  0.2390, -0.4962],
         [-0.8533, -0.1075,  1.0569,  ..., -1.3179,  0.2390, -0.4962]]],
       grad_fn=<EmbeddingBackward>)

In [99]:
x_embedded = net.embedding.forward(x)

In [100]:
lstm_out, _ = net.lstm.forward(x_embedded)

In [101]:
len(lstm_out)

2

In [102]:
# dim: batch_size x batch_max_len x lstm_hidden_dim
lstm_out.size()

torch.Size([2, 4, 100])

In [103]:
lstm_out.view(size =  (2,4, 100)).size()

torch.Size([2, 4, 100])

In [104]:
aux = lstm_out.contiguous()
aux.view(size =  (4,2,100)).size()

torch.Size([4, 2, 100])

### Computing the loss given the output


In [105]:
def loss_fn( outputs, labels):
    # reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    # mask out 'PAD' tokens
    mask = (labels >= 0).float()

    # the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).data[0])

    # pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]*mask

    # cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens
    

In [106]:
y.view(-1)

AttributeError: 'list' object has no attribute 'view'

In [107]:
y = torch.autograd.Variable(torch.LongTensor([[3, 6, 0, 0], 
                                              [1, 5, 0, 0]]))
y

tensor([[3, 6, 0, 0],
        [1, 5, 0, 0]])

In [108]:
# what is going on? Notice the dimension missmatch
loss_fn(out,y)

IndexError: invalid index of a 0-dim tensor. Use tensor.item() to convert a 0-dim tensor to a Python number

### Making an output batch

In [109]:
#y = torch.autograd.Variable(torch.LongTensor([[3, 6, 0, 0], 
#                                              [1, 5, 0, 0]]))

y = torch.LongTensor([[3, 6, 0, 0], [1, 5, 0, 0]])

In [110]:
out.size()

torch.Size([2, 4, 17])

In [111]:
y.size()

torch.Size([2, 4])

In [112]:
y

tensor([[3, 6, 0, 0],
        [1, 5, 0, 0]])

In [113]:
y_onehot = torch.zeros(y.size()[0], y.size()[1], len(tag_to_pos))

In [114]:
y_onehot[0]

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [115]:
y = torch.autograd.Variable(torch.LongTensor([[3, 6, 3, 0], 
                                              [1, 5, 0, 0]]))
y

tensor([[3, 6, 3, 0],
        [1, 5, 0, 0]])

In [116]:
y_onehot = torch.zeros(y.size()[0], y.size()[1], len(tag_to_pos))

for m,y_onehot_m in enumerate(y_onehot):
    for k,y_k in enumerate(y[m]):
        print("example in the batch {}, word position {}, tag value {}".format(m,k, int(y_k)))
        if int(y_k) == 0:
            y_onehot[m][k][0] = -1.
        else:
            y_onehot[m][k][int(y_k)] = 1.



example in the batch 0, word position 0, tag value 3
example in the batch 0, word position 1, tag value 6
example in the batch 0, word position 2, tag value 3
example in the batch 0, word position 3, tag value 0
example in the batch 1, word position 0, tag value 1
example in the batch 1, word position 1, tag value 5
example in the batch 1, word position 2, tag value 0
example in the batch 1, word position 3, tag value 0


In [117]:
y_onehot[0]

tensor([[ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [-1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.]])

In [118]:
y

tensor([[3, 6, 3, 0],
        [1, 5, 0, 0]])

In [119]:
# Compare this function with the one in the URL from standford
# Do you think this one makes sense?
def loss_fn( outputs, labels):
    
    # reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)  

    # mask out 'PAD' tokens
    mask = (labels == -1).float()

    # the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask))

    # pick the values corresponding to labels and multiply by mask
    outputs_vec = outputs.view(-1)
    outputs     = outputs_vec*mask

    # cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens
    

In [120]:
out.size(), y_onehot.size()

(torch.Size([2, 4, 17]), torch.Size([2, 4, 17]))

In [121]:
out = loss_fn(out,y_onehot)

In [122]:
out

tensor(1.4095, grad_fn=<DivBackward0>)

In [135]:
#x_ = torch.tensor(x)
output = loss_fn(net.forward(x), y_onehot)

In [136]:
output.backward()

In [137]:
net

Net(
  (embedding): Embedding(7047, 200)
  (lstm): LSTM(200, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=17, bias=True)
)

In [138]:
net.fc.weight

Parameter containing:
tensor([[ 0.0409, -0.0827, -0.0102,  ...,  0.0015,  0.0788,  0.0280],
        [-0.0629, -0.0377,  0.0444,  ...,  0.0102,  0.0899,  0.0045],
        [-0.0388,  0.0898, -0.0379,  ..., -0.0534, -0.0037,  0.0769],
        ...,
        [-0.0695, -0.0391, -0.0578,  ..., -0.0680, -0.0798,  0.0553],
        [ 0.0690,  0.0172, -0.0031,  ...,  0.0866,  0.0573, -0.0887],
        [ 0.0994, -0.0768, -0.0748,  ..., -0.0103,  0.0365, -0.0394]],
       requires_grad=True)

In [139]:
learning_rate = 0.1
# Apply gradients
for param in net.parameters():
    param.data.add_(-learning_rate * param.grad.data)

In [140]:
net.fc.weight

Parameter containing:
tensor([[ 0.0450, -0.0724, -0.0159,  ..., -0.0054,  0.0766,  0.0273],
        [-0.0629, -0.0377,  0.0444,  ...,  0.0102,  0.0899,  0.0045],
        [-0.0388,  0.0898, -0.0379,  ..., -0.0534, -0.0037,  0.0769],
        ...,
        [-0.0695, -0.0391, -0.0578,  ..., -0.0680, -0.0798,  0.0553],
        [ 0.0690,  0.0172, -0.0031,  ...,  0.0866,  0.0573, -0.0887],
        [ 0.0994, -0.0768, -0.0748,  ..., -0.0103,  0.0365, -0.0394]],
       requires_grad=True)

In [141]:
net.zero_grad()

### Using optimizers


You can also use optimizers build into pytorch
```
optimizer             = optim.SGD(network.parameters(), lr=learning_rate)
optimizer             = optim.Adam(network.parameters(), lr = learning_rate)
loss.backward()
optimizer.step()
optimizer.zero_grad()
```