#### First notebook: 

- We are going to build a feed-forward neural network for dependency parsing

- This notebook is used for experimentation



In [4]:
"""
CS224N 2019-20: Homework 3
parser_model.py: Feed-Forward Neural Network for Dependency Parsing
Sahil Chopra <schopra8@stanford.edu>
Haoshen Hong <haoshen@stanford.edu>
"""
import pickle
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


In [293]:

        
class ParserModel(nn.Module):
    """Feed-forward neural network with an embedding layer and a single hidden layer.

    The ParserModel predicts which transition should be applied to a given
    partial parse configuration.

    Architecture (applied in this order by `forward`):
        embedding lookup -> flatten -> linear -> ReLU -> dropout -> linear (logits)

    PyTorch notes:
        - "ParserModel" is a subclass of "nn.Module", as every PyTorch network is.
        - "__init__" defines all layers and their parameters; it runs when an
          instance is created, e.g. "m = ParserModel(embeddings)".
        - Anything other methods need must be stored with the "self." prefix.
        - Further documentation: https://pytorch.org/docs/stable/nn.html
    """

    def __init__(self, embeddings, n_features=36,
                 hidden_size=200, n_classes=3, dropout_prob=0.5):
        """Initialize the parser model.

        @param embeddings (Tensor or ndarray): word embeddings (num_words, embedding_size)
        @param n_features (int): number of input features
        @param hidden_size (int): number of hidden units
        @param n_classes (int): number of output classes
        @param dropout_prob (float): dropout probability
        """
        super().__init__()
        self.n_features = n_features
        self.n_classes = n_classes
        self.dropout_prob = dropout_prob
        self.embed_size = embeddings.shape[1]
        self.hidden_size = hidden_size

        # Replace the randomly initialised embedding table with the supplied one.
        # `as_tensor(...).detach().clone()` accepts both ndarrays and Tensors and
        # always copies, so training never mutates the caller's data and no
        # "copy-construct from a tensor" warning is raised. The dtype is preserved.
        self.pretrained_embeddings = nn.Embedding(embeddings.shape[0], self.embed_size)
        self.pretrained_embeddings.weight = nn.Parameter(
            torch.as_tensor(embeddings).detach().clone()
        )

        # Hidden layer: the input is every feature's embedding laid side by side,
        # hence (n_features * embed_size) -> hidden_size.
        self.embed_to_hidden = nn.Linear(self.n_features * self.embed_size, self.hidden_size)
        # Xavier uniform initialisation (gain = 1) is applied to the weight only;
        # the bias keeps nn.Linear's default initialisation.
        nn.init.xavier_uniform_(self.embed_to_hidden.weight, gain=1)

        self.dropout = nn.Dropout(self.dropout_prob)

        # Output layer: hidden_size -> n_classes, same initialisation scheme.
        self.hidden_to_logits = nn.Linear(self.hidden_size, self.n_classes)
        nn.init.xavier_uniform_(self.hidden_to_logits.weight, gain=1)

    def embedding_lookup(self, t):
        """Map input tokens (integers) to their concatenated embedding vectors.

        @param t (Tensor): input tensor of tokens (batch_size, n_features)

        @return x (Tensor): tensor of embeddings for words represented in t
                            (batch_size, n_features * embed_size)
        @raises ValueError: if `t` is not a 2-D tensor
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if t.dim() != 2:
            raise ValueError(
                f"expected a 2-D tensor (batch_size, n_features), got shape {tuple(t.shape)}"
            )

        # (batch_size, n_features) -> (batch_size, n_features, embed_size)
        x = self.pretrained_embeddings(t)
        # Flatten the per-feature embeddings into one row per example:
        # (batch_size, n_features, embed_size) -> (batch_size, n_features * embed_size)
        return x.view(x.size(0), -1)

    def forward(self, t):
        """Run the model forward.

        Softmax is not applied here because it is included in the loss function
        nn.CrossEntropyLoss.

        PyTorch notes:
            - Applying the module to a tensor calls this function:
                  model = ParserModel(embeddings)
                  output = model(t)  # this calls the forward function
            - The intermediate results are stored on the instance
              (`embeddings`, `z`, `a`, `drop_a`, `z2`) so that they can be
              inspected after a forward pass.

        @param t (Tensor): input tensor of tokens (batch_size, n_features)

        @return logits (Tensor): tensor of predictions (output after applying the layers of the network)
                                 without applying softmax (batch_size, n_classes)
        """
        # 1) Token indices -> flattened embeddings.
        self.embeddings = self.embedding_lookup(t)

        # 2) Affine map to the hidden layer. The cast to float32 matches the
        #    default dtype of nn.Linear when the embedding table is float64.
        self.z = self.embed_to_hidden(self.embeddings.float())

        # 3) ReLU non-linearity gives the hidden units.
        self.a = F.relu(self.z)

        # 4) Dropout: in training mode it zeroes random activations and rescales
        #    the survivors; in eval mode it is the identity.
        self.drop_a = self.dropout(self.a)

        # 5) Affine map to unnormalised class scores.
        self.z2 = self.hidden_to_logits(self.drop_a)

        logits = self.z2
        return logits


In [294]:
# Sanity check for nn.Embedding: looking up indices [3, 4] returns one embedding
# row per index, and row 0 of that result is the same vector as a standalone
# lookup of index 3 (compared here via the sum of its elements).
# info: https://stackoverflow.com/questions/50747947/embedding-in-pytorch
embedding = nn.Embedding(1000,128)
test_case = embedding(torch.LongTensor([3,4]))
assert embedding(torch.LongTensor([3])).sum().item() == test_case[0].sum().item() # row 0 matches the lookup for index 3

In [295]:
# Two indices were looked up and each maps to a 128-dimensional embedding -> (2, 128)
test_case.size()

torch.Size([2, 128])

In [296]:
# Build a random 100 x 30 embedding matrix (float64). ParserModel uses these values
# to overwrite the randomly initialised weights of its nn.Embedding layer.
embeddings = np.random.random_sample((100, 30)).astype(dtype = 'float64')

# NOTE: the matrix stays float64 here; ParserModel.forward casts the looked-up
# embeddings with .float() before the first linear layer.

print(embeddings.shape) # 100 x 30 ---> 100 rows (one per word), each with 30 values
print(embeddings[1:2,:])

(100, 30)
[[0.42389749 0.91604898 0.7669048  0.74621228 0.79671346 0.17066938
  0.6062048  0.32924545 0.03539561 0.23327253 0.31414712 0.14020396
  0.59638958 0.155752   0.29738047 0.94842817 0.09997582 0.6856703
  0.06868365 0.33907189 0.70830388 0.64982319 0.1295096  0.42927301
  0.27109608 0.32810999 0.38914617 0.15096077 0.86548612 0.1461038 ]]


In [297]:
# Instantiate the parser with the random embeddings; every other constructor argument
# keeps its default (n_features=36, hidden_size=200, n_classes=3, dropout_prob=0.5).
model = ParserModel(embeddings)

In [298]:
# Inspect the hidden layer: in_features = n_features * embed_size = 36 * 30 = 1080
model.embed_to_hidden

Linear(in_features=1080, out_features=200, bias=True)

In [299]:
# Look up the 30-dimensional embedding stored for word index 3
model.pretrained_embeddings(torch.LongTensor([3]))

tensor([[0.8641, 0.9854, 0.7399, 0.8110, 0.4895, 0.0596, 0.2673, 0.4251, 0.5998,
         0.0398, 0.8565, 0.6324, 0.5488, 0.3934, 0.6756, 0.8744, 0.4459, 0.1679,
         0.8272, 0.1401, 0.1169, 0.8471, 0.7919, 0.7971, 0.1145, 0.8239, 0.9562,
         0.8513, 0.5928, 0.3438]], dtype=torch.float64,
       grad_fn=<EmbeddingBackward>)

In [300]:
# Full embedding table held by the model (an nn.Parameter, so it is trainable)
model.pretrained_embeddings.weight

Parameter containing:
tensor([[0.4203, 0.5863, 0.4584,  ..., 0.7357, 0.4661, 0.1332],
        [0.4239, 0.9160, 0.7669,  ..., 0.1510, 0.8655, 0.1461],
        [0.6994, 0.8283, 0.3989,  ..., 0.9111, 0.8091, 0.1646],
        ...,
        [0.0866, 0.1870, 0.9377,  ..., 0.8824, 0.6490, 0.4211],
        [0.3144, 0.9208, 0.4147,  ..., 0.7599, 0.9560, 0.9084],
        [0.3581, 0.1411, 0.7591,  ..., 0.4304, 0.4998, 0.9074]],
       dtype=torch.float64, requires_grad=True)

In [301]:
# Build a (4, 36) tensor of random word indices drawn from [0, 100)
inds = torch.randint(0, 100, (4, 36), dtype=torch.long)
print(f"We input a tensor of word indices of size: {inds.size()[0]} samples, and {inds.size()[1]} words each")
output = model.embedding_lookup(inds)
# Expected output shape: (batch, n_words * embed_dim)
print(f"Our output tensor is: {output.size()}") # each word becomes 30 features, so each row is one long vector of 36 * 30 = 1080 values

We input a tensor of word indices of size: 4 samples, and 36 words each
Our output tensor is: torch.Size([4, 1080])


In [302]:
# Call the module itself rather than `.forward(...)`: nn.Module.__call__ dispatches
# to `forward` and also runs any registered hooks, which a direct call skips.
model(inds)

In [303]:
# Confirm ReLU worked: negative pre-activations in `z` should be 0 in `a`,
# while positive values pass through unchanged.
print(f'Pre-relu output: {model.z[0][:20]}')
print(f'Post-relu output: {model.a[0][:20]}')

Pre-relu output: tensor([ 2.0081e-01,  3.6685e-01,  9.5539e-01,  1.4285e-02,  6.9633e-01,
         7.6659e-01, -5.8838e-02, -1.5154e-01, -2.1040e-02,  1.1938e+00,
        -6.4937e-03,  5.5015e-05, -8.2114e-01,  8.1163e-02,  5.3801e-01,
        -6.1641e-01,  7.8655e-01, -5.2092e-01, -1.6474e+00,  2.6407e-01],
       grad_fn=<SliceBackward>)
Post-relu output: tensor([2.0081e-01, 3.6685e-01, 9.5539e-01, 1.4285e-02, 6.9633e-01, 7.6659e-01,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1938e+00, 0.0000e+00, 5.5015e-05,
        0.0000e+00, 8.1163e-02, 5.3801e-01, 0.0000e+00, 7.8655e-01, 0.0000e+00,
        0.0000e+00, 2.6407e-01], grad_fn=<SliceBackward>)


In [304]:
model.drop_a[0][:20] # dropout (p = 0.5, training mode) zeroes random activations and scales the survivors by 1 / (1 - p) = 2

tensor([4.0162e-01, 7.3369e-01, 1.9108e+00, 2.8570e-02, 0.0000e+00, 1.5332e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1003e-04,
        0.0000e+00, 1.6233e-01, 0.0000e+00, 0.0000e+00, 1.5731e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00], grad_fn=<SliceBackward>)

In [305]:
# Understanding dropout: in training mode it randomly zeroes elements of its input
# (i.e. activations, not the layer weights).
# helpful: https://discuss.pytorch.org/t/unclear-behaviour-of-dropout/22890/2
# Surviving elements are scaled by 1 / (1 - p), so with p = 0.5 they are doubled.
inp = torch.tensor([1.,2.,3.,4.,5.,6.])
d2 = torch.nn.Dropout(0.5)
print(d2(inp)) # survivors are scaled 2x

d10 = torch.nn.Dropout(0.9) # p = 0.9 is the probability of an element being dropped, so survivors are scaled by 1 / (1 - 0.9) = 10
print(d10(inp)) # survivors are scaled 10x

tensor([ 0.,  0.,  0.,  0., 10., 12.])
tensor([ 0.,  0.,  0.,  0., 50.,  0.])


In [308]:
# Check the output of the final layer: the model was built with 3 classes and we passed in 4 observations,
# so the result should be a 4 x 3 tensor holding 3 scores (logits) per observation.
print(model.z2)

tensor([[-0.0161, -0.9356, -1.2237],
        [ 0.1997,  0.2710,  0.0748],
        [-0.6898,  0.5865,  0.8704],
        [-1.0014,  1.4029,  1.3682]], grad_fn=<AddmmBackward>)
