# Week 6: Feedforward neural networks

In [None]:
import nltk
from nltk.data import find
import gensim
from nltk.corpus import brown
import re
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

We are going to implement the 2-layered feedforward model from today's slides. Here is its architecture:

<img src="NN-2layer.png" width=500>

Fill in both the initialization function and the forward function to replicate this model's layers. You should only include the layers in the purple box, as the final sigmoid activation function will be included in the loss function defined below. Assume that the model has *input_dim* input features and that the second layer has *hidden_dim* features.

In [None]:
class Model(nn.Module):
    """Scaffold for the 2-layer feedforward network built in this exercise.

    The layer definitions and the forward computation are left as TO DO
    sections. ``forward`` returns ``z2``, so the TO DO code in ``forward``
    must assign that name. Per the exercise text, the final activation is
    not part of this module; it is handled by the loss function.
    """
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        """Record the layer sizes; the layers themselves go in the TO DO section.

        Args:
            input_dim: number of input features.
            hidden_dim: number of features in the second (hidden) layer.
            output_dim: size of the model output (presumably one score per
                output category; confirm against the loss being used).
        """
        super(Model, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        ## TO DO

    
        ##

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layers and return the pre-activation output ``z2``.

        The TO DO section must define ``z2``; as written, calling this method
        raises ``NameError`` because ``z2`` is never assigned.
        """
        ## TO DO

        
        ##
        return z2

Next, let's prepare our data. We are going to try to predict the POS tag of words in the news articles in the Brown corpus as a function of their word2vec embedding features. The first step is to get our list of (token, tag) pairs.

In [None]:
# (token, tag) pairs from the Brown corpus 'news' category. Tokens are
# lower-cased, and any token that does not start with a word character
# (e.g. punctuation) is dropped.
tagged_words = [
    (word.lower(), pos)
    for word, pos in brown.tagged_words(categories='news')
    if re.match(r'\w', word) is not None
]

In [None]:
# Display how many (token, tag) pairs survived the word-character filter.
len(tagged_words)

In [None]:
# Peek at a single (token, tag) pair to check the data format.
tagged_words[5]

We can now load in our pretrained word embeddings.

In [None]:
# Resolve the path of the pruned word2vec sample inside the NLTK data directory.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
# Load the vectors from the text (non-binary) word2vec format.
pretrained_embeddings = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

Let's filter our dataset to only include pairs where an embedding for the token exists.

In [None]:
# Keep only the pairs whose token has an entry in the pretrained embeddings.
tagged_dataset = [
    pair
    for pair in tagged_words
    if pair[0] in pretrained_embeddings
]

In [None]:
# Display how many pairs remain after dropping tokens without an embedding.
len(tagged_dataset)

Let's get the list of unique tags in our dataset which will serve as our output categories.

In [None]:
# Unique tags, sorted so that every tag gets the same index on every run.
# Iterating a plain set of strings yields an order that can change between
# interpreter sessions (string hashes are randomised), which would silently
# change the tag <-> index mapping used for the labels.
tag_list = sorted({tag for _, tag in tagged_dataset})
# Display the number of output categories.
len(tag_list)

Now let's create our PyTorch Dataset. You will need to complete the `__getitem__` function. Given an index, it should find the (token, tag) pair in the dataset at that index and return *inputs*, which should be a torch tensor of the embedding of the token, and the *label*, which should be a torch tensor of type long holding the tag's index in tag_list.

In [None]:
class TagDataset(Dataset):
    """Dataset scaffold that serves one (embedding, tag index) example per item.

    ``__getitem__`` is left as an exercise: its TO DO section must define
    ``inputs`` and ``label`` before they are returned.
    """
    def __init__(self, tagged_dataset, pretrained_embeddings, tag_list):
        """Keep references to the data sources; nothing is copied or precomputed.

        Args:
            tagged_dataset: sequence of (token, tag) pairs; its length is the
                dataset size.
            pretrained_embeddings: lookup from token to embedding vector.
            tag_list: list of the unique tags; per the exercise text, a tag's
                position in this list is its label index.
        """
        self.tagged_dataset = tagged_dataset
        self.pretrained_embeddings = pretrained_embeddings
        self.tag_list = tag_list

    def __getitem__(self, idx: int):
        """Return ``(inputs, label)`` for the pair stored at position ``idx``.

        Intended contract (from the exercise text): ``inputs`` is a torch
        tensor holding the token's embedding and ``label`` is a torch tensor
        of type long holding the tag's index in ``tag_list``. As written,
        both names are undefined until the TO DO section is completed.
        """
        ## TO DO

        
        ##
        return inputs, label

    def __len__(self) -> int:
        """Return the number of (token, tag) pairs in the dataset."""
        return len(self.tagged_dataset)



Let's create our dataset!

In [None]:
# Wrap the filtered pairs, the embeddings and the tag list in the Dataset defined above.
data = TagDataset(tagged_dataset, pretrained_embeddings, tag_list)

Let's now define our hyperparameters. Fill in the missing values.

In [None]:
# Model Hyperparameters
# input_dim: number of input features the model receives per example
# (to be filled in).
input_dim = ## TO DO ##
# hidden_dim: number of features in the model's second (hidden) layer.
hidden_dim = 250
# output_dim: size of the model's output per example (to be filled in).
output_dim = ## TO DO ##

# Training Hyperparameters
# epochs: number of full passes over the dataloader in the training loop.
epochs = 3
# batch_size: number of examples the dataloader groups into one batch.
batch_size = 32
# lr: learning rate handed to the optimizer.
lr = 0.001

We can now create our model and dataloader.

In [None]:
# Build the network from the hyperparameters defined above.
model = Model(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
# Serve the dataset in shuffled mini-batches of `batch_size` examples.
dataloader = DataLoader(dataset=data, batch_size=batch_size, shuffle=True)

Next we need to initialize our loss function and the optimizer we will use for backpropagation. Check out this page to see what nn.CrossEntropyLoss implements: <https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html>

In [None]:
# Loss that compares the model's raw scores (logits) with integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam optimizer over all model parameters, using the learning rate set above.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# (step, loss) pairs collected during training, used for plotting afterwards
loss_data = []
# running count of processed batches, starting at 1
step = 1
# put the model in training mode
model.train()
for epoch in range(epochs):
    for inputs, labels in dataloader:
        # reset gradients left over from the previous batch
        optimizer.zero_grad()
        # forward pass: raw scores for every example in the batch
        logits = model(inputs)
        # loss of the predictions against the gold labels
        loss = criterion(logits, labels)
        # backward pass: compute gradients of the loss w.r.t. the parameters
        loss.backward()
        # update the parameters from the freshly computed gradients
        optimizer.step()
        # record the batch loss as a plain float, then advance the step counter
        loss_data.append((step, loss.item()))
        step += 1

Finally, let's plot the loss to see what it does:

In [None]:
# loss_data holds (step, loss) pairs; zip(*...) transposes them into a
# sequence of steps and a sequence of losses, passed as the x and y values.
plt.plot(*zip(*loss_data))
plt.show()