In [32]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from nltk import word_tokenize

# Defining the NN Model

A GRU network whose last output is passed through a sigmoid, or through a feed-forward network (FNN) with a sigmoid, to classify sentiment.

In [35]:
# Hyperparameters
# NOTE(review): the embeddings referenced elsewhere in this notebook are
# 300-dimensional and TEXT is declared with fix_length=200 -- confirm that
# input_size = 28 and sequence_length = 28 are really the intended values.

# --- model architecture ---
num_classes = 2
hidden_size = 256
num_layers = 2
input_size = 28
sequence_length = 28

# --- training loop ---
num_epoch = 1
batch_size = 64
learning_rate = 1e-3


In [36]:
# GRU RNN

class GRU(nn.Module):
    """GRU-based sequence classifier.

    Runs a (possibly stacked) GRU over a batch of sequences and maps the GRU
    output at the last time step through a linear layer, producing one raw
    score per class. No sigmoid/softmax is applied inside the model.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        num_classes: Number of scores produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # Only the last time step is classified, so the linear layer takes
        # hidden_size inputs. (Classifying on all time steps instead would
        # need hidden_size * sequence_length inputs and a flattened GRU output.)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, num_classes) with raw (unnormalised) scores.
        """
        # Create the initial hidden state on the same device/dtype as the
        # input so the model also works after .to("cuda") or .double().
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )

        # out: (batch, seq_len, hidden_size). It must stay 3-D here; flattening
        # it before the slice below would make the last-step indexing fail.
        out, _ = self.gru(x, h0)

        # Keep only the output of the final time step.
        return self.fc(out[:, -1, :])

    # Backward-compatible alias for the original misspelled method name.
    foward = forward
    

# Loading and preprocessing the data

Using torchtext: 
- https://torchtext.readthedocs.io/en/latest/index.html
- https://galhever.medium.com/sentiment-analysis-with-pytorch-part-1-data-preprocessing-a51c80cc15fb
- https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

In [37]:
# Loading train data
# `load_train` is imported from the project-local `loader` module, whose source
# is not part of this notebook.
# NOTE(review): later cells index the result with "reviewText" and "sentiment";
# presumably it is a pandas DataFrame or a similar mapping -- confirm in loader.

from loader import load_train
train = load_train()

In [41]:
# Load embeddings
# Reads pretrained vectors in binary word2vec format from a path relative to the
# notebook's working directory and prints a message once loading has returned.
# NOTE(review): in the cells visible here `googEmbs` is only referenced by the
# disabled preprocessing draft; the vocab cell uses GloVe vectors instead.

import gensim.models

googEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                '../embeddings/googlenews.bin', binary=True)
print('loading finished')

loading finished


In [None]:
# NOTE(review): this cell is a single triple-quoted string, so nothing inside it
# runs and `preprocessing` is never defined. The draft is also incomplete: the
# `for sent in sentences:` loop has no body, and `sentence` is read without ever
# being assigned (the parameter is called `sentences`). Either finish it as a
# real function or delete the cell.
"""# Preprocessing helper function - called by torchtext "Field" class
## Gets word embeddings

def preprocessing(sentences): # Input sentence as string
    
    cleaned_sentence = []
    for sent in sentences:
        
    
    sentence = word_tokenize(sentence) # Preprocessing step: Tokenize
    
    emb_sent = []
    for word in sentence:
        try:
            emb_word = googEmbs[word]
        except KeyError:
            emb_word = np.zeros((300,))
        
        emb_sent.append(emb_word)
    
    return emb_sent # A list of word embeddings"""

In [56]:
# Preprocessing data
# Declares the torchtext (legacy API) fields used for the review text (TEXT)
# and for the sentiment label (LABEL).

from torchtext.legacy import data
import spacy 

# Presumably a one-time download of the spaCy model named below; left disabled.
#spacy.cli.download("en_core_web_sm")

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  # Custom preprocessing hook is disabled (its draft is not a live function).
                  #preprocessing=preprocessing,
                  fix_length=200, # TODO: should be max_document_length
                  lower=True
                 )
# Labels are declared with a float dtype.
LABEL = data.LabelField(dtype = torch.float)

In [57]:
# Formatting train_x and train_y
# Each review is run through TEXT.preprocess and the resulting list is handed to
# TEXT.pad in one step; each sentiment value is run through LABEL.preprocess.

train_x = TEXT.pad([TEXT.preprocess(review) for review in train["reviewText"]])
train_y = [LABEL.preprocess(sentiment) for sentiment in train["sentiment"]]

In [64]:
# Building vocab
# TEXT: vocabulary capped at 5000 entries, with pretrained GloVe vectors attached.
# The recorded run shows the vectors being downloaded into `.vector_cache/` as a
# 1.88 GB archive, so the first execution of this cell is slow.
# LABEL: built from the labels (train_y). Building it from train_x would fill
# the label vocabulary with review tokens instead of sentiment classes.

TEXT.build_vocab(train_x, max_size=5000, vectors='glove.42B.300d')
LABEL.build_vocab(train_y)
vocab_size = len(TEXT.vocab)

.vector_cache/glove.42B.300d.zip:  10%|█         | 190M/1.88G [04:37<41:03, 685kB/s]      


KeyboardInterrupt: 

In [59]:
# Short alias for the vocabulary object attached to the TEXT field.
vocab = TEXT.vocab

In [63]:
# Spot-check: display the vocabulary entry for the token "hope"
# (the recorded output for this cell is an integer).
vocab["hope"]

350