In [94]:
import torch
import torch.nn as nn
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eppu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
# Load tokens (1000 most frequent words in reviews)
filename = "../data/tokens_list_1k.csv"
tokens = pd.read_csv(filename)
n_tokens = len(tokens)

# Store in Pandas series and index for fast checking of indices
ts = pd.Series(tokens["token"], tokens.index)
ts_i = pd.Index(ts)

In [82]:
ts_i.get_loc("the")

1

In [86]:
"""
To represent a single token, we use a “one-hot vector” of 
size <1 x n_tokens>. A one-hot vector is filled with 0s
except for a 1 at index of the current token
"""

# Find token index from tokens, e.g. "the" = 1
def token_to_index(token):
    try:
        return ts_i.get_loc(token)
    except:
        return -1

# Turn a token into a <1 x n_tokens> Tensor
def token_to_tensor(token):
    tensor = torch.zeros(1, n_tokens)
    i = token_to_index(token)
    if i >= 0:
        tensor[0][i] = 1
    return tensor

# Turn a review text into a <text_length x 1 x n_tokens> Tensor,
# or an array of one-hot token vectors
def text_to_tensor(text):
    text_tokens = word_tokenize(text)
    tensor = torch.zeros(len(text_tokens), 1, n_tokens)
    for i, token in enumerate(text_tokens):
        j = token_to_index(token)
        if i >= 0:
            tensor[i][0][j] = 1
    return tensor

In [81]:
token_to_tensor("the")

tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.,

In [91]:
text_to_tensor("the game is good.")

tensor([[[0., 1., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 0., 0.,  ..., 0., 0., 0.]]])

In [95]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_epochs = 2
batch_size = 100
learning_rate = 0.001

input_size = 1000
hidden_size = 256
num_layers = 1

In [102]:
# Recurrent neural network with one hidden layer
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        # Set initial hidden states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        # Input data and hidden state shape (n = text number of tokens)
        # x: (n, 1000), h0: (n, 256)
        
        # Forward propagate RNN
        out, _ = self.rnn(x, h0)
        
        # Output tensor shape
        # out: (n, 256)
        
        # Decode the hidden state of the last time step
        out = out[-1, :]
        # out: (1, 256)
         
        out = self.linear(out)
        # out: (1, 1)
        return out


In [103]:
# Create model
model = RNN(input_size, hidden_size, num_layers).to(device)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [104]:
model

RNN(
  (rnn): RNN(1000, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=1, bias=True)
)