In [0]:
import torch

import time
import csv

Below are some helpers, and a few lines to nudge you to change your Colab runtime to GPU (just in case you hadn't already).

In [37]:
def tlog(msg):
    print('{}  {}'.format(time.asctime(), msg))


# If possible, we should be running on GPU
if not torch.cuda.is_available():
    print('If you are running this notebook in Colab, go to the Runtime menu and select "Change runtime type" to switch to GPU.')
else:
    print('GPU ready to go!')

GPU ready to go!


In [0]:
# indices
I_PHRASE_ID = 0
I_SENTENCE_ID = 1
I_PHRASE = 2
I_LABEL = 3
I_TOKEN_LIST = 4

# constants
NULL_TOKEN = '<NULLTOKEN>'
MAX_SENTENCE_LENGTH = 200


class RottenTomatoesDataset(torch.nn.Module):
    def __init__(self):
        self.classes = [0,1,2,3,4] # sentiment scores
        
        raw_rows = [] # raw input
        
        with open('train.tsv') as tsvfile:
            tlog('Loading training data...')
            reader = csv.reader(tsvfile, delimiter='\t')
            count = 0
            exceptions = 0
            max_sentence = 0
            for row in reader: # assuming sorted by sentenceid, phraseid
                if count > 0: # skip header
                    phraseID = int(row[I_PHRASE_ID])
                    sentenceID = int(row[I_SENTENCE_ID])
                    label = int(row[I_LABEL])
                    if phraseID > 0 and sentenceID > 0 and label >= 0:
                        row[I_PHRASE_ID] = phraseID
                        row[I_SENTENCE_ID] = sentenceID
                        row[I_LABEL] = label
                        raw_rows.append(row)
                        max_sentence = max(max_sentence, sentenceID)
                    else:
                        print('EXCEPTION')
                        print(row)
                        exceptions += 1
                count += 1
            
            
            # gather tokens
            (self.vocab_map, self.enriched_rows) = self.build_vocab_and_map_phrases(raw_rows)
            
            # break into training & validation
            tlog('Splitting training and validation sets...')
            i = 0
            while self.enriched_rows[i][I_SENTENCE_ID] < (max_sentence * 0.8):
                i += 1
            self.training_rows, self.validation_rows = self.enriched_rows[:i], self.enriched_rows[i:]

            # wrap it up
            self.training = True
            tlog('Finished loading training data:')
            tlog('  {} exceptions in {} rows ({} good records)'.format(exceptions, count, count - exceptions))
            tlog('  token count {}'.format(len(vocab)))

    # helpers
    def build_vocab_and_map_phrases(self, raw_rows):
        tlog('Building vocabulary...')
        vocab = set()
        last_sentence_parsed = 0
        for row in raw_rows:
            sent_id = row[I_SENTENCE_ID]
            if sent_id > last_sentence_parsed:
                tokens = row[I_PHRASE].split(' ')
                for token in tokens:
                    vocab.add(token) # make them unique
        vocab = list(vocab)
        vocab.append(NULL_TOKEN)
        vocab_map = {vocab[i]: i for i in range(len(vocab))}
        
        tlog('Mapping phrases to one-hot vectors...')
        
        enriched_rows = raw_rows
        for i, row in enumerate(enriched_rows):
            if i % 10000 == 0: tlog('  mapping row {} of {}'.format(i, len(raw_rows)))
            token_list = []
            tokens = row[I_PHRASE].split(' ')
            for token in tokens:
                if token in vocab:
                    token_list.append(vocab_map[token])
                else:
                    token_list.append(vocab_map[NULL_TOKEN])
            token_list = torch.tensor(token_list, dtype=torch.long)
            padded_token_list = torch.zeros(MAX_SENTENCE_LENGTH)
            padded_token_list[:len(token_list)] = token_list
            row.append(padded_token_list)
            
        tlog('Finished vocabulary mapping')
        return vocab_map, enriched_rows
    
    
    # two states, training and validation
    def train(self):
        self.training = True
    
    def validate(self):
        self.training = False
    
    def current_dataset(self):
        if self.training:
            return self.training_rows
        return self.validation_rows

    # the obligatory
    def __len__(self):
        return len(self.current_dataset())
    
    def __getitem__(self, idx):
        row = self.current_dataset()[idx]
        return row[I_TOKEN_LIST], row[I_LABEL]


I've set up a single dataset class that (crudely) splits the set between training and validation sets with a roughly 80/20 split - see the cell below for usage.

In [51]:
dataset = RottenTomatoesDataset()
dataset.train()
print(len(dataset))
dataset.validate()
print(len(dataset))

Wed Apr 24 05:27:52 2019  Loading training data...
Wed Apr 24 05:27:53 2019  Building vocabulary...
Wed Apr 24 05:27:53 2019  Mapping phrases to one-hot vectors...
Wed Apr 24 05:27:53 2019    mapping row 0 of 156060
Wed Apr 24 05:28:01 2019    mapping row 10000 of 156060
Wed Apr 24 05:28:09 2019    mapping row 20000 of 156060
Wed Apr 24 05:28:18 2019    mapping row 30000 of 156060
Wed Apr 24 05:28:28 2019    mapping row 40000 of 156060
Wed Apr 24 05:28:38 2019    mapping row 50000 of 156060
Wed Apr 24 05:28:47 2019    mapping row 60000 of 156060
Wed Apr 24 05:28:57 2019    mapping row 70000 of 156060
Wed Apr 24 05:29:07 2019    mapping row 80000 of 156060
Wed Apr 24 05:29:18 2019    mapping row 90000 of 156060
Wed Apr 24 05:29:28 2019    mapping row 100000 of 156060
Wed Apr 24 05:29:38 2019    mapping row 110000 of 156060
Wed Apr 24 05:29:49 2019    mapping row 120000 of 156060
Wed Apr 24 05:29:59 2019    mapping row 130000 of 156060
Wed Apr 24 05:30:09 2019    mapping row 140000 of 15

NameError: ignored

The simple RNN-based network below transforms the input as follows:

1) Initially, the input tensor has a one-hot vector with the same dimensionality as the dataset's total vocabulary. The Embedding layer converts this to a denser representation as a floating-point vector.

2) The RNN layer maintains a hidden state that allows it to capture context from short sequences.

3) Finally, the Linear layer classifies the phrase into one of our five sentiment classes (0-4).

In [46]:
for _, (sen, label) in enumerate(dataset):
    print(sen)
    break
    

tensor([ 6786.,  2956., 11505.,  3376.,   655.,  1990., 16465.,  3547.,  8038.,
         3766.,  4429.,   129.,  1046., 17724., 17706.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0., 

In [0]:
class SentimentSeeker(torch.nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_classes):
        super(SentimentSeeker, self).__init__()
        
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim)
        self.classifier = torch.nn.Linear(hidden_dim, n_classes)
        
    def forward(self, sen): # input vector of max sentence length containing one-hots
        embedded = self.embed(sen) # adds embedded dim
        output, hidden = self.rnn(embedded)
        return self.classfier(hidden.squeeze(0))

In [0]:
model = SentimentSeeker(len(dataset.vocab_map), 32, 128, len(dataset.classes))
# dataset.classes

In [0]:
# training constants
N_EPOCHS = 2
BATCH_SIZE = 4

dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)
model = SentimentSeeker(len(dataset.vocab_map), 32, 128, dataset.classes)

# training loop
for epoch in range(N_EPOCHS):
    tlog('Epoch {} of {}'.format(epoch, N_EPOCHS))
    
    for batch_idx, ((_, _, token_list), label) in enumerate(dataloader):
        sentences = 
