In [0]:
import torch

import time
import csv

Below are some helpers, and a few lines to nudge you to change your Colab runtime to GPU (just in case you hadn't already).

In [46]:
def tlog(msg):
    print('{}  {}'.format(time.asctime(), msg))


# If possible, we should be running on GPU
if not torch.cuda.is_available():
    print('If you are running this notebook in Colab, go to the Runtime menu and select "Change runtime type" to switch to GPU.')
else:
    print('GPU ready to go!')

GPU ready to go!


In [0]:
# indices
I_PHRASE_ID = 0
I_SENTENCE_ID = 1
I_PHRASE = 2
I_LABEL = 3
I_TOKEN_LIST = 4

class RottenTomatoesDataset(torch.nn.Module):
    def __init__(self):
        raw_rows = [] # raw input
        vocab = set() # build vocabulary for embeddings
        
        with open('train.tsv') as tsvfile:
            tlog('Loading training data...')
            reader = csv.reader(tsvfile, delimiter='\t')
            count = 0
            exceptions = 0
            max_sentence = 0
            for row in reader: # assuming sorted by sentenceid, phraseid
                if count > 0: # skip header
                    phraseID = int(row[I_PHRASE_ID])
                    sentenceID = int(row[I_SENTENCE_ID])
                    label = int(row[I_LABEL])
                    if phraseID > 0 and sentenceID > 0 and label >= 0:
                        row[I_PHRASE_ID] = phraseID
                        row[I_SENTENCE_ID] = sentenceID
                        row[I_LABEL] = label
                        raw_rows.append(row)
                        max_sentence = max(max_sentence, sentenceID)
                    else:
                        print('EXCEPTION')
                        print(row)
                        exceptions += 1
                count += 1
            
            # break into training & validation
            tlog('Splitting training and validation sets...')
            i = 0
            while raw_rows[i][I_SENTENCE_ID] < (max_sentence * 0.8):
                i += 1
            self.training_rows, self.validation_rows = raw_rows[:i], raw_rows[i:]
            
            # gather tokens and set up embedding
            last_sentence_parsed = 0
            for row in raw_rows:
                sent_id = row[I_SENTENCE_ID]
                if sent_id > last_sentence_parsed:
                    tokens = row[I_PHRASE].split(' ')
                    for token in tokens:
                        vocab.add(token) # make them unique
            vocab = list(vocab)

            # wrap it up
            self.training = True
            tlog('Finished loading training data:')
            tlog('  {} exceptions in {} rows ({} good records)'.format(exceptions, count, count - exceptions))
            tlog('  token count {}'.format(len(vocab)))
    
    def train(self):
        self.training = True
    
    def validate(self):
        self.training = False
    
    def current_dataset(self):
        if self.training:
            return self.training_rows
        return self.validation_rows

    def __len__(self):
        return len(self.current_dataset())
    
    def __getitem__(self, idx):
        row = self.current_dataset()[idx]
        return (row[I_PHRASE_ID], row[I_SENTENCE_ID], row[I_PHRASE]), row[I_LABEL]


I've set up a single dataset class that (crudely) splits the set between training and validation sets with a roughly 80/20 split - see the cell below for usage.

In [60]:
dataset = RottenTomatoesDataset()
dataset.train()
print(len(dataset))
dataset.validate()
print(len(dataset))

Wed Apr 24 00:48:35 2019  Loading training data...
Wed Apr 24 00:48:35 2019  Splitting training and validation sets...
Wed Apr 24 00:48:35 2019  Finished loading training data:
Wed Apr 24 00:48:35 2019    0 exceptions in 156061 rows (156061 good records)
Wed Apr 24 00:48:35 2019    token count 18227
127101
28959


In [0]:
# model

In [0]:
# training loop