In [24]:
import random
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext import data

from torchtext import datasets

Reference notebook (bentrevett/pytorch-sentiment-analysis, "1 - Simple Sentiment Analysis"): https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb

In [2]:
# Seed shared by the random-number generators used in this notebook.
SEED = 1234

# Seed PyTorch's generator (affects e.g. weight initialisation).
torch.manual_seed(SEED)
# CUDA-specific seeding is left disabled:
# torch.cuda.manual_seed(SEED)

# TEXT describes how the review text is processed; tokenisation is delegated to spaCy.
TEXT = data.Field(tokenize='spacy')
# LABEL describes the sentiment label; it is produced as a float tensor
# (presumably so it can be fed to the BCE-with-logits loss used later — confirm).
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [3]:
%%time
# Load the IMDB train and test splits, processed through the TEXT and LABEL fields.
# This is slow: the recorded run of this cell took several minutes of wall time.
train, test = datasets.IMDB.splits(TEXT, LABEL)

Wall time: 6min 46s


In [4]:
# Show which field object is attached to each attribute of the dataset.
print('train.fields:', train.fields)

train.fields: {'text': <torchtext.data.field.Field object at 0x000001E7B801B160>, 'label': <torchtext.data.field.LabelField object at 0x000001E7B801B240>}


In [5]:
# Dump the attributes of the first training example to see how a single example is stored.
# Note: this prints the whole token list of the review, so the output is long.
print('vars(train[0]):', vars(train[0]))

vars(train[0]): {'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A

In [6]:
# Carve a validation set out of the training data (default split ratio).
# `random.seed()` returns None, so its result must not be passed as `random_state`:
# seed the generator first, then hand its state to `split()` explicitly.
# NOTE(review): this cell rebinds `train`, so re-running it splits the already
# reduced training set again; a later cell also defines a function named `train`.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [7]:
# Report the number of examples in each split after the train/validation split.
print('len(train):', len(train))
print('len(test):', len(test))
print('len(valid):', len(valid))

len(train): 17500
len(test): 25000
len(valid): 7500


In [8]:
# Build both vocabularies from the training split only.
# max_size caps the number of tokens kept for TEXT; the reported length is
# slightly larger because special tokens (e.g. <unk>, <pad>) are added on top.
TEXT.build_vocab(train, max_size=25000)
LABEL.build_vocab(train)
print('len(TEXT.vocab):', len(TEXT.vocab))
print('len(LABEL.vocab):', len(LABEL.vocab))

len(TEXT.vocab): 25002
len(LABEL.vocab): 2


In [9]:
# Show the 20 most frequent tokens of the training text together with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 203563), (',', 192482), ('.', 165215), ('and', 109444), ('a', 109122), ('of', 100703), ('to', 93766), ('is', 76328), ('in', 61257), ('I', 54000), ('it', 53504), ('that', 49185), ('"', 44283), ("'s", 43315), ('this', 42438), ('-', 36691), ('/><br', 35752), ('was', 35033), ('as', 30386), ('with', 29774)]


In [10]:
# The vocabulary can also be inspected directly through its two lookup attributes:
# itos (int to string, indexable by token id) and stoi (string to int, a dict-like mapping).
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(<function _default_unk_index at 0x000001E7B7FE9158>, {'neg': 0, 'pos': 1})


In [11]:
"""
BucketIterator first sorts of the examples using the sort_key, here we use the length of the sentences, 
and then partitions them into buckets. When the iterator is called it returns a batch of examples from the same bucket. 
This will return a batch of examples where each example is a similar length, minimizing the amount of padding.
"""

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [12]:
class RNN(nn.Module):
    """Recurrent classifier: embedding lookup -> nn.RNN -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of values produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map token ids ``x`` of shape [sent len, batch size] to [batch size, output_dim]."""
        # token_vectors: [sent len, batch size, emb dim]
        token_vectors = self.embedding(x)

        # step_outputs: [sent len, batch size, hid dim]
        # final_state:  [1, batch size, hid dim]
        step_outputs, final_state = self.rnn(token_vectors)

        # drop the leading axis of the final state -> [batch size, hid dim]
        last_hidden = final_state.squeeze(0)

        # sanity check: the output at the last time step is the final hidden state
        assert torch.equal(step_outputs[-1, :, :], last_hidden)

        return self.fc(last_hidden)

In [13]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab) # dimensionality of one-hot vectors, i.e. the vocabulary size
EMBEDDING_DIM = 100 # dimensionality of word embeddings
HIDDEN_DIM = 256 # dimensionality of hidden states
OUTPUT_DIM = 1 # one output value per example (scalar class label)

# Instantiate the network with these sizes.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [14]:
# Update rule: plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3) # lr: learning rate

In [15]:
# Loss function: binary cross entropy computed directly on logits,
# so the model output is passed in without applying a sigmoid first.
criterion  = nn.BCEWithLogitsLoss()

In [16]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model and the loss module to that device.
model = model.to(device)
criterion = criterion.to(device)

In [17]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits, before any sigmoid), one per example.
        y: target labels (0. or 1.) with the same shape as ``preds``.

    Returns:
        A scalar tensor holding the fraction of predictions that match ``y``.
    """

    # squash the logits into (0, 1), then round to the closest integer (0 or 1);
    # torch.sigmoid is used because F.sigmoid is deprecated
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [18]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over every batch of ``iterator``.

    Returns:
        ``(mean loss, mean accuracy)``, both averaged over the number of batches.
    """
    # 'training mode' switches on layers such as dropout and batch normalization
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # model output is [batch size, 1]; drop the trailing axis -> [batch size]
        predictions = model(batch.text).squeeze(1)
        targets = batch.label

        # loss and accuracy for this batch
        loss = criterion(predictions, targets)
        acc = binary_accuracy(predictions, targets)

        # backpropagate, then let the optimizer apply the parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [19]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without any parameter update.

    Returns:
        ``(mean loss, mean accuracy)``, both averaged over the number of batches.
    """
    # 'evaluation mode' switches off layers such as dropout and batch normalization
    model.eval()

    total_loss = 0
    total_acc = 0

    # no gradients are needed for scoring; skipping them speeds up computation
    with torch.no_grad():
        for batch in iterator:
            # model output is [batch size, 1]; drop the trailing axis -> [batch size]
            predictions = model(batch.text).squeeze(1)
            targets = batch.label

            batch_loss = criterion(predictions, targets)
            batch_acc = binary_accuracy(predictions, targets)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [20]:
# Now we train the model through multiple epochs, each epoch being a complete pass through all examples in the split
# Number of complete passes over the training split.
N_EPOCHS = 5

# `epoch` is 0-based and stays defined after the loop; later cells use it to name checkpoints.
for epoch in range(N_EPOCHS):

    # one optimisation pass over the training split, then a scoring pass over the validation split
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # report the metrics with a 1-based epoch number
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.694, Train Acc: 50.26%, Val. Loss: 0.696, Val. Acc: 49.92%
Epoch: 02, Train Loss: 0.693, Train Acc: 49.92%, Val. Loss: 0.696, Val. Acc: 49.80%
Epoch: 03, Train Loss: 0.693, Train Acc: 50.21%, Val. Loss: 0.696, Val. Acc: 50.38%
Epoch: 04, Train Loss: 0.693, Train Acc: 49.83%, Val. Loss: 0.696, Val. Acc: 49.68%
Epoch: 05, Train Loss: 0.693, Train Acc: 50.10%, Val. Loss: 0.696, Val. Acc: 50.74%


In [21]:
# Score the trained model on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.710, Test Acc: 47.29%


In [25]:
# Root directory for saved models, relative to the current working directory.
modeldir = os.path.join('output', 'models')

In [30]:
# Save the model weights to use for inference later.
trainset = 'IMDB'
modeltype = 'simpleRNN'
# `epoch` is the 0-based index left over from the training loop.
netname = trainset + '-' + modeltype + '-epoch' + str(epoch) + '.pth'
# torch.save does not create missing directories, so make sure the target exists
# (otherwise this cell fails on a fresh checkout).
os.makedirs(os.path.join(modeldir, 'state_dict'), exist_ok=True)
torch.save(model.state_dict(), os.path.join(modeldir, 'state_dict', netname))

In [31]:
# Load the model. By default, it's loaded in training state, so change it to eval.
model2 = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model2.load_state_dict(torch.load(os.path.join(modeldir, 'state_dict' , netname)))
# Place the restored copy on the same device as the original model and the loss,
# so it can be evaluated on the same batches.
model2 = model2.to(device)
# Switch the restored model (model2, not the original `model`) to evaluation mode.
model2.eval()

RNN(
  (embedding): Embedding(25002, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [32]:
# Score the reloaded model on the test split; the numbers should match those of the original model.
test_loss2, test_acc2 = evaluate(model2, test_iterator, criterion)

print(f'Test Loss: {test_loss2:.3f}, Test Acc: {test_acc2*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.710, Test Acc: 47.29%


In [40]:
# Save the full model to resume training later, i.e. including state of optimizer, epochs, score, etc.
state = { 
    # 0-based index of the last epoch that finished training
    'epoch': epoch,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict()
}
netstatename = trainset + '-' + modeltype + '-epoch' + str(epoch) + '-fullstate.pth'
# torch.save does not create missing directories, so make sure the target exists
# (otherwise this cell fails on a fresh checkout).
os.makedirs(os.path.join(modeldir, 'full_state'), exist_ok=True)
torch.save(state, os.path.join(modeldir, 'full_state', netstatename))

In [43]:
# Load all the states to resume training
state3 = torch.load(os.path.join(modeldir, 'full_state', netstatename))

model3 = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model3.load_state_dict(state3['state_dict'])

optimizer3 = optim.SGD(model.parameters(), lr=1e-3)
optimizer3.load_state_dict(state['optimizer'])

epoch3 = state3['epoch']

In [53]:
N_EPOCHS3 = 6

for epoch in range(epoch3, N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 05, Train Loss: 0.693, Train Acc: 49.63%, Val. Loss: 0.696, Val. Acc: 49.59%
