In [0]:
# Environment setup: install PyTorch, torchtext and the spaCy English model
# into the runtime. The installs are unpinned; the recorded output of this
# cell shows torch 1.0.0 and torchtext 0.3.1 being installed.
# NOTE(review): the rest of the notebook uses torchtext's data.Field /
# data.BucketIterator API - confirm it exists in whatever version an
# unpinned install resolves to today, or pin the versions above.
!pip install torch
!pip install torchtext
!python -m spacy download en


# Author's runtime note: K80 GPU, 12-hour session
# (presumably a hosted notebook runtime - TODO confirm).
import torch
from torch import nn, optim
from torchtext import data, datasets

# Report whether PyTorch can see a CUDA device.
print('GPU:', torch.cuda.is_available())

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/7e/60/66415660aa46b23b5e1b72bc762e816736ce8d7260213e22365af51e8f9c/torch-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (591.8MB)
[K    100% |████████████████████████████████| 591.8MB 28kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x61892000 @  0x7f9c7645b2a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
[?25hInstalling collected packages: torch
Successfully installed torch-1.0.0
Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/c6/bc/b28b9efb4653c03e597ed207264eea45862b5260f48e9f010b5068d64db1/torchtext-0.3.1-py3-none-any.whl (62kB)
[K    100% |████████████████████████████████| 71kB 2.7MB/s 
Installing collected packages: torchtext
Successfully installed torcht

In [0]:
# Seed PyTorch's random number generator for repeatable runs.
torch.manual_seed(123)

# TEXT describes the review text (tokenised with spaCy); LABEL describes the
# sentiment label, stored as a float because the loss used later in this
# notebook (BCEWithLogitsLoss) takes float targets.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
# Load the IMDB train/test splits; the recorded output shows the
# aclImdb_v1.tar.gz archive being downloaded when this cell was run.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.4MB/s]


In [0]:
# Sanity check: number of examples in each split
# (the recorded run shows 25000 train / 25000 test).
print('len of train data:', len(train_data))
print('len of test data:', len(test_data))

len of train data: 25000
len of test data: 25000


In [0]:
# Inspect one tokenised review and its label. The recorded output shows that
# HTML remnants such as '<br' and '/>' survive tokenisation, i.e. the raw
# text is not cleaned before it is tokenised.
print(train_data.examples[15].text)
print(train_data.examples[15].label)

['I', 'loved', 'this', 'film', '.', 'I', 'thought', 'it', 'would', 'be', 'easy', 'to', 'watch', ',', 'and', 'easy', 'to', 'forget', '.', 'I', 'ran', 'out', 'after', 'watching', 'this', 'to', 'buy', 'the', 'DVD', ',', 'obv', 'not', 'easily', 'forgotten!<br', '/><br', '/>The', 'script', 'is', 'brilliant', ',', 'and', 'the', 'casting', 'could', "n't", 'be', 'more', 'perfect', '.', 'Each', 'character', 'has', 'their', 'moment', ',', 'and', 'I', 'laughed', 'hard', 'throughout', 'this', 'film', ',', 'comedic', 'timing', 'was', 'spot', '-', 'on.<br', '/><br', '/', '>']
pos


In [0]:
# Build the vocabularies from the training split only.
# The text vocabulary is capped at the 10,000 most frequent tokens (the
# resulting embedding table has 10,002 rows, i.e. two extra special tokens)
# and is paired with pretrained 100-d GloVe 6B vectors (word2vec would be the
# alternative).
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)


batchsz = 100
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when tensors are moved to 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; batches are created directly on `device`.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batchsz,
    device=device
)

.vector_cache/glove.6B.zip: 862MB [01:10, 12.1MB/s]                           
100%|█████████▉| 399630/400000 [00:21<00:00, 19336.50it/s]

In [0]:
class RNN(nn.Module):
    """Bidirectional, stacked LSTM that maps a token-id sequence to one logit.

    Pipeline: token ids -> embedding -> dropout -> bidirectional LSTM ->
    concatenation of the top layer's final forward and backward hidden
    states -> dropout -> linear layer producing a single raw logit per
    sequence (no sigmoid is applied here).

    NOTE(review): the LSTM is fed the tensor as-is (no lengths / packed
    sequences), so if batches are padded, padded positions are processed
    like real tokens.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers=2, dropout=0.5):
        """
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers (default 2).
        dropout: dropout probability used on the embeddings, between LSTM
            layers and on the pooled hidden state (default 0.5).
        """
        super(RNN, self).__init__()

        # token id => [embedding_dim]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # [embedding_dim] => [hidden_dim] per direction.
        # nn.LSTM only applies dropout *between* stacked layers and warns when
        # a non-zero value is given for a single layer, so pass 0 in that case.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                           bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0.0)
        # [hidden_dim*2] => [1]
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: integer token ids shaped [seq_len, b] (sequence-first, unlike an
           image batch such as [b, 3, 28, 28]).
        returns: raw logits shaped [b, 1].
        """
        # [seq, b] => [seq, b, embedding_dim]
        embedded = self.dropout(self.embedding(x))

        # output (unused): [seq, b, hid_dim*2]
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c (unused): [num_layers*2, b, hid_dim]
        _, (hidden, _) = self.rnn(embedded)

        # The last two entries of `hidden` are the top layer's forward and
        # backward final states: 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        return self.fc(self.dropout(hidden))

In [0]:
# Model: embedding size 100 (matches the 100-d GloVe vectors), hidden size 256.
rnn = RNN(len(TEXT.vocab), 100, 256)

# Initialise the embedding table from the pretrained vectors attached to the
# vocabulary. The copy is done under no_grad so autograd does not track it.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)
with torch.no_grad():
    rnn.embedding.weight.copy_(pretrained_embedding)
print('embedding layer inited.')

# Move the model to the target device *before* building the optimizer, as the
# PyTorch optimizer documentation recommends, so the optimizer is constructed
# over the parameters that will actually be trained.
rnn.to(device)
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
# Binary cross-entropy on raw logits (the model applies no sigmoid itself).
criteon = nn.BCEWithLogitsLoss().to(device)

# Last expression: display the model summary as the cell output.
rnn


pretrained_embedding: torch.Size([10002, 100])
embedding layer inited.


RNN(
  (embedding): Embedding(10002, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [0]:
import numpy as np

def binary_acc(preds, y):
    """
    Accuracy of binary predictions given as raw logits.

    preds: logits; passed through a sigmoid and rounded to 0/1.
    y: targets, compared element-wise with the rounded predictions.
    returns: 0-dim tensor, matches divided by len() of the match tensor
             (i.e. the size of its first dimension).
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = torch.eq(hard_preds, y).float()
    return hits.sum() / len(hits)

def train(rnn, iterator, optimizer, criteon):
    """
    Run one pass over `iterator`, updating `rnn` after every batch.

    rnn: model called on batch.text; its output is squeezed on dim 1.
    iterator: yields batches exposing .text and .label.
    optimizer: stepped once per batch.
    criteon: loss called as criteon(predictions, labels).

    Prints the batch accuracy every 10th batch and, at the end, the
    unweighted mean of the per-batch accuracies. Returns nothing.
    """
    rnn.train()
    batch_accs = []

    for step, batch in enumerate(iterator):
        labels = batch.label

        # [seq, b] => [b, 1] => [b]
        logits = rnn(batch.text).squeeze(1)
        loss = criteon(logits, labels)

        step_acc = binary_acc(logits, labels).item()
        batch_accs.append(step_acc)

        # standard update: clear old gradients, backprop, apply the step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 10 == 0:
            print(step, step_acc)

    print('avg acc:', np.array(batch_accs).mean())
    
    
def eval(rnn, iterator, criteon):
    """
    Measure accuracy of `rnn` over `iterator` without updating it.

    The model is switched to eval mode and gradients are disabled. The
    unweighted mean of the per-batch accuracies is printed with the
    '>>test:' prefix. Returns nothing.

    Note: this function's name shadows Python's builtin eval() within
    the notebook.
    """
    rnn.eval()
    batch_accs = []

    with torch.no_grad():
        for batch in iterator:
            labels = batch.label

            # [b, 1] => [b]
            logits = rnn(batch.text).squeeze(1)

            # the loss is evaluated here but its value is not reported
            loss = criteon(logits, labels)

            batch_accs.append(binary_acc(logits, labels).item())

    print('>>test:', np.array(batch_accs).mean())
        
    
    

In [0]:
# Number of passes over the training split.
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):

    # Evaluate first, so each '>>test:' line reflects the weights *before*
    # the training pass that follows it.
    eval(rnn, test_iterator, criteon)
    train(rnn, train_iterator, optimizer, criteon)

# The loop only evaluates before each pass, so without this call the weights
# produced by the final training pass would never be scored.
eval(rnn, test_iterator, criteon)

>>test: 0.7121199841499328
0 0.7400000095367432
10 0.7799999713897705
20 0.7299999594688416
30 0.699999988079071
40 0.7699999809265137
50 0.7299999594688416
60 0.7599999904632568
70 0.6899999976158142
80 0.75
90 0.699999988079071
100 0.6699999570846558
110 0.7400000095367432
120 0.7899999618530273
130 0.8199999928474426
140 0.85999995470047
150 0.7899999618530273
160 0.8299999833106995
170 0.8299999833106995
180 0.8199999928474426
190 0.8700000047683716
200 0.8499999642372131
210 0.7899999618530273
220 0.9099999666213989
230 0.8299999833106995
240 0.8399999737739563
avg acc: 0.8027199811935425
>>test: 0.8592799797058105
0 0.7999999523162842
10 0.8799999952316284
20 0.8999999761581421
30 0.85999995470047
40 0.8799999952316284
50 0.8899999856948853
60 0.8199999928474426
70 0.7899999618530273
80 0.9099999666213989
90 0.8499999642372131
100 0.8799999952316284
110 0.85999995470047
120 0.8899999856948853
130 0.8700000047683716
140 0.8899999856948853
150 0.85999995470047
160 0.889999985694885