In [1]:
%%capture
# Install the pre-built PyTorch 1.7.0a0 and nestedtensor development wheels used
# by the rest of this notebook. Per the wheel filenames these are CPython 3.6,
# linux x86_64 builds; %%capture hides pip's output.
# NOTE(review): the wheels are served from CI artifact URLs, which presumably
# expire -- confirm they still resolve before relying on this cell.
!pip install https://3388-217161669-gh.circle-artifacts.com/0/wheels/torch-1.7.0a0-cp36-cp36m-linux_x86_64.whl
!pip install https://3388-217161669-gh.circle-artifacts.com/0/wheels/nestedtensor-0.0.1.dev20201173-cp36-cp36m-linux_x86_64.whl

In [2]:
import re
import requests
import io
import tarfile
import csv
import torch
import torch.nn as nn
import random
import sys
import concurrent.futures
import time
from collections import Counter
from collections import namedtuple

import torch
import nestedtensor

# Address of the gzipped tar archive that get_data() downloads; its CSV members
# provide the 'train' and 'test' splits used below.
URL = "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz"

In [3]:
# One preprocessed sample: an integer class label plus the list of tokens
# that make up its text.
Point = namedtuple('Point', ['label', 'text'])

def get_data(URL):
    """Download a tar archive and return file objects for its CSV members.

    Args:
        URL: Address of the tar archive, fetched with ``requests.get``.

    Returns:
        dict mapping ``'train'`` / ``'test'`` to the binary file-like object
        returned by ``TarFile.extractfile``. A regular-file member whose name
        ends in ``csv`` is stored under ``'train'`` when its name contains
        ``'train'`` and under ``'test'`` otherwise; later members overwrite
        earlier ones that map to the same key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        tarfile.ReadError: If the downloaded payload is not a tar archive.
    """
    response = requests.get(URL)
    # Fail loudly on an HTTP error instead of handing an error page to
    # tarfile, which would only report that the payload is not a tar file.
    response.raise_for_status()
    # The archive is intentionally left open: the member file objects returned
    # below read their bytes from it on demand.
    archive = tarfile.open(fileobj=io.BytesIO(response.content))
    members = {}
    for member in archive.getmembers():
        if member.isfile() and member.name.endswith('csv'):
            split = 'train' if 'train' in member.name else 'test'
            members[split] = archive.extractfile(member)
    return members


def preprocess(iterator):
    """Turn raw CSV byte lines into ``Point`` records, skipping malformed lines.

    Each line is decoded as UTF-8, lower-cased, stripped of every character
    that is not an ASCII letter, digit, comma or whitespace, and split on
    commas. The first field is parsed as an integer and stored minus one as
    the label; the remaining fields are joined with spaces and split on
    whitespace to form the token list.

    Args:
        iterator: Iterable of ``bytes`` lines (e.g. a binary file object).

    Yields:
        Point: One record per line that has at least three comma-separated
        fields. Lines with fewer fields (including blank lines) are skipped
        rather than yielded as ``None``, so consumers can rely on every item
        having ``label`` and ``text``.

    Raises:
        UnicodeDecodeError: If a line is not valid UTF-8.
        ValueError: If the first field of an otherwise well-formed line is
            not an integer.
    """
    def _preprocess(line):
        cleaned = re.sub(r'[^0-9a-zA-Z,\s]', "", line.decode('UTF-8').lower())
        fields = cleaned.split(',')
        # Check the field count before parsing the label so that blank or
        # truncated lines are dropped instead of raising on int('').
        if len(fields) <= 2:
            return None
        label = int(fields[0]) - 1
        text = " ".join(fields[1:]).split()
        return Point(label=label, text=text)

    for line in iterator:
        point = _preprocess(line)
        if point is not None:
            yield point


def build_vocab(iterator):
    """Build a frequency-ranked token index and collect the distinct labels.

    Args:
        iterator: Iterable of records exposing ``text`` (an iterable of
            tokens) and ``label`` attributes.

    Returns:
        tuple: ``(vocab, labels)`` where ``vocab`` maps each token to its rank
        in ``Counter.most_common()`` order (0 is the most frequent token) and
        ``labels`` is the set of every label seen.
    """
    token_counts = Counter()
    seen_labels = set()
    for sample in iterator:
        token_counts.update(sample.text)
        seen_labels.add(sample.label)

    ranked_tokens = token_counts.most_common()
    vocab = {token: rank for rank, (token, _) in enumerate(ranked_tokens)}
    return vocab, seen_labels

In [4]:
# Download the archive and materialise every split as a list of Point records.
data = dict(
    (split, list(preprocess(handle)))
    for split, handle in get_data(URL).items()
)

# The vocabulary and label set come from the training split only.
vocab, labels = build_vocab(data['train'])
# Vocabulary ids run from 0 to len(vocab) - 1, so the next id is free to stand
# for tokens that are not in the vocabulary.
UNK = len(vocab)

In [5]:
class TextSentiment(nn.Module):
    """Bag-of-embeddings classifier: an ``nn.EmbeddingBag`` feeding a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Width of each embedding vector.
        num_class: Number of scores produced by the linear layer.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        # Embedding first, then the linear layer, so the random draws happen
        # in a fixed order.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        """Pool the embeddings of each bag in ``text`` and map them to class scores."""
        pooled = self.embedding(text)
        return self.fc(pooled)

In [6]:
# Embedding width shared by the embedding table and the linear layer.
embed_dim = 10
# One embedding row beyond the vocabulary is allocated for the UNK id
# (UNK == len(vocab)).
model = TextSentiment(
    vocab_size=len(vocab) + 1,
    embed_dim=embed_dim,
    num_class=len(labels),
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
# With step_size=1, every scheduler.step() call multiplies the learning rate by gamma.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [7]:
def create_batch(data):
    """Encode tokenised texts as a nested tensor of int64 vocabulary ids.

    Args:
        data: Iterable of token sequences, one per sample.

    Returns:
        The result of ``nestedtensor.nested_tensor`` over one id tensor per
        sample. Tokens missing from the module-level ``vocab`` are encoded as
        the module-level ``UNK`` id.
    """
    encoded = []
    for tokens in data:
        ids = [vocab.get(token, UNK) for token in tokens]
        encoded.append(torch.tensor(ids))
    return nestedtensor.nested_tensor(encoded, dtype=torch.int64)

def yield_data(data, max_tokens=4000):
    """Shuffle ``data`` in place and yield it as token-budgeted mini-batches.

    Samples are accumulated until the running batch holds ``max_tokens``
    tokens or more; the batch is then emitted and the next sample starts a
    new one. This gives variable batch sizes with roughly constant memory
    pressure. A batch may exceed the budget by up to one sample's length.
    Every sample is emitted exactly once, including those in the final,
    possibly smaller, batch.

    Args:
        data: Mutable sequence of records exposing ``label`` and ``text``
            (a sequence of tokens). NOTE: it is shuffled in place.
        max_tokens: Token count at which a batch is considered full.

    Yields:
        tuple: ``(labels, batch)`` where ``labels`` is an int64 nested tensor
        built from the batch's labels and ``batch`` is ``create_batch`` applied
        to the batch's token lists.
    """
    def _pack(batch_labels, batch_texts):
        # Convert the accumulated Python lists into the pair the loops consume.
        return (
            nestedtensor.nested_tensor(batch_labels, dtype=torch.int64),
            create_batch(batch_texts),
        )

    random.shuffle(data)
    labels = []
    batch_data = []
    num_tokens = 0  # running total, avoids re-summing the batch per sample
    for point in data:
        # Emit the full batch *before* handling the current sample, so that
        # the sample which arrives when the batch is full is not discarded.
        if batch_data and num_tokens >= max_tokens:
            yield _pack(labels, batch_data)
            labels = []
            batch_data = []
            num_tokens = 0
        labels.append(torch.tensor(point.label))
        batch_data.append(point.text)
        num_tokens += len(point.text)
    # Flush whatever is left so the tail of the data set is not lost.
    if batch_data:
        yield _pack(labels, batch_data)

In [8]:
# Number of training tokens to process between two learning-rate decay steps.
TOKENS_PER_LR_STEP = 1000000

num_tokens = sum(len(point.text) for point in data['train'])
print("Total number of tokens: {}".format(num_tokens))
for epoch in range(2):
    tokens_since_decay = 0
    t0 = time.time()
    # The loop variable is deliberately NOT called `labels`: that name is the
    # global set of class labels used to size the model, and overwriting it
    # here would break a later re-run of the model-construction cell.
    for batch_labels, batch in yield_data(data['train']):
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch_labels).mean()
        loss.backward()
        optimizer.step()
        # "\r" rewrites the same stderr line, giving a single live status line
        # per epoch.
        sys.stderr.write(
            "\rtime: {:3.0f}s epoch: {:3.0f} lr: {:3.6f} loss: {:3.6f}".format(
                time.time() - t0,
                epoch,
                scheduler.get_last_lr()[0],
                loss,
            )
        )
        sys.stderr.flush()
        # Decay the learning rate by token count rather than by batch count,
        # because batches vary in size.
        tokens_since_decay += batch.numel()
        if tokens_since_decay > TOKENS_PER_LR_STEP:
            scheduler.step()
            tokens_since_decay = 0
    sys.stderr.write('\n')

Total number of tokens: 27205880


time: 372s epoch:   0 lr: 0.263520 loss: 0.244233
time: 376s epoch:   1 lr: 0.069443 loss: 0.343512


In [9]:
num_correct = 0
total_num = 0
# Evaluation only needs forward passes; no_grad() stops autograd from
# recording them, which saves memory and time.
with torch.no_grad():
    for batch_labels, batch in yield_data(data['test']):
        # Predicted class = index of the highest score for each sample.
        output = model(batch).to_tensor().argmax(1)
        num_correct += (batch_labels.to_tensor() == output).sum().item()
        total_num += len(output)

print("Test accuracy: {}".format(float(num_correct) / float(total_num)))

Test accuracy: 0.9362489876200394
