In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy

from torchtext.datasets import IMDB, AG_NEWS


In [2]:

# Fetch the AG_NEWS training and test splits.
train, test = AG_NEWS(split=("train", "test"))

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence


# spaCy-backed tokenizers: one using the English model, one using the German model.
tokenizer_en = get_tokenizer("spacy", language="en_core_web_sm")
tokenizer_de = get_tokenizer("spacy", language="de_core_news_sm")

In [4]:
def yield_tokens(data_iter, tokenizer=None):
    """Yield one tokenized text per example in ``data_iter``.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs; the label is ignored.
        tokenizer: Callable applied to each text. When omitted, the
            module-level ``tokenizer_en`` is used, which matches the original
            behavior of this function.

    Yields:
        Whatever the tokenizer returns for each text, in iteration order.
    """
    if tokenizer is None:
        # Resolved lazily so existing one-argument callers keep working.
        tokenizer = tokenizer_en
    for _, text in data_iter:
        yield tokenizer(text)


In [5]:
# Build the vocabulary from the tokenized training split, with a minimum
# token frequency of 2 and four explicitly listed special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train),
    min_freq=2,
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
)

In [6]:
# Register the '<unk>' index as the vocabulary's default index
# (presumably what out-of-vocabulary tokens resolve to; see torchtext docs).
vocab.set_default_index(vocab["<unk>"])

In [7]:
# Number of entries in the vocabulary.
len(vocab)

62544

In [8]:
# Quick check: look up the indices of a few sample tokens.
vocab(["here", "is", "the", "example"])


[540, 27, 4, 6113]

In [9]:
# Indices of the special tokens in the vocabulary.
# (A stray lookup whose result was discarded has been removed from this cell:
# it was not the last expression, so it neither displayed nor stored anything.)
PAD_IDX = vocab["<pad>"]
SOS_IDX = vocab["<sos>"]
EOS_IDX = vocab["<eos>"]

print(PAD_IDX, SOS_IDX, EOS_IDX)

1 2 3


In [10]:

from torch.utils.data import DataLoader
# Pick the compute device. GPU index 1 is preferred (as in the original code),
# but only when at least two GPUs are visible; otherwise fall back to GPU 0,
# and to the CPU when CUDA is unavailable. Previously "cuda:1" was selected
# whenever CUDA was available, which names a device that does not exist on
# single-GPU machines.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
else:
    device = torch.device("cuda:0")

# Keep the original (misspelled) name so existing references still work.
devce = device

In [11]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` with ``tokenizer_en`` and return the
    result of looking the tokens up in ``vocab``."""
    return vocab(tokenizer_en(x))


def label_pipeline(x):
    """Convert the label ``x`` to ``int`` and subtract one.

    Presumably this maps 1-based dataset labels to 0-based class ids.
    """
    return int(x) - 1

In [12]:
def collate_batch(batch):
    """Collate a list of ``(label, text)`` samples into batch tensors.

    Each text is converted to indices with ``text_pipeline`` and framed with
    ``SOS_IDX`` at the start and ``EOS_IDX`` at the end. The resulting
    sequences are padded with ``PAD_IDX`` to the longest one in the batch.

    Returns:
        A ``(labels, texts)`` tuple of int64 tensors. ``labels`` holds one
        entry per sample; ``texts`` is the output of ``pad_sequence`` with its
        default layout, i.e. one column per sample.
    """
    labels, sequences = [], []
    for raw_label, raw_text in batch:
        labels.append(label_pipeline(raw_label))
        # Build the framed index list first, then materialize a single tensor.
        token_ids = [SOS_IDX, *text_pipeline(raw_text), EOS_IDX]
        sequences.append(torch.tensor(token_ids, dtype=torch.int64))
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    padded_texts = pad_sequence(sequences, padding_value=PAD_IDX)
    return label_tensor, padded_texts

In [13]:


# Scratch check: concatenating a 1-element, a 10-element and another
# 1-element 1-D tensor along dim 0 yields 12 elements.
a, c = torch.tensor([2]), torch.tensor([3])
b = torch.rand(10)
torch.cat((a, b, c), dim=0).shape

torch.Size([12])

In [14]:
# Re-create the dataset splits, then wrap the training split in a DataLoader
# that yields batches of 8 samples, unshuffled, collated by collate_batch.
train, test = AG_NEWS(split=("train", "test"))
train_loader = DataLoader(
    train,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

In [15]:
# Pull the first batch from the loader to inspect the collated output.
next(iter(train_loader))

(tensor([2, 2, 2, 2, 2, 2, 2, 2]),
 tensor([[    2,     2,     2,     2,     2,     2,     2,     2],
         [  448, 17833,   168,    76,   168,   350,  2662,  1384],
         [  574,  5254,    12, 13733,   110,  1307,  4300,  1280],
         [ 2060,  6389,  2766,   168,  7054,   325, 12847,   600],
         [50439,  7996, 12832,  9297,     7,     5,    11, 17529],
         [ 1199, 10809,   350,    34,   132,   378,  3789,    46],
         [ 1721,    17,    78, 12877,     8,  1964,  1906,  1661],
         [    4,    33,  1956,  1592,    96,  1126,    17,    17],
         [ 1383,    18,    17,  8215,   151,  7232,    36,  2254],
         [   17,    33,    33,    17,     5,    17,    18,    18],
         [   33,     8,    18,    33,  8588,    33,    36,  2254],
         [   18,  5027,    33,    18,    45,    18,     8,     8],
         [   33,   932,     8,    33, 15532,    33, 10133,  3243],
         [    8,   357, 10361,     8,     7,     8,    10,   165],
         [ 4036, 17833,   7