In [1]:
import random

import torch
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator


# Options common to both language fields: spaCy tokenization, lower-casing,
# <sos>/<eos> markers around every sentence, and batch-first tensors.
_SHARED_FIELD_OPTIONS = {
    "tokenize": "spacy",
    "init_token": "<sos>",
    "eos_token": "<eos>",
    "lower": True,
    "batch_first": True,
}

# The source ("de") and target ("en") fields differ only in tokenizer language.
SRC = Field(tokenizer_language="de", **_SHARED_FIELD_OPTIONS)
TRG = Field(tokenizer_language="en", **_SHARED_FIELD_OPTIONS)

# Load the train/validation/test splits of Multi30k; the '.de' side is
# processed with SRC and the '.en' side with TRG.
train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.de', '.en'),
)

# Build each vocabulary from the training split only, with min_freq=2 as the
# minimum token frequency.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

# Seed the RNGs *before* the iterators are created. torchtext's shuffler
# captures Python's `random` state when an iterator is constructed, so without
# a fixed seed every kernel restart produces differently ordered batches and
# the sample shapes displayed further down cannot be reproduced.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

# One iterator per split; batches are created directly on `device`.
# BucketIterator batches examples of similar length together to limit padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
    shuffle=True)

In [5]:
# Display the Field's string form; use the built-in str() rather than calling
# the __str__ dunder directly.
str(SRC)

'<torchtext.data.field.Field object at 0x00000262E2D4F100>'

In [6]:
# Number of batches in one pass over the training iterator; use the built-in
# len() rather than calling the __len__ dunder directly.
len(train_iterator)

227

In [8]:
# Pull a single batch from the training iterator so its contents can be
# inspected in the cells below. `sample_iter` and `sample` are notebook-level
# names that later cells read.
sample_iter = iter(train_iterator)
sample = next(sample_iter)
# Bare last expression: the notebook renders the batch summary.
sample


[torchtext.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 128x27 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 128x25 (GPU 0)]

In [8]:
# Print the first 20 token -> index entries of the source vocabulary.
# Slicing the item pairs replaces the hand-maintained counter and `break`, and
# reads each index from the pair instead of looking the key up a second time.
for token, index in list(SRC.vocab.stoi.items())[:20]:
    print(f"{token} : {index}")

<unk> : 0
<pad> : 1
<sos> : 2
<eos> : 3
. : 4
ein : 5
einem : 6
in : 7
eine : 8
, : 9
und : 10
mit : 11
auf : 12
mann : 13
einer : 14
der : 15
frau : 16
die : 17
zwei : 18
einen : 19


In [9]:
# Print the first 20 token -> index entries of the target vocabulary.
# Slicing the item pairs replaces the hand-maintained counter and `break`, and
# reads each index from the pair instead of looking the key up a second time.
for token, index in list(TRG.vocab.stoi.items())[:20]:
    print(f"{token} : {index}")

<unk> : 0
<pad> : 1
<sos> : 2
<eos> : 3
a : 4
. : 5
in : 6
the : 7
on : 8
man : 9
is : 10
and : 11
of : 12
with : 13
woman : 14
, : 15
two : 16
are : 17
to : 18
people : 19


In [5]:
# Report the shape of the sampled source batch, then display the tensor of
# token indices itself.
print(sample.src.shape)
sample.src

torch.Size([128, 27])


tensor([[   2,    5,  171,  ...,    1,    1,    1],
        [   2,    5,   13,  ...,    1,    1,    1],
        [   2,    5,   49,  ...,    1,    1,    1],
        ...,
        [   2,   30,   10,  ...,    1,    1,    1],
        [   2, 1700, 4822,  ...,    1,    1,    1],
        [   2,    5, 3734,  ...,    1,    1,    1]], device='cuda:0')

In [10]:
# Report the shape of the sampled target batch, then display the tensor of
# token indices itself.
print(sample.trg.shape)
sample.trg

torch.Size([128, 28])


tensor([[   2,    4,   61,  ...,    1,    1,    1],
        [   2,    4,    9,  ...,    1,    1,    1],
        [   2,    4,   55,  ...,    1,    1,    1],
        ...,
        [   2,   30,   11,  ...,    1,    1,    1],
        [   2, 1227,    0,  ...,    1,    1,    1],
        [   2,    4,  192,  ...,    1,    1,    1]], device='cuda:0')