In [16]:
import random
import time

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k

In [5]:
# Seed every random number generator this notebook relies on so that a
# "Restart Kernel -> Run All" reproduces the same results.
SEED = 1234

# Python's built-in RNG was previously left unseeded.
# NOTE(review): legacy torchtext iterators appear to take their shuffle state
# from the `random` module -- confirm against the installed torchtext version.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds every visible GPU (not only the current one); silently ignored when
# CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [7]:
# spaCy pipelines for the two languages: the small English web model and the
# small German news model.
spacy_en = spacy.load(name='en_core_web_sm')
spacy_de = spacy.load(name='de_core_news_sm')

def tokenize_de(text):
    """Tokenize German text into a list of token strings.

    Args:
        text: The raw German string to split.

    Returns:
        A list with the `.text` of every token produced by the German
        spaCy tokenizer (`spacy_de`), in order.
    """
    # Fixed: this function previously ran the English tokenizer (`spacy_en`)
    # on German input.
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """Tokenize English text into a list of token strings.

    Args:
        text: The raw English string to split.

    Returns:
        A list with the `.text` of every token produced by the English
        spaCy tokenizer (`spacy_en`), in order.
    """
    # Fixed: this function previously ran the German tokenizer (`spacy_de`)
    # on English input.
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
# SRC and TRG are configured identically -- '<sos>' / '<eos>' markers and
# lower-casing -- and differ only in the tokenizer function they use:
# SRC gets tokenize_en, TRG gets tokenize_de.
SRC, TRG = (
    Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
    for tokenizer in (tokenize_en, tokenize_de)
)

In [11]:
# Load the Multi30k train / validation / test splits. The '.en' files are
# paired with SRC and the '.de' files with TRG (extensions and fields are
# matched by position).
train_data, valid_data, test_data = Multi30k.splits(
    fields=(SRC, TRG),
    exts=('.en', '.de'),
)

downloading training.tar.gz
I0114 17:13:37.135039 21952 utils.py:65] Downloading file training.tar.gz to .data\multi30k\training.tar.gz.
.data\multi30k\training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 562kB/s]
I0114 17:13:39.288971 21952 utils.py:73] File .data\multi30k\training.tar.gz downloaded.
downloading validation.tar.gz
I0114 17:13:40.246121 21952 utils.py:65] Downloading file validation.tar.gz to .data\multi30k\validation.tar.gz.
.data\multi30k\validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 162kB/s]
I0114 17:13:40.537895 21952 utils.py:73] File .data\multi30k\validation.tar.gz downloaded.
downloading mmt_task1_test2016.tar.gz
I0114 17:13:41.536474 21952 utils.py:65] Downloading file mmt_task1_test2016.tar.gz to .data\multi30k\mmt_task1_test2016.tar.gz.
.data\multi30k\mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 108kB/s]
I0114 17:13:42.154136 21952 utils.py:73] File .data\multi30k\mmt_task1_test2016.tar.gz downloaded.


In [14]:
BATCH_SIZE = 128

# Build both vocabularies from the training split only, passing min_freq=2
# (presumably tokens seen fewer than twice are left out of the vocabulary --
# confirm against the torchtext docs).
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# Use the GPU when one is available and fall back to the CPU otherwise.
# The device was previously hard-coded to 'cuda', which fails on machines
# without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)


torch.int64