In [39]:
%load_ext autoreload
%autoreload 2

import spacy, random, math, time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.datasets import TranslationDataset, Multi30k, IWSLT
from torchtext.data import Field, BucketIterator, RawField, Dataset

from models.gcn import GCNLayer
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Multi30k
---

In [8]:
# Seed every RNG this notebook draws from (Python `random`, NumPy, PyTorch CPU
# and CUDA), e.g. the `random.random()` teacher-forcing draw and the
# `nn.init.uniform_` weight initialisation used further down.
SEED = 11747
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [9]:
# spaCy pipelines for German and English; only their `.tokenizer` is used below.
# NOTE(review): the 'de'/'en' shortcut names presumably need an older spaCy release — confirm.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text, reverse=False):
    """
    Split a German string into a list of token strings with the spaCy tokenizer.

    @param text: raw German sentence
    @param reverse: when truthy, the token list is returned in reversed order
    @returns: list(str) of token texts
    """
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    if reverse:
        return tokens[::-1]
    return tokens
    
def tokenize_en(text, reverse=False):
    """
    Split an English string into a list of token strings with the spaCy tokenizer.

    @param text: raw English sentence
    @param reverse: when truthy, the token list is returned in reversed order
    @returns: list(str) of token texts
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    if reverse:
        return tokens[::-1]
    return tokens
    
def batch_graph(grhs):
    """ Zero-pad a list of graph matrices into a single batch tensor.
    @param grhs: list(tensor,...); graph i is expected to be (s_i, s_i)
    @returns: tensor (b, s, s) with s = max_i s_i. Graph i occupies the
              top-left (s_i, s_i) corner of slice i; everything else is zero.
    """
    sizes = list(map(len, grhs))
    widest = max(sizes)
    padded = torch.zeros([len(grhs), widest, widest])
    for idx, (size, graph) in enumerate(zip(sizes, grhs)):
        padded[idx, :size, :size] = graph
    return padded

In [5]:
# Reverse the German token order whenever the SRC tokenizer runs.
REVERSE = True
# `reverse=REVERSE` freezes the current value as a default argument. A bare
# `lambda text: tokenize_de(text, REVERSE)` looks REVERSE up at call time and
# would silently follow any later reassignment of the global.
SRC = Field(tokenize = lambda text, reverse=REVERSE: tokenize_de(text, reverse), 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TGT = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# Graphs skip vocabulary processing; batch_graph zero-pads them into one tensor per batch.
GRH = RawField(postprocessing=batch_graph)
data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)]

# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
# Each .pt file presumably holds a pre-built list of torchtext Examples — confirm.
train_data = Dataset(torch.load("data/Multi30k/train_data.pt"), data_fields)
valid_data = Dataset(torch.load("data/Multi30k/valid_data.pt"), data_fields)
test_data = Dataset(torch.load("data/Multi30k/test_data.pt"), data_fields)

In [6]:
BATCH_SIZE = 128
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; sort_key lets BucketIterator group examples by
# source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.src),
    sort_within_batch=False,
    device = device)

In [7]:
# Vocabularies are built from the training split only; min_freq = 2 is passed
# so that tokens seen fewer than twice are left out of the vocabulary.
SRC.build_vocab(train_data, min_freq = 2)
TGT.build_vocab(train_data, min_freq = 2)

# Sanity-check split sizes and vocabulary sizes.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [8]:
from collections import Counter

def get_sentence_lengths(dataset):
    """Histogram the source and target sentence lengths of a dataset.

    dataset: iterable of examples exposing `.src` and `.trg` sequences
    returns: (src_counter, tgt_counter), each a Counter mapping a length
             to the number of examples with that length
    """
    src_counter, tgt_counter = Counter(), Counter()
    for example in dataset:
        src_counter.update((len(example.src),))
        tgt_counter.update((len(example.trg),))
    return src_counter, tgt_counter

def counter2array(counter):
    """Expand a {value: count} mapping into a flat NumPy array.

    Every key is repeated `count` times, in the mapping's iteration order.
    """
    expanded = [value for value, count in counter.items() for _ in range(count)]
    return np.array(expanded)

In [9]:
# Length histograms for the training split, expanded to one entry per sentence.
src_c, tgt_c = get_sentence_lengths(train_data)
src_lengths = counter2array(src_c)
tgt_lengths = counter2array(tgt_c)

print("maximum src, tgt sent lengths: ")
# The 1.0 quantile is the maximum; this tuple is the cell's displayed value.
np.quantile(src_lengths, 1), np.quantile(tgt_lengths, 1)

maximum src, tgt sent lengths: 


(44, 41)

#### Experiment with just GRU
---

In [10]:
class GRUEncoder(nn.Module):
    """Bidirectional multi-layer GRU encoder.

    Embeds source token ids, runs them through a bidirectional GRU and maps
    the concatenated forward/backward final state of every layer to the
    decoder's hidden size.
    """

    def __init__(self, ninp, nembed, enc_nhid, dec_nhid, nlayers, dropout):
        """
        ninp: source vocabulary size
        nembed: embedding dimension
        enc_nhid: GRU hidden size per direction
        dec_nhid: size of the hidden state handed to the decoder
        nlayers: number of stacked GRU layers
        dropout: dropout probability (on embeddings and passed to the GRU)
        """
        super().__init__()
        self.enc_nhid, self.dec_nhid, self.nlayers = enc_nhid, dec_nhid, nlayers
        self.embedding = nn.Embedding(ninp, nembed)
        self.rnn = nn.GRU(
            input_size=nembed,
            hidden_size=enc_nhid,
            num_layers=nlayers,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Linear(2 * enc_nhid, dec_nhid)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        src: (src len, b) token ids
        returns: out (src len, b, 2*enc_nhid), hidden (nlayers, b, dec_nhid)
        """
        _, batch = src.shape
        out, state = self.rnn(self.dropout(self.embedding(src)))
        # state is (nlayers*2, b, enc_nhid). Regroup it so each layer's two
        # directional states sit side by side: (nlayers, b, 2*enc_nhid).
        per_example = state.transpose(1, 0)
        merged = per_example.reshape(batch, self.nlayers, -1).transpose(1, 0)
        return out, torch.tanh(self.fc(merged))

In [11]:
class GRUDecoder(nn.Module):
    """Unidirectional GRU decoder that scores one target position per call."""

    def __init__(self, nout, nembed, enc_nhid, dec_nhid, nlayers, dropout):
        """
        nout: target vocabulary size (also the width of the returned scores)
        nembed: embedding dimension
        enc_nhid: encoder hidden size (stored only)
        dec_nhid: GRU hidden size
        nlayers: number of stacked GRU layers
        dropout: dropout probability (on embeddings and passed to the GRU)
        """
        super().__init__()
        self.nout = nout
        self.enc_nhid = enc_nhid
        self.dec_nhid = dec_nhid
        self.nlayers = nlayers
        self.embedding = nn.Embedding(nout, nembed)
        self.rnn = nn.GRU(
            input_size=nembed,
            hidden_size=dec_nhid,
            num_layers=nlayers,
            dropout=dropout,
            bidirectional=False,
        )
        self.fc_out = nn.Linear(dec_nhid, nout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        """
        x: (b,) token ids for the current step
        hidden: (nlayers, b, dec_nhid) previous GRU state
        returns: pred (b, nout) unnormalised scores, and the updated hidden
        """
        # add a length-1 time axis so the GRU sees a one-step sequence
        step_in = self.dropout(self.embedding(x.unsqueeze(0)))
        step_out, hidden = self.rnn(step_in, hidden)
        return self.fc_out(step_out.squeeze(0)), hidden

In [12]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """
        encoder: module called as encoder(src) -> (enc_out, hidden)
        decoder: module called as decoder(x, hidden) -> (scores, hidden);
                 must expose `nout`, the target vocabulary size
        device: device on which the output buffer is placed
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """
        src: (src_len, b)
        tgt: (tgt_len, b)
        teacher_forcing_ratio: per-step probability of feeding the gold token
        returns: (tgt_len, b, nout) scores; row 0 is never written and stays zero
        """
        steps, batch = tgt.shape
        outs = torch.zeros(steps, batch, self.decoder.nout).to(self.device)

        # only the encoder's final state is consumed here
        _, hidden = self.encoder(src)
        x = tgt[0]
        for t in range(1, steps):
            scores, hidden = self.decoder(x, hidden)
            outs[t] = scores
            if random.random() < teacher_forcing_ratio:
                x = tgt[t]
            else:
                x = scores.argmax(1)

        return outs

In [13]:
# Vocabulary sizes fix the embedding tables and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TGT.vocab)
ENC_EMB_DIM = 250
DEC_EMB_DIM = 250
ENC_HID_DIM = 500
DEC_HID_DIM = 500
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Encoder and decoder share NLAYERS: the encoder's (nlayers, b, dec_nhid) state
# is used directly as the decoder's initial hidden state.
NLAYERS = 2

enc = GRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT)
dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

In [14]:
def init_weights(m):
    """Re-draw every parameter reachable from `m` from U(-0.08, 0.08), in place.

    named_parameters() recurses into submodules, so when this is passed to
    nn.Module.apply each parameter is re-drawn once per enclosing module.
    """
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# apply() calls init_weights on the model and on each submodule; the returned model is the cell's display value.
model.apply(init_weights)

Seq2Seq(
  (encoder): GRUEncoder(
    (embedding): Embedding(7855, 250)
    (rnn): GRU(250, 500, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1000, out_features=500, bias=True)
    (dropout): Dropout(p=0.5)
  )
  (decoder): GRUDecoder(
    (embedding): Embedding(5893, 250)
    (rnn): GRU(250, 500, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=500, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [15]:
def count_parameters(model):
    """Return the total number of scalar entries across parameters with requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 16,282,893 trainable parameters


In [16]:
# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())
# Positions holding the target padding index do not contribute to the loss.
TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TGT_PAD_IDX)

In [17]:
def train_epoch(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator`.

    model: called as model(src, tgt); returns scores shaped (tgt_len, b, vocab)
    iterator: sized iterable of batches exposing `.src` and `.trg`
    optimizer: optimiser stepped once per batch
    criterion: loss taking (N, vocab) scores and (N,) targets
    clip: max gradient norm passed to clip_grad_norm_
    returns: mean of the per-batch losses
    """
    model.train()
    running = 0
    for batch in iterator:
        src, tgt = batch.src, batch.trg
        optimizer.zero_grad()
        scores = model(src, tgt)
        # position 0 is skipped on both sides; the rest is flattened
        # to (N, vocab) scores vs (N,) targets
        flat_scores = scores[1:].view(-1, scores.shape[-1])
        flat_tgt = tgt[1:].view(-1)

        loss = criterion(flat_scores, flat_tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running += loss.item()
    return running/len(iterator)


def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without gradient tracking.

    The model is switched to eval mode and called as model(src, tgt, 0),
    i.e. with the teacher-forcing ratio set to zero.
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for batch in iterator:
            tgt = batch.trg
            scores = model(batch.src, tgt, 0) # no teacher forcing here
            # position 0 is skipped on both sides before flattening
            flat_scores = scores[1:].view(-1, scores.shape[-1])
            running += criterion(flat_scores, tgt[1:].view(-1)).item()
    return running/len(iterator)


def epoch_time(start, end):
    """Split the interval between two timestamps into whole minutes and seconds.

    start, end: timestamps in seconds (e.g. from time.time())
    returns: (elapsed_mins, elapsed_secs) as ints, seconds truncated
    """
    # Use the arguments, not the notebook globals start_time/end_time, so the
    # result depends only on what the caller passes in.
    elapsed_time = end - start
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [36]:
N_EPOCHS = 10
# Max gradient norm handed to train_epoch.
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Checkpoint only when validation loss improves.
    # NOTE(review): every experiment in this notebook writes to the same
    # 'tut1-model.pt', so later runs overwrite earlier checkpoints.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(mean loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 30s
	Train Loss: 4.831 | Train PPL: 125.397
	 Val. Loss: 4.563 |  Val. PPL:  95.885
Epoch: 02 | Time: 0m 31s
	Train Loss: 3.986 | Train PPL:  53.834
	 Val. Loss: 4.439 |  Val. PPL:  84.731
Epoch: 03 | Time: 0m 31s
	Train Loss: 3.609 | Train PPL:  36.943
	 Val. Loss: 4.183 |  Val. PPL:  65.545
Epoch: 04 | Time: 0m 30s
	Train Loss: 3.323 | Train PPL:  27.757
	 Val. Loss: 4.023 |  Val. PPL:  55.895
Epoch: 05 | Time: 0m 30s
	Train Loss: 3.146 | Train PPL:  23.243
	 Val. Loss: 3.973 |  Val. PPL:  53.149
Epoch: 06 | Time: 0m 30s
	Train Loss: 2.957 | Train PPL:  19.247
	 Val. Loss: 3.837 |  Val. PPL:  46.382
Epoch: 07 | Time: 0m 31s
	Train Loss: 2.827 | Train PPL:  16.898
	 Val. Loss: 3.701 |  Val. PPL:  40.492
Epoch: 08 | Time: 0m 30s
	Train Loss: 2.698 | Train PPL:  14.857
	 Val. Loss: 3.591 |  Val. PPL:  36.259
Epoch: 09 | Time: 0m 30s
	Train Loss: 2.602 | Train PPL:  13.487
	 Val. Loss: 3.630 |  Val. PPL:  37.707
Epoch: 10 | Time: 0m 30s
	Train Loss: 2.513 | Train PPL

#### GCN Incorporated
---

In [18]:
class GCNEncoder(nn.Module):
    """Stack of GCN layers over embedded tokens with mean/max pooling."""

    def __init__(self, ninp, nembed, nhid, nlayers, dropout):
        """
        ninp: source vocabulary size
        nembed: embedding dimension (input width of the first GCN layer)
        nhid: width of every GCN layer and of the pooled output
        nlayers: number of GCN layers, must be > 0
        dropout: dropout probability
        """
        super().__init__()
        self.nhid = nhid
        self.nlayers = nlayers
        self.embedding = nn.Embedding(ninp, nembed)
        assert(nlayers > 0)
        # the first layer maps nembed -> nhid, the remaining ones nhid -> nhid
        in_dims = [nembed] + [nhid] * (nlayers - 1)
        self.layers = nn.ModuleList([GCNLayer(width, nhid) for width in in_dims])
        self.linear = nn.Linear(2*nhid, nhid)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, A):
        """
        x: (seq len, b)
        A: (b, seq len, seq len)
        returns: out, the linear projection of [mean ; max] pooled over the
                 sequence axis of the last layer, and hidden, the stack of
                 position-0 features from every layer (one entry per layer)
        """
        h = self.dropout(self.embedding(x.t()))  # (b, seq len, nembed)
        first_nodes = []
        for gcn in self.layers:
            h = gcn(h, A)
            first_nodes.append(h[:, 0, :])

        # pooling over the sequence axis
        pooled = torch.cat((h.mean(dim=1), h.max(dim=1)[0]), dim=1)
        out = self.linear(self.dropout(pooled))
        return out, torch.stack(first_nodes)

        
        

class Seq2Seq(nn.Module):
    """Graph-aware encoder-decoder wrapper; decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """
        encoder: module called as encoder(src, grh) -> (enc_out, hidden)
        decoder: module called as decoder(x, hidden) -> (scores, hidden);
                 must expose `nout`, the target vocabulary size
        device: device on which the output buffer is placed
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, grh, tgt, teacher_forcing_ratio=0.5):
        """
        src: (src_len, b)
        grh: graph input forwarded unchanged to the encoder
        tgt: (tgt_len, b)
        teacher_forcing_ratio: per-step probability of feeding the gold token
        returns: (tgt_len, b, nout) scores; row 0 is never written and stays zero
        """
        steps, batch = tgt.shape
        outs = torch.zeros(steps, batch, self.decoder.nout).to(self.device)

        # only the encoder's hidden state is consumed here
        _, hidden = self.encoder(src, grh)
        x = tgt[0]
        for t in range(1, steps):
            scores, hidden = self.decoder(x, hidden)
            outs[t] = scores
            if random.random() < teacher_forcing_ratio:
                x = tgt[t]
            else:
                x = scores.argmax(1)

        return outs

In [19]:
# Keep the German tokens in their natural order for the graph-based encoder.
REVERSE = False
# `reverse=REVERSE` freezes the current value as a default argument. A bare
# `lambda text: tokenize_de(text, REVERSE)` looks REVERSE up at call time and
# would silently follow any later reassignment of the global.
SRC = Field(tokenize = lambda text, reverse=REVERSE: tokenize_de(text, reverse), 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TGT = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# Graphs skip vocabulary processing; batch_graph zero-pads them into one tensor per batch.
GRH = RawField(postprocessing=batch_graph)
data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)]

# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
# Each .pt file presumably holds a pre-built list of torchtext Examples — confirm.
train_data = Dataset(torch.load("data/Multi30k/train_data.pt"), data_fields)
valid_data = Dataset(torch.load("data/Multi30k/valid_data.pt"), data_fields)
test_data = Dataset(torch.load("data/Multi30k/test_data.pt"), data_fields)

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; sort_key lets BucketIterator group examples by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.src),
    sort_within_batch=False,
    device = device)

# Vocabularies come from the training split only.
SRC.build_vocab(train_data, min_freq = 2)
TGT.build_vocab(train_data, min_freq = 2)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [59]:
# Vocabulary sizes fix the embedding tables and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TGT.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# The GCN encoder's per-layer hidden vectors are handed straight to the GRU
# decoder, so encoder/decoder widths and the layer count are kept equal.
ENC_HID_DIM = 700
DEC_HID_DIM = 700
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

In [60]:
def init_weights(m):
    """Re-draw every parameter reachable from `m` from U(-0.08, 0.08), in place.

    named_parameters() recurses into submodules, so when this is passed to
    nn.Module.apply each parameter is re-drawn once per enclosing module.
    """
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# apply() calls init_weights on the model and on each submodule.
model.apply(init_weights)

def count_parameters(model):
    """Return the total number of scalar entries across parameters with requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 17,692,681 trainable parameters


In [25]:
def train_epoch(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` for a graph-aware model.

    model: called as model(src, grh, tgt); returns scores shaped (tgt_len, b, vocab)
    iterator: sized iterable of batches exposing `.src`, `.trg` and `.grh`
    optimizer: optimiser stepped once per batch
    criterion: loss taking (N, vocab) scores and (N,) targets
    clip: max gradient norm passed to clip_grad_norm_
    returns: mean of the per-batch losses
    """
    model.train()
    train_loss = 0
    # Wrap the iterator itself rather than enumerate(...): tqdm can then show
    # a total and ETA instead of a bare "227it". The index was never used.
    for batch in tqdm(iterator, total=len(iterator)):
        src = batch.src.to(device)
        tgt = batch.trg.to(device)
        grh = batch.grh.to(device)
        optimizer.zero_grad()
        out = model(src, grh, tgt)
        out_dim = out.shape[-1]
        # position 0 is skipped on both sides; the rest is flattened
        # to (N, vocab) scores vs (N,) targets
        out = out[1:].view(-1, out_dim)
        tgt = tgt[1:].view(-1)
        
        loss = criterion(out, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        train_loss += loss.item()
    return train_loss/len(iterator)


def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of a graph-aware model without gradient tracking.

    The model is switched to eval mode and called as model(src, grh, tgt, 0),
    i.e. with the teacher-forcing ratio set to zero.
    """
    model.eval()
    loss = 0
    with torch.no_grad():
        # Wrap the iterator itself rather than enumerate(...) so tqdm knows
        # the total; the index was never used.
        for batch in tqdm(iterator, total=len(iterator)):
            src = batch.src.to(device)
            tgt = batch.trg.to(device)
            grh = batch.grh.to(device)
            out = model(src, grh, tgt, 0) # no teacher forcing here
            out_dim = out.shape[-1]
            # position 0 is skipped on both sides before flattening
            out = out[1:].view(-1, out_dim)
            tgt = tgt[1:].view(-1)
            loss += criterion(out, tgt).item()
    return loss/len(iterator)


def epoch_time(start, end):
    """Split the interval between two timestamps into whole minutes and seconds.

    start, end: timestamps in seconds (e.g. from time.time())
    returns: (elapsed_mins, elapsed_secs) as ints, seconds truncated
    """
    # Use the arguments, not the notebook globals start_time/end_time, so the
    # result depends only on what the caller passes in.
    elapsed_time = end - start
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [62]:
N_EPOCHS = 10
# Max gradient norm handed to train_epoch.
CLIP = 1

# Fresh optimiser and loss for the model built above; target padding
# positions do not contribute to the loss.
optimizer = optim.Adam(model.parameters())
TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TGT_PAD_IDX)
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Checkpoint only when validation loss improves.
    # NOTE(review): every experiment in this notebook writes to the same
    # 'tut1-model.pt', so later runs overwrite earlier checkpoints.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(mean loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

227it [00:34,  6.53it/s]
8it [00:00, 23.75it/s]
0it [00:00, ?it/s]

Epoch: 01 | Time: 0m 35s
	Train Loss: 4.839 | Train PPL: 126.312
	 Val. Loss: 4.743 |  Val. PPL: 114.730


227it [00:35,  6.43it/s]
8it [00:00, 22.96it/s]
1it [00:00,  5.82it/s]

Epoch: 02 | Time: 0m 35s
	Train Loss: 4.181 | Train PPL:  65.412
	 Val. Loss: 4.654 |  Val. PPL: 105.049


227it [00:35,  6.48it/s]
8it [00:00, 23.75it/s]
1it [00:00,  6.34it/s]

Epoch: 03 | Time: 0m 35s
	Train Loss: 4.000 | Train PPL:  54.601
	 Val. Loss: 4.544 |  Val. PPL:  94.064


227it [00:35,  6.42it/s]
8it [00:00, 23.54it/s]
0it [00:00, ?it/s]

Epoch: 04 | Time: 0m 35s
	Train Loss: 3.843 | Train PPL:  46.650
	 Val. Loss: 4.494 |  Val. PPL:  89.501


227it [00:35,  6.45it/s]
8it [00:00, 23.30it/s]
1it [00:00,  5.42it/s]

Epoch: 05 | Time: 0m 35s
	Train Loss: 3.761 | Train PPL:  42.976
	 Val. Loss: 4.446 |  Val. PPL:  85.265


227it [00:35,  6.45it/s]
8it [00:00, 23.37it/s]
1it [00:00,  6.43it/s]

Epoch: 06 | Time: 0m 35s
	Train Loss: 3.655 | Train PPL:  38.661
	 Val. Loss: 4.479 |  Val. PPL:  88.132


227it [00:35,  6.44it/s]
8it [00:00, 23.43it/s]
1it [00:00,  6.60it/s]

Epoch: 07 | Time: 0m 35s
	Train Loss: 3.587 | Train PPL:  36.115
	 Val. Loss: 4.461 |  Val. PPL:  86.611


227it [00:35,  6.42it/s]
8it [00:00, 23.79it/s]
1it [00:00,  6.27it/s]

Epoch: 08 | Time: 0m 35s
	Train Loss: 3.503 | Train PPL:  33.217
	 Val. Loss: 4.475 |  Val. PPL:  87.828


227it [00:35,  6.45it/s]
8it [00:00, 23.57it/s]
1it [00:00,  6.22it/s]

Epoch: 09 | Time: 0m 35s
	Train Loss: 3.465 | Train PPL:  31.980
	 Val. Loss: 4.474 |  Val. PPL:  87.710


227it [00:35,  6.47it/s]
8it [00:00, 23.74it/s]

Epoch: 10 | Time: 0m 35s
	Train Loss: 3.390 | Train PPL:  29.670
	 Val. Loss: 4.441 |  Val. PPL:  84.826





#### GCN and GRU Combined

In [27]:
class GCNGRUEncoder(nn.Module):
    """Encoder that runs a GCN stack and a bidirectional GRU stack side by side.

    At every layer the GCN features are added to the GRU output and the sum is
    projected back to become the next GCN input; see forward for details.
    """

    def __init__(self, ninp, nembed, enc_nhid, dec_nhid, nlayers, dropout, device):
        """
        ninp: source vocabulary size
        nembed: embedding dimension
        enc_nhid: GRU hidden size per direction and GCN layer width
        dec_nhid: stored only; not used by forward
        nlayers: number of paired GCN/GRU layers
        dropout: dropout probability
        device: device on which the initial GRU state is allocated
        """
        super(GCNGRUEncoder, self).__init__()
        self.enc_nhid = enc_nhid
        self.dec_nhid = dec_nhid
        self.nlayers = nlayers
        self.embedding = nn.Embedding(ninp, nembed)
        self.device = device
        # One single-layer bidirectional GRU per level, so that the GCN signal
        # can be mixed in between levels.
        rnns = [nn.GRU(nembed, enc_nhid, 1, bidirectional=True)] + \
               [nn.GRU(enc_nhid*2, enc_nhid, 1, bidirectional=True) 
                for _ in range(nlayers-1)]
        self.rnns = nn.ModuleList(rnns)
        layers = [GCNLayer(nembed, enc_nhid)] + [GCNLayer(enc_nhid, enc_nhid) 
                                                   for _ in range(nlayers-1)]
        self.gcns = nn.ModuleList(layers)
        # A single projection (2*enc_nhid -> enc_nhid) shared by all levels.
        self.proj = nn.Linear(enc_nhid*2, enc_nhid)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, A):
        """
        x: (seq len, b)
        A: (b, seq len, seq len)
        returns: gru_out, the last level's fused sequence output
                 (seq len, b, 2*enc_nhid), and hidden, the per-level summed
                 directional states stacked to (nlayers, b, enc_nhid)
        """
        s, b = x.shape
        embedded = self.dropout(self.embedding(x))
        hiddens = []
        # initial GRU state: 2 directions x batch x enc_nhid
        hidden = torch.zeros(2, b, self.enc_nhid).to(self.device)
        # the GCN branch works batch-first, the GRU branch sequence-first
        gcn_out, gru_out = embedded.transpose(1,0), embedded
        for i in range(self.nlayers):
            # assumes GCNLayer returns (b, seq len, enc_nhid) — TODO confirm
            gcn_out = self.gcns[i](gcn_out, A)
            gru_out, hidden = self.rnns[i](gru_out, hidden)
            gru_out = self.dropout(gru_out)            
            # tile the GCN features over both directions and add them to the GRU output
            gru_out = gcn_out.repeat(1,1,2).transpose(0,1) + gru_out
            # project the fused features back to enc_nhid for the next GCN layer
            gcn_out = self.proj(gru_out.transpose(0,1))
            # In-place: add the max-over-sequence GCN feature to both
            # directional states; the updated `hidden` seeds the next GRU level.
            hidden += gcn_out.max(1)[0].repeat(2,1,1)
            # collapse the two directions by summation: (b, enc_nhid)
            hiddens.append(hidden.sum(dim=0))
        hidden = torch.stack(hiddens)
        return gru_out, hidden

In [28]:
# Vocabulary sizes fix the embedding tables and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TGT.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# The encoder returns hidden states of width ENC_HID_DIM which the decoder
# consumes directly, so the two hidden sizes are kept equal.
ENC_HID_DIM = 512
DEC_HID_DIM = 512
N_LAYERS = 3
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = GCNGRUEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, ENC_DROPOUT, device)
dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Keep the German tokens in their natural order for the graph-based encoder.
REVERSE = False
# `reverse=REVERSE` freezes the current value as a default argument. A bare
# `lambda text: tokenize_de(text, REVERSE)` looks REVERSE up at call time and
# would silently follow any later reassignment of the global.
SRC = Field(tokenize = lambda text, reverse=REVERSE: tokenize_de(text, reverse), 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TGT = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# Graphs skip vocabulary processing; batch_graph zero-pads them into one tensor per batch.
GRH = RawField(postprocessing=batch_graph)
data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)]

# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
# Each .pt file presumably holds a pre-built list of torchtext Examples — confirm.
train_data = Dataset(torch.load("data/Multi30k/train_data.pt"), data_fields)
valid_data = Dataset(torch.load("data/Multi30k/valid_data.pt"), data_fields)
test_data = Dataset(torch.load("data/Multi30k/test_data.pt"), data_fields)

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; sort_key lets BucketIterator group examples by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.src),
    sort_within_batch=False,
    device = device)

# Vocabularies come from the training split only.
SRC.build_vocab(train_data, min_freq = 2)
TGT.build_vocab(train_data, min_freq = 2)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}")

In [None]:
def init_weights(m):
    """Re-draw every parameter reachable from `m` from U(-0.08, 0.08), in place.

    named_parameters() recurses into submodules, so when this is passed to
    nn.Module.apply each parameter is re-drawn once per enclosing module.
    """
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# apply() calls init_weights on the model and on each submodule.
model.apply(init_weights)

def count_parameters(model):
    """Return the total number of scalar entries across parameters with requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

In [None]:
N_EPOCHS = 10
# Max gradient norm handed to train_epoch.
CLIP = 1

# Fresh optimiser and loss for the model built above; target padding
# positions do not contribute to the loss.
optimizer = optim.Adam(model.parameters())
TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TGT_PAD_IDX)
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Checkpoint only when validation loss improves.
    # NOTE(review): every experiment in this notebook writes to the same
    # 'tut1-model.pt', so later runs overwrite earlier checkpoints.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(mean loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

#### GCN Attention Encoder

In [271]:
from models.gru_attn import Attention, GRUDecoder

class GCNEncoder(nn.Module):
    """GCN encoder that keeps per-position outputs for an attention decoder."""

    def __init__(self, ninp, nembed, enc_nhid, dec_nhid, nlayers, dropout):
        """
        ninp: source vocabulary size
        nembed: embedding dimension (input width of the first GCN layer)
        enc_nhid: width of every GCN layer
        dec_nhid: width of the hidden states handed to the decoder
        nlayers: number of GCN layers, must be > 0
        dropout: dropout probability
        """
        super().__init__()
        self.enc_nhid = enc_nhid
        self.dec_nhid = dec_nhid
        self.nlayers = nlayers
        self.embedding = nn.Embedding(ninp, nembed)
        assert(nlayers > 0)
        # the first layer maps nembed -> enc_nhid, the rest enc_nhid -> enc_nhid
        in_dims = [nembed] + [enc_nhid] * (nlayers - 1)
        self.layers = nn.ModuleList([GCNLayer(width, enc_nhid) for width in in_dims])
        self.out_linear = nn.Linear(enc_nhid, 2*enc_nhid)
        self.hid_linear = nn.Linear(enc_nhid, dec_nhid)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, A):
        """
        x: (seq len, b)
        A: (b, seq len, seq len)
        returns: out, the last layer's per-position features widened to
                 2*enc_nhid and made sequence-first, and hidden, the
                 position-0 features of every layer projected to dec_nhid
        """
        h = self.dropout(self.embedding(x.t()))  # (b, seq len, nembed)
        first_nodes = []
        for gcn in self.layers:
            h = gcn(h, A)
            first_nodes.append(h[:, 0, :])

        # no pooling here: every position is kept for attention
        out = self.out_linear(self.dropout(h)).transpose(1, 0)
        hidden = self.hid_linear(torch.stack(first_nodes))
        return out, hidden


In [272]:
class GCN2Seq(nn.Module):
    """Graph encoder + attention decoder wrapper with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        """
        encoder: module called as encoder(src, grh) -> (enc_out, hidden)
        decoder: module called as decoder(x, hidden, enc_out) ->
                 (scores, hidden, attn); must expose `nout`
        device: device on which the output buffer is placed
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, grh, tgt, teacher_forcing_ratio=0.5):
        """
        src: (src_len, b)
        grh: graph input forwarded unchanged to the encoder
        tgt: (tgt_len, b)
        teacher_forcing_ratio: per-step probability of feeding the gold token
        returns: outs (tgt_len, b, nout) with row 0 left at zero, and attns,
                 the per-step attention tensors stacked along dim 1 and squeezed
        """
        steps, batch = tgt.shape
        outs = torch.zeros(steps, batch, self.decoder.nout).to(self.device)

        enc_out, hidden = self.encoder(src, grh)
        x = tgt[0]
        step_attns = []
        for t in range(1, steps):
            scores, hidden, attn = self.decoder(x, hidden, enc_out)
            step_attns.append(attn)
            outs[t] = scores
            if random.random() < teacher_forcing_ratio:
                x = tgt[t]
            else:
                x = scores.argmax(1)
        # NOTE(review): the bare squeeze() drops every size-1 axis, including
        # the batch axis when b == 1 — confirm callers never rely on it.
        stacked = torch.stack(step_attns, dim=1)
        return outs, stacked.squeeze()

In [273]:
# Keep the German tokens in their natural order for the graph-based encoder.
REVERSE = False
# `reverse=REVERSE` freezes the current value as a default argument. A bare
# `lambda text: tokenize_de(text, REVERSE)` looks REVERSE up at call time and
# would silently follow any later reassignment of the global.
SRC = Field(tokenize = lambda text, reverse=REVERSE: tokenize_de(text, reverse), 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
TGT = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)
# Graphs skip vocabulary processing; batch_graph zero-pads them into one tensor per batch.
GRH = RawField(postprocessing=batch_graph)
data_fields = [('src', SRC), ('trg', TGT), ('grh', GRH)]

# NOTE(review): torch.load unpickles arbitrary objects — only load trusted files.
# Each .pt file presumably holds a pre-built list of torchtext Examples — confirm.
train_data = Dataset(torch.load("data/Multi30k/train_data.pt"), data_fields)
valid_data = Dataset(torch.load("data/Multi30k/valid_data.pt"), data_fields)
test_data = Dataset(torch.load("data/Multi30k/test_data.pt"), data_fields)

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; sort_key lets BucketIterator group examples by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.src),
    sort_within_batch=False,
    device = device)

# Vocabularies come from the training split only.
SRC.build_vocab(train_data, min_freq = 2)
TGT.build_vocab(train_data, min_freq = 2)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TGT.vocab)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [274]:
# Vocabulary sizes fix the embedding tables and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TGT.vocab)
ENC_EMB_DIM = 250
DEC_EMB_DIM = 250
ENC_HID_DIM = 500
DEC_HID_DIM = 500
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
NLAYERS = 2

# Attention and GRUDecoder here are the classes imported from models.gru_attn
# in the cell above; that import shadows the notebook-level GRUDecoder.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = GCNEncoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, ENC_DROPOUT)
dec = GRUDecoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NLAYERS, DEC_DROPOUT, attn)
model = GCN2Seq(enc, dec, device).to(device)

In [275]:
def init_weights(m):
    """Re-draw every parameter reachable from `m` from U(-0.08, 0.08), in place.

    named_parameters() recurses into submodules, so when this is passed to
    nn.Module.apply each parameter is re-drawn once per enclosing module.
    """
    for _, weight in m.named_parameters():
        nn.init.uniform_(weight.data, a=-0.08, b=0.08)
        
# apply() calls init_weights on the model and on each submodule.
model.apply(init_weights)

def count_parameters(model):
    """Return the total number of scalar entries across parameters with requires_grad=True."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 19,765,143 trainable parameters


In [278]:
def train_epoch(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over `iterator` for the attention model.

    model: called as model(src, grh, tgt); returns (scores, attns) with scores
           shaped (tgt_len, b, vocab). The attention tensor is ignored here.
    iterator: sized iterable of batches exposing `.src`, `.trg` and `.grh`
    optimizer: optimiser stepped once per batch
    criterion: loss taking (N, vocab) scores and (N,) targets
    clip: max gradient norm passed to clip_grad_norm_
    returns: mean of the per-batch losses
    """
    model.train()
    train_loss = 0
    # Wrap the iterator itself rather than enumerate(...): tqdm can then show
    # a total and ETA. The index was never used.
    for batch in tqdm(iterator, total=len(iterator)):
        src = batch.src.to(device)
        tgt = batch.trg.to(device)
        grh = batch.grh.to(device)
        optimizer.zero_grad()
        out, attns = model(src, grh, tgt)
        out_dim = out.shape[-1]
        # position 0 is skipped on both sides; the rest is flattened
        # to (N, vocab) scores vs (N,) targets
        out = out[1:].view(-1, out_dim)
        tgt = tgt[1:].view(-1)
        
        loss = criterion(out, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        train_loss += loss.item()
    return train_loss/len(iterator)


def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of the attention model without gradient tracking.

    The model is switched to eval mode and called as model(src, grh, tgt, 0),
    i.e. with the teacher-forcing ratio set to zero. The returned attention
    tensor is ignored.
    """
    model.eval()
    loss = 0
    with torch.no_grad():
        # Wrap the iterator itself rather than enumerate(...) so tqdm knows
        # the total; the index was never used.
        for batch in tqdm(iterator, total=len(iterator)):
            src = batch.src.to(device)
            tgt = batch.trg.to(device)
            grh = batch.grh.to(device)
            out, attns = model(src, grh, tgt, 0) # no teacher forcing here
            out_dim = out.shape[-1]
            # position 0 is skipped on both sides before flattening
            out = out[1:].view(-1, out_dim)
            tgt = tgt[1:].view(-1)
            loss += criterion(out, tgt).item()
    return loss/len(iterator)

def epoch_time(start, end):
    """Split the interval between two timestamps into whole minutes and seconds.

    start, end: timestamps in seconds (e.g. from time.time())
    returns: (elapsed_mins, elapsed_secs) as ints, seconds truncated
    """
    # Use the arguments, not the notebook globals start_time/end_time, so the
    # result depends only on what the caller passes in.
    elapsed_time = end - start
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 10
# Max gradient norm handed to train_epoch.
CLIP = 1

# Fresh optimiser and loss for the model built above; target padding
# positions do not contribute to the loss.
optimizer = optim.Adam(model.parameters())
TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TGT_PAD_IDX)
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Checkpoint only when validation loss improves.
    # NOTE(review): every experiment in this notebook writes to the same
    # 'tut1-model.pt', so later runs overwrite earlier checkpoints.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    # Perplexity is reported as exp(mean loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')





























0it [00:00, ?it/s]



























1it [00:19, 19.51s/it]

#### GCNGRU Attention Encoder

#### Transformer with Attention

#### Baby test for GCN Encoder

In [77]:
# Smoke test: take only the first training batch and push it through the
# encoder currently bound to `enc`.
for b in train_iterator:
    x = b.src.to(device)
    A = b.grh.to(device)
    break

# NOTE(review): `enc` is whichever encoder was built most recently in this kernel session.
out, hidden = enc(x, A)

In [41]:
# Inspect the shape of the encoder's hidden output from the smoke test above.
hidden.shape

torch.Size([2, 128, 512])