In [1]:
import math
import os
import time
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor

from vocabulary import Vocab
from utils import *
from embedding import PositionalEncoding, Embeddings
from layers import *
from criterion import KLLossMasked
from optimizer import NoamOpt

# Kept after the star imports so these torch classes win if a star import exports the same names.
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [2]:
# Reporting cadence, counted in training iterations (read by the training code below).
log_every_iter = 100           # how often the running loss is printed
validate_every_iter = 10_000   # how often dev-set evaluation / best-model checkpointing runs

In [3]:
#create_data()

In [4]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings, then apply dropout.

    The table ``pe`` has shape ``[max_len, 1, d_model]``: even feature indices hold
    ``sin(pos * w_k)`` and odd indices hold ``cos(pos * w_k)`` with
    ``w_k = exp(-2k * ln(10000) / d_model)``. It is registered as a buffer, so it
    follows the module across devices and is stored in the state dict, but is not
    a trainable parameter.

    Note: this definition replaces the ``PositionalEncoding`` name imported from
    ``embedding`` at the top of the notebook for every cell that runs after it.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Arguments:
            d_model: feature width of the embeddings (odd widths are supported).
            dropout: dropout probability applied after the encoding is added.
            max_len: longest sequence length the pre-computed table covers.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns,
        # so the cosine half uses only the first d_model // 2 frequencies.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {self.pe.size(0)}"
            )
        # pe's singleton batch dimension broadcasts over batch_size.
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [5]:
class TransformerModel(nn.Module):
    """Transformer encoder stack with a token embedding in front and a linear projection behind.

    ``forward`` runs: embedding (scaled by ``sqrt(d_model)``) -> positional encoding
    -> ``nlayers`` encoder layers -> linear layer of output width ``ntoken``.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        """
        Arguments:
            ntoken: vocabulary size (embedding rows and output width).
            d_model: embedding / encoder feature width.
            nhead: number of attention heads in each encoder layer.
            d_hid: feed-forward width inside each encoder layer.
            nlayers: number of stacked encoder layers.
            dropout: dropout probability used by the positional encoding and the encoder layers.
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Initialise embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: optional Tensor, shape ``[seq_len, seq_len]``; ``None`` means no attention mask

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # math.sqrt avoids depending on a numpy alias that this notebook never imports itself.
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

In [6]:
class Generator(nn.Module):
    """Parameter-free pooling head that collapses dimension 0 of its input with an element-wise max."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return the maximum of ``x`` along dimension 0; the arg-max indices are discarded."""
        return x.max(dim=0).values

In [7]:
# Checkpoint location for this experiment; the directory is created if it does not exist yet.
directory = 'model_v1/'
os.makedirs(directory, exist_ok=True)
model_save_path = os.path.join(directory, 'bert.checkpoint')

In [8]:
# --- Run configuration -------------------------------------------------------
small_size = False        # forwarded as `small=` to the data readers
use_checkpoint = False    # not read by any cell visible in this notebook
# Fall back to CPU when no GPU is present instead of failing later at `.to(device)`.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")


# --- Model hyperparameters ---------------------------------------------------
d_model = 512      # embedding / encoder width
d_hid = 2048       # feed-forward width inside each encoder layer
n_heads = 8        # attention heads per encoder layer
n_encoders = 6     # number of encoder layers
dropout = 0.1

# --- Training bookkeeping ----------------------------------------------------
batch_size = 512
train_iter = report_loss = cum_loss = valid_num = 0
report_samples = cum_samples = 0

In [9]:
# Build the vocabulary and show its size (this is also the model's output width).
vocab = Vocab()
ntoken = len(vocab.char2id)
print(ntoken)

model = TransformerModel(ntoken, d_model, n_heads, d_hid, n_encoders, dropout)
generator = Generator()

# Xavier-initialise every weight with more than one dimension. For `model` this
# overwrites the uniform initialisation done in TransformerModel.init_weights for
# the embedding and output weights; `generator` has no parameters, so its pass is a no-op.
for net in (model, generator):
    for p in net.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

model = model.to(device)
generator = generator.to(device)

28


In [10]:
# Training and validation pairs; `small_size` is forwarded to the readers unchanged.
train_data = read_train_data(filepath="./pairs_train.txt", small=small_size)
dev_data = read_dev_data(filepath="./pairs_valid.txt", small=small_size)

# Summed KL divergence; nn.KLDivLoss expects log-probabilities as input and
# probabilities as target. (nn.CrossEntropyLoss(reduction='sum') was the earlier alternative.)
criterion = nn.KLDivLoss(reduction='sum')

# NOTE(review): lr=1e-9 looks like the placeholder used when a warm-up wrapper
# (cf. NoamOpt in run()) presumably resets the rate every step. The training cell
# below calls optimizer.step() on this Adam instance directly, so the rate would
# stay at 1e-9 for the whole run - confirm this is intended.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-9,
    betas=(0.9, 0.99),
    eps=1e-9,
)

In [11]:
# Progress bookkeeping: validation scores seen so far and the epoch the training loop starts from.
hist_valid_scores = []
current_epoch = current_train_iter = 0

In [12]:

# Training loop: for every batch, score the whole vocabulary, suppress the
# characters that already occur in the source, and minimise the summed KL
# divergence between the predicted and the (re-normalised) target distribution.
for epoch in range(current_epoch, 1000):
    print("=" * 30)
    model.train()
    train_data_iter = create_words_batch(train_data, vocab, mini_batch=batch_size, shuffle=False, device=device)
    for i, batch in enumerate(train_data_iter):
        # batch.src is indexed [batch, position] below, while the encoder is
        # sequence-first, hence the transpose. Calling the module instead of
        # .forward() keeps nn.Module hooks working.
        output = model(batch.src.transpose(1, 0), None)

        # One row per sample, one column per vocabulary entry; columns whose index
        # appears in batch.src are set to mask_token.
        # (mask_token is not defined in this notebook's visible cells - presumably
        # it comes from one of the star imports.)
        generator_mask = torch.zeros(batch.src.shape[0], ntoken, device=device)
        generator_mask = generator_mask.scatter_(1, batch.src, mask_token)

        # Max over sequence positions -> [batch, ntoken]; masked columns are pushed
        # to -1e9 so they get ~zero probability after the softmax.
        x = generator(output)
        x = x.masked_fill(generator_mask == mask_token, -1e9)
        x = torch.log_softmax(x, dim=1)

        # Zero every target entry that equals the mask entry at the same position,
        # then re-normalise each row to sum to 1 (1e-12 guards all-zero rows).
        # NOTE(review): if the intent is "drop targets already present in src",
        # `generator_mask == mask_token` would express that directly - confirm.
        y = batch.tgt.masked_fill(batch.tgt == generator_mask, 0)
        y = y / (torch.sum(y, dim=1, keepdim=True) + 1e-12)

        loss = criterion(x, y)
        if i % log_every_iter == 0:
            print("epoch %d step %d batchloss:"%(epoch, i), loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

epoch 0 step 0 batchloss: 1173.3553466796875
epoch 0 step 100 batchloss: 1165.57470703125
epoch 0 step 200 batchloss: 1151.71728515625


In [None]:
def _save_checkpoint(path, epoch, model, model_opt, loss, train_iter, hist_valid_scores):
    """Write model weights, optimizer state and progress counters to ``path`` via ``torch.save``."""
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': model_opt.optimizer.state_dict(),
                'loss': loss,
                '_rate': model_opt._rate,
                '_step': model_opt._step,
                'train_iter': train_iter,
                'hist_valid_scores': hist_valid_scores,
                }, path)


def run():
    """Train the hand-built ``Bert`` encoder and checkpoint it under ``model/``.

    Per batch: forward pass, mask vocabulary entries that occur in the source,
    compute the loss, step the ``NoamOpt``-wrapped Adam optimizer. Every
    ``log_every_iter`` iterations the running per-sample loss is printed; every
    ``validate_every_iter`` iterations dev accuracy is computed and the model is
    saved to ``model/bert.checkpoint`` when it beats all earlier scores. A
    ``real_model_{epoch}.checkpoint`` file is written at the end of every epoch.

    NOTE(review): ``Generator`` is constructed here with ``d_model``/``vocab_size``
    and ``model.generator`` is called with two arguments, which does not match the
    zero-argument ``Generator`` class defined earlier in this notebook (the same
    shadowing applies to ``PositionalEncoding``). This function presumably targets
    the implementations from the imported project modules - confirm before running.
    """
    directory = 'model/'
    os.makedirs(directory, exist_ok=True)
    model_save_path = os.path.join(directory, 'bert.checkpoint')

    small_size = False
    # Fall back to CPU when no GPU is available instead of failing at .to(device).
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")

    vocab = Vocab()
    V = len(vocab.char2id)
    d_model = 256
    d_ff = 1024
    h = 4
    n_encoders = 4

    batch_size = 32
    train_iter = report_loss = cum_loss = valid_num = 0
    report_samples = cum_samples = 0

    self_attn = MultiHeadedAttention(h=h, d_model=d_model, d_k=d_model // h, d_v=d_model // h, dropout=0.1)
    feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=d_ff)
    position = PositionalEncoding(d_model, dropout=0.1)
    embedding = nn.Sequential(Embeddings(d_model=d_model, vocab=V), position)

    encoder = Encoder(self_attn=self_attn, feed_forward=feed_forward, size=d_model, dropout=0.1)
    generator = Generator(d_model=d_model, vocab_size=V)
    model = Bert(encoder=encoder, embedding=embedding, generator=generator, n_layers=n_encoders)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    model = model.to(device)

    # The tiny Adam lr is presumably a placeholder that NoamOpt overrides each step - confirm.
    opt = torch.optim.Adam(model.parameters(), lr=1e-9, betas=(0.9, 0.98), eps=1e-9)
    model_opt = NoamOpt(d_model, 0.001, 4000, opt)
    criterion = nn.CrossEntropyLoss().to(device)

    train_data = read_train_data(filepath="./pairs_train.txt", small=small_size)
    dev_data = read_dev_data(filepath="./pairs_valid.txt", small=small_size)

    hist_valid_scores = []
    current_epoch = 0

    for epoch in range(current_epoch, 100):
        print("=" * 30)
        model.train()

        start = time.time()
        # The local `device` is where the model was just moved; using it avoids
        # relying on a `.device` attribute that plain nn.Module does not provide.
        train_data_iter = create_words_batch(train_data, vocab, mini_batch=batch_size, shuffle=False,
                                             device=device)
        for batch in train_data_iter:
            n_samples = batch.src.shape[0]  # real batch size; a batch may be smaller than batch_size

            out = model(batch.src, batch.src_mask)
            # Mark every vocabulary column whose index occurs in the source row.
            generator_mask = torch.zeros(n_samples, V, device=device)
            generator_mask = generator_mask.scatter_(1, batch.src, 1)

            x = model.generator(out, generator_mask)
            x = x.masked_fill(generator_mask == 1, -1e9)
            # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself and expects
            # unnormalised scores, so this Softmax normalises twice. Left unchanged
            # because dropping it without also masking `y` at the suppressed columns
            # could produce ~1e9 loss values - confirm the intended objective.
            x = nn.Softmax(dim=1)(x)
            y = batch.tgt
            loss = criterion(x, y)
            loss.backward()
            model_opt.step()
            model_opt.optimizer.zero_grad()

            # The criterion uses the default 'mean' reduction, so scale by the batch
            # size to accumulate a per-sample sum that matches the sample counters.
            batch_loss_val = loss.item()
            report_loss += batch_loss_val * n_samples
            cum_loss += batch_loss_val * n_samples
            report_samples += n_samples
            cum_samples += n_samples

            train_iter += 1

            if train_iter % log_every_iter == 0:
                elapsed = time.time() - start
                print(f'epoch {epoch}, iter {train_iter}, avg. loss {report_loss / report_samples:.2f} time elapsed {elapsed:.2f}sec')
                start = time.time()
                report_loss = report_samples = 0

            if train_iter % validate_every_iter == 0:
                print(f'epoch {epoch}, iter {train_iter}, cum. loss {cum_loss / cum_samples:.2f} examples {cum_samples}')
                # Remember the interval's loss before resetting, so the checkpoint
                # does not always record 0.
                interval_loss = cum_loss
                cum_loss = 0.
                cum_samples = 0

                print('begin evaluation...')
                valid_num += 1
                acc = evaluate_acc(model, vocab, dev_data, device=device)
                # Re-enable dropout in case evaluate_acc switched the model to eval mode.
                model.train()
                print(f'validation: iter {train_iter}, dev. acc {acc:.4f}')

                valid_metric = acc

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    print('save currently the best model to [%s]' % model_save_path)
                    _save_checkpoint(model_save_path, epoch, model, model_opt,
                                     interval_loss, train_iter, hist_valid_scores)

        # End-of-epoch snapshot, written regardless of validation results.
        _save_checkpoint(os.path.join(directory, f'real_model_{epoch}.checkpoint'), epoch, model,
                         model_opt, cum_loss, train_iter, hist_valid_scores)


In [None]:
#run()