## Source

This notebook contains code from: https://github.com/harvardnlp/annotated-transformer

Original license: 

> MIT License
> 
> Copyright (c) 2018 Alexander Rush
> 
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
> 
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
> 
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.


## License

The new work is licensed under the BSD 3-Clause License.


Copyright (c) 2018, Guillaume Chevalier

All rights reserved.


In [1]:
from src.data.read_txt import *
from src.data.config import *
from src.data.training_data import *
from src.data.sgnn_projection_layer import *
from src.model.loss import *
from src.model.transformer import *

import numpy as np
from joblib import dump, load
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import math
import copy
import time

In [2]:
%load_ext snakeviz

In [3]:

class NoamOpt:
    """Optimizer wrapper that drives the learning rate of a wrapped optimizer.

    The rate used at a given step is::

        factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

    so it increases linearly while ``step < warmup`` and then decays
    proportionally to the inverse square root of the step number.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """Store the schedule settings and the optimizer to drive.

        :param model_size: value raised to the power -0.5 in the rate formula.
        :param factor: constant multiplier applied to the whole schedule.
        :param warmup: step number at which the rate stops increasing.
        :param optimizer: object exposing ``param_groups`` (dicts with an
            ``'lr'`` entry) and ``step()``. It is only touched by ``step()``,
            so it may be ``None`` when only ``rate()`` is used.
        """
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance one step: set the new rate on every param group, then step."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (defaults to the current step).

        For ``step <= 0`` the rate is 0.0: this is the value of the linear
        warmup branch at step 0, and it avoids evaluating ``0 ** -0.5``, which
        raises ZeroDivisionError when ``rate()`` is called before ``step()``.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model_trainer):
    """Build the default NoamOpt (factor 2, warmup 4000) around an Adam optimizer.

    The model size is read from the ``size`` attribute of the first encoder
    layer of ``model_trainer.sentence_projection_model``; Adam is created over
    ``model_trainer.parameters()`` with lr=0, betas=(0.9, 0.98) and eps=1e-9.
    """
    first_encoder_layer = model_trainer.sentence_projection_model.encoder.layers[0]
    adam = torch.optim.Adam(
        model_trainer.parameters(),
        lr=0,
        betas=(0.9, 0.98),
        eps=1e-9,
    )
    return NoamOpt(first_encoder_layer.size, 2, 4000, adam)

def run_epoch(epoch, model_trainer, model_opt, data_batch_iterator, cuda_device_id):
    """
    Standard Training and Logging Function.

    Iterates once over `data_batch_iterator`, which must yield
    `(src, mask, category_per_sentence)` triples, computes the loss with
    `model_trainer(src, mask, target_block_matrix)` and prints a progress line
    every 10 batches.

    :param epoch: epoch number, only used in the log line.
    :param model_trainer: callable returning the loss tensor for a batch.
    :param model_opt: optimizer wrapper exposing `step()` and
        `optimizer.zero_grad()`. When None, no backward pass or parameter
        update is done and the forward pass runs with gradients disabled.
    :param data_batch_iterator: iterable of batches as described above.
    :param cuda_device_id: when not None, batch tensors are moved to that
        CUDA device before the forward pass.
    :return: summed loss divided by the number of non-zero mask entries seen.
    :raises ZeroDivisionError: if the iterator yields no batch at all.
    """
    start = time.time()
    total_tokens = 0
    total_loss = 0
    mod_tokens = 0
    mod = 10  # log every `mod` batches
    is_training = model_opt is not None

    for i, (src, mask, category_per_sentence) in enumerate(data_batch_iterator):
        target_diagonal_block_matrix = categories_to_block_matrix(category_per_sentence)
        if cuda_device_id is not None:
            src = src.cuda(cuda_device_id)
            mask = mask.cuda(cuda_device_id)
            target_diagonal_block_matrix = target_diagonal_block_matrix.cuda(cuda_device_id)

        # forward: skip building the autograd graph when nothing will be trained.
        with torch.set_grad_enabled(is_training):
            loss = model_trainer(src, mask, target_diagonal_block_matrix)

        # Accumulate a detached copy: summing the attached loss would keep the
        # autograd history of every batch reachable until the epoch ends.
        loss_value = loss.detach()
        total_loss = total_loss + loss_value
        ntokens = (mask != 0.0).data.sum().item()
        total_tokens += ntokens
        mod_tokens += ntokens

        # backward.
        if is_training:
            loss.backward()
            model_opt.step()
            model_opt.optimizer.zero_grad()

        # log.
        if i % mod == 0:
            elapsed = time.time() - start
            print("Epoch %d Step: %d Loss: %f Tokens per Sec: %f" %
                    (epoch, i, loss_value / ntokens, mod_tokens / elapsed))
            # Restart the throughput window for the next `mod` batches.
            start = time.time()
            mod_tokens = 0

    return total_loss / total_tokens


MY_MODEL_NAME = "my-model{}"


def save_model(preproc_sgnn_sklearn_pipeline, sentence_projection_model, model_name=MY_MODEL_NAME):
    """Write the preprocessing pipeline and the model to two files.

    `model_name` is a format template whose `{}` placeholder is replaced by
    ".sklearn" for the pipeline (written with joblib `dump`) and by ".pytorch"
    for the model (written with `torch.save`).
    """
    sklearn_path, pytorch_path = (
        model_name.format(extension) for extension in (".sklearn", ".pytorch")
    )
    dump(preproc_sgnn_sklearn_pipeline, sklearn_path)
    with open(pytorch_path, "wb") as model_file:
        torch.save(sentence_projection_model, f=model_file)
    print("Saved model to files:", sklearn_path, pytorch_path)


def load_model(model_name=MY_MODEL_NAME):
    """Read back the pipeline and the model written by `save_model`.

    `model_name` is the same format template used when saving: `{}` becomes
    ".sklearn" for the joblib file and ".pytorch" for the torch file.

    NOTE(review): both joblib `load` and `torch.load` unpickle the files, so
    only load checkpoints from a trusted source.

    :return: tuple `(preproc_sgnn_sklearn_pipeline, sentence_projection_model)`.
    """
    sklearn_path = model_name.format(".sklearn")
    pytorch_path = model_name.format(".pytorch")
    loaded = (load(sklearn_path), torch.load(pytorch_path))
    print("Loaded model from files:", sklearn_path, pytorch_path)
    return loaded

# preproc_sgnn_sklearn_pipeline = get_sgnn_projection_pipeline()
# sentence_projection_model = make_sentence_model()
# save_model(preproc_sgnn_sklearn_pipeline, sentence_projection_model)
# preproc_sgnn_sklearn_pipeline, sentence_projection_model = load_model()

In [4]:
# !du -sh my-model*
import glob  # todo

def load_most_recent_model(model_name):
    """Load the saved model whose file name sorts last.

    Files are searched with the pattern `model_name.format(".sklearn") + "*"`.
    The match that sorts last as a string is selected, so epoch numbers must
    be zero-padded for "last" to mean "highest epoch". Whatever follows the
    ".sklearn" prefix in that file name is appended to `model_name` and the
    result is passed to `load_model`.

    :param model_name: format template with one `{}` placeholder.
    :return: whatever `load_model` returns for the selected checkpoint.
    :raises FileNotFoundError: if no file matches the pattern.
    """
    sklearn_prefix = model_name.format(".sklearn")
    candidates = glob.glob(sklearn_prefix + "*")
    if not candidates:
        raise FileNotFoundError(
            "No saved model found matching: " + sklearn_prefix + "*")

    latest = max(candidates)  # same element as sorted(candidates)[-1]
    # Every match starts with the literal prefix, so the rest is the suffix.
    suffix = latest[len(sklearn_prefix):]
    return load_model(model_name + suffix)

# preproc_sgnn_sklearn_pipeline, sentence_projection_model = load_most_recent_model(MY_MODEL_NAME)

In [None]:
%%snakeviz

import contextlib

# NOTE(review): batch_size is not passed to anything in this cell — confirm
# where DataBatchIterator takes its batch size from.
batch_size = 25
train_iters_per_epoch = 40
test_iters_per_epoch = 1
max_epoch = 10

# CUDA
cuda_device_id = None  # None for CPU, 0 for first GPU, etc.

# ExitStack leaves the CUDA device context properly (also on error) and is a
# no-op when running on CPU.
with contextlib.ExitStack() as device_stack:
    if cuda_device_id is not None:
        device_stack.enter_context(torch.cuda.device(cuda_device_id))
        # NOTE(review): the model itself is never moved to the GPU in this
        # cell; confirm it already lives on the selected device.

    # Create model
    # TODO: make "load the latest checkpoint or start from scratch" a boolean
    # parameter. To start from scratch instead of loading:
    # preproc_sgnn_sklearn_pipeline = get_sgnn_projection_pipeline()
    # sentence_projection_model = make_sentence_model(d_ff=1024)
    preproc_sgnn_sklearn_pipeline, sentence_projection_model = load_most_recent_model(MY_MODEL_NAME)
    model_trainer = TrainerModel(sentence_projection_model)
    model_opt = get_std_opt(model_trainer)

    # Train model
    for epoch in range(max_epoch):

        model_trainer.train()
        run_epoch(
            epoch, model_trainer, model_opt,
            DataBatchIterator(preproc_sgnn_sklearn_pipeline, max_iters=train_iters_per_epoch),
            cuda_device_id
        )

        # Evaluation pass: no optimizer, so no parameter is updated on the
        # test batch, and no autograd graph is built.
        model_trainer.eval()
        with torch.no_grad():
            run_epoch(
                epoch, model_trainer, None,
                DataBatchIterator(preproc_sgnn_sklearn_pipeline, max_iters=test_iters_per_epoch),
                cuda_device_id
            )

        # Zero-padded epoch number keeps checkpoint file names sortable.
        epoch_model_name = MY_MODEL_NAME + ".epoch_" + str(epoch).rjust(5, "0")
        save_model(preproc_sgnn_sklearn_pipeline, sentence_projection_model, epoch_model_name)

Loaded model from files: my-model.sklearn.epoch_9 my-model.pytorch.epoch_9
Epoch 0 Step: 0 Loss: 0.013953 Tokens per Sec: 84.084095
Epoch 0 Step: 10 Loss: 0.004901 Tokens per Sec: 161.256825
Epoch 0 Step: 20 Loss: 0.003658 Tokens per Sec: 108.806568
Epoch 0 Step: 30 Loss: 0.002423 Tokens per Sec: 96.876060
Epoch 0 Step: 0 Loss: 0.009643 Tokens per Sec: 96.005717
Saved model to files: my-model.sklearn.epoch_00000 my-model.pytorch.epoch_00000
Epoch 1 Step: 0 Loss: 0.014455 Tokens per Sec: 103.015428
Epoch 1 Step: 10 Loss: 0.004191 Tokens per Sec: 140.481162
Epoch 1 Step: 20 Loss: 0.003677 Tokens per Sec: 90.253537
Epoch 1 Step: 30 Loss: 0.002752 Tokens per Sec: 89.502752
Epoch 1 Step: 0 Loss: 0.009006 Tokens per Sec: 71.826464
Saved model to files: my-model.sklearn.epoch_00001 my-model.pytorch.epoch_00001
Epoch 2 Step: 0 Loss: 0.011542 Tokens per Sec: 76.331674
Epoch 2 Step: 10 Loss: 0.002441 Tokens per Sec: 110.938335
Epoch 2 Step: 20 Loss: 0.003203 Tokens per Sec: 100.831212
Epoch 2 St

In [None]:
# Hyperparameters for a longer training run, with the arithmetic behind them.
# Dataset size used in the estimates below: dataset_size = 36000000
batch_size = 25
max_epoch = 24
train_iters_per_epoch = 2400
test_iters_per_epoch = 1
# dataset_size / 25 sentences per batch = 1 440 000 batches for one full pass.
# 100 000 steps * 25 sentences per step = 2 500 000 sentences.

# With the values above: max_epoch * train_iters_per_epoch = 24 * 2400 = 57 600 steps.

# Values used for the short run in the training cell:
# batch_size = 25 sentences
# train_iters_per_epoch = 40
# test_iters_per_epoch = 1
# max_epoch = 10

# Reference figures (presumably quoted from the original Transformer paper — TODO confirm):
# - English-French dataset consisting of 36M (36000000) sentences.
# - Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.
# - Trained for 100,000 steps or 12 hours.
# - Each training step took about 0.4 seconds.

In [None]:
import matplotlib.pyplot as plt

# Three settings of the lrate hyperparameters, as (model_size, warmup) pairs.
opts = [
    NoamOpt(model_size, 1, warmup, None)
    for model_size, warmup in ((512, 4000), (512, 8000), (256, 4000))
]

# One row per step, one column per schedule, so each schedule is drawn as its own curve.
rates_per_step = [[opt.rate(step) for opt in opts] for step in range(1, 20000)]
plt.plot(np.arange(1, 20000), rates_per_step)
plt.legend(["512:4000", "512:8000", "256:4000"])


In [None]:
def data_gen(V, batch, nbatches):
    """Yield `nbatches` random batches for a src-tgt copy task.

    Each batch is built from a `(batch, 10)` integer tensor drawn from
    `[1, V)` whose first column is overwritten with 1; the same data is used
    as both source and target of the `Batch`.
    """
    for _ in range(nbatches):
        random_tokens = np.random.randint(1, V, size=(batch, 10))
        data = torch.from_numpy(random_tokens)
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        # Third argument is presumably the padding index — TODO confirm against Batch.
        yield Batch(src, tgt, 0)

In [None]:
# Take the first batch produced by the generator.
c = next(iter(data_gen(1000, 32, 777)))

In [None]:
# Display the shapes of the source mask and of the source tensor of batch `c`.
c.src_mask.shape, c.src.shape
# Recorded output: (torch.Size([32, 1, 10]), torch.Size([32, 10]))

In [None]:
# Display UTF8_TXT_RAW_FILES (not defined in this notebook; presumably it comes
# from one of the star-imports of the first cell — TODO confirm).
UTF8_TXT_RAW_FILES

In [None]:



import matplotlib.pyplot as plt

# To run this cell against the latest checkpoint instead of the objects
# already in memory, first execute:
# preproc_sgnn_sklearn_pipeline, sentence_projection_model = load_most_recent_model(MY_MODEL_NAME)
# model_trainer = TrainerModel(sentence_projection_model)

# Demo text; splitting on ". " yields 20 sentences.
demo_text = (
    "This is a test. This is another test. "
    "I like bacon. I don't like bacon. "
    "My name is Guillaume. My family name is Chevalier. "
    "Programming can be used for solving complicated math problems. Let's use the Python language to write some scientific code. "
    "My family regrouped for Christmast. We met aunts and uncles. "
    "I like linux. I have an operating system. "
    "Have you ever been in the situation where you've got Jupyter notebooks (iPython notebooks) so huge that you were feeling stuck in your code?. Or even worse: have you ever found yourself duplicating your notebook to do changes, and then ending up with lots of badly named notebooks?. Either and in any ways. For every medium to big application. "
    "If you're working with notebooks, it is highly likely that you're doing research and development. If doing research and development, to keep your amazing-10x-working-speed-multiplier, it might be a good idea to skip unit tests. I hope you were satisfied by this reading. What would you do?."
)
demo_sentences = demo_text.split(". ")

# Sentences are labelled two by two: categories 0..9, each repeated twice.
category_per_sentence = [category for category in range(10) for _ in range(2)]

# Preprocess and pad the sentences, then project them with the model.
sentence_projection = preproc_sgnn_sklearn_pipeline.transform(demo_sentences)
projected_words, mask = pad_right(sentence_projection)
sentence_projection = sentence_projection_model(projected_words, mask)

# Prediction matrix, and a boolean version keeping the entries that exceed
# the mean by more than one standard deviation.
prediction = matching_network_self_attention(sentence_projection)
centered_prediction = prediction - prediction.mean() - prediction.std()
clipped_prediction = (centered_prediction > 0)
target_diagonal_block_matrix = categories_to_block_matrix(category_per_sentence)

# Show, in order: raw prediction, clipped prediction, target block matrix.
images = (
    prediction.data.numpy(),
    clipped_prediction.data.numpy(),
    target_diagonal_block_matrix,
)
for image in images:
    plt.imshow(image)
    plt.show()