<table class="buttons" align="center">
  <td>
    <a target="_blank" href="https://colab.research.google.com/drive/1MbMZ_LfIZsNxLm5GOcQXaguMJ7CsSSX0"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Google Colab дээр нээх</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/dl-ub-summer-school/2019/blob/master/seminar5.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />GitHub дээр нээх</a>
  </td>
  <td>
       <a target="_blank" href="https://sites.google.com/view/dlub/dl-ub-2019"><img src="https://avatars0.githubusercontent.com/u/52651086?s=32&v=4">Зуны сургалтын вебсайт</a>
   </td>
</table>


## Transformer Implementation

#### Кодны эх сурвалж:

[Adaptive Attention Span in Transformers](https://arxiv.org/abs/1905.07799), Sainbayar Sukhbaatar et al. 

https://github.com/facebookresearch/adaptive-span
    
#### Анхаарах зүйлс:

Эхлээд дараах үйлдлүүдийг дагаж хийгээд энэ нөтбүкийг өөрийн Драйвтаа хадгалж аваарай.

1. Зүүн дээд буланд байгаа _File_ дээр дараад
2. _Save copy in Drive_ дээр дарж өөрийн хувийг үүсгээд
3. Үүсгэсэн Колаб нөтбүк дээрээ ажиллаарай.
4. Дараа нь энэ Колаб нөтбүкээ Драйв доторх Colab Notebooks гэдэг нэртэй, автоматаар үүсдэг хавтсан дотроос олж үзээрэй.




#### Хэрэгтэй сан, функцүүдээ оруулж ирэх

In [0]:
import os
import math
import time
import torch
import random
import argparse
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adagrad

#### Өөрсдийн кэйст тохируулж өөрчилсөн gradient clipping функц, үүнийг ашигласан Adagrad класс

In [0]:
def _clip_grad(clr, grad, group_grad_clip):
    if group_grad_clip > 0:
        norm = grad.norm(2).item()
        if norm > group_grad_clip:
            clr *= group_grad_clip / (norm + 1e-10)
    return clr


class AdagradWithGradClip(Adagrad):
    """Adagrad algorithm with custom gradient clipping.

    The gradient itself is never rescaled.  Instead, for every parameter the
    effective learning rate is passed through ``_clip_grad``, which shrinks it
    when the L2 norm of that parameter's gradient exceeds ``grad_clip``.

    Args:
        params: iterable of parameters or dicts defining parameter groups.
        lr: base learning rate.
        lr_decay: learning-rate decay applied per step.
        weight_decay: L2 penalty coefficient (dense gradients only).
        initial_accumulator_value: start value of the squared-gradient sum.
        grad_clip: per-parameter gradient-norm threshold; <= 0 disables it.
    """
    def __init__(self,
                 params,
                 lr=1e-2,
                 lr_decay=0,
                 weight_decay=0,
                 initial_accumulator_value=0,
                 grad_clip=0):
        Adagrad.__init__(self,
                         params,
                         lr=lr,
                         lr_decay=lr_decay,
                         weight_decay=weight_decay,
                         initial_accumulator_value=initial_accumulator_value)
        self.defaults['grad_clip'] = grad_clip
        # step() reads group['grad_clip'] for EVERY group, so the key has to
        # exist in all of them, not only in the first one.
        for group in self.param_groups:
            group.setdefault('grad_clip', grad_clip)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` if none was given.

        Raises:
            RuntimeError: if ``weight_decay`` is used with sparse gradients.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                state['step'] += 1
                # Depending on the PyTorch version the step counter is an int
                # or a 0-dim tensor; work with a plain float so that the
                # learning rate below is always a Python number.
                step_count = float(state['step'])

                if group['weight_decay'] != 0:
                    if p.grad.data.is_sparse:
                        raise RuntimeError("weight_decay option is "
                                           "not compatible with sparse "
                                           "gradients")
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                clr = (group['lr'] /
                       (1 + (step_count - 1) * group['lr_decay']))

                # clip: shrink the learning rate for large gradients
                clr = _clip_grad(clr=clr,
                                 grad=grad,
                                 group_grad_clip=group['grad_clip'])

                if grad.is_sparse:
                    # the update is non-linear so indices must be unique
                    grad = grad.coalesce()
                    grad_indices = grad._indices()
                    grad_values = grad._values()
                    size = grad.size()

                    def make_sparse(values):
                        # build a sparse tensor laid out like ``grad``
                        if grad_indices.numel() == 0 or values.numel() == 0:
                            return torch.empty_like(grad)
                        return torch.sparse_coo_tensor(
                            grad_indices, values, size)

                    state['sum'].add_(make_sparse(grad_values.pow(2)))
                    std = state['sum'].sparse_mask(grad)
                    std_values = std._values().sqrt_().add_(1e-10)
                    p.data.add_(make_sparse(grad_values / std_values),
                                alpha=-clr)
                else:
                    state['sum'].addcmul_(grad, grad, value=1)
                    std = state['sum'].sqrt().add_(1e-10)
                    p.data.addcdiv_(grad, std, value=-clr)

        return loss


#### Орчин, дата, модел, оптимизац, сургалттай холбоотой параметрүүдийн default утга

In [0]:
# arguments with their default values

# Each category maps a command-line style option name to an argparse-like
# spec. 'dest' is the key under which get_params() stores the value and
# 'default' is the value used when the option is not supplied.
PARAMS_CONFIG = {
    # env-specific
    'env_params': {
        'distributed': {
            'action': 'store_true',
            'default': False,
            'help': 'enable distributed training. '
                    '(otherwise will use all available GPUs with dataparallel)',
            'dest': 'distributed'
        },
        'local_rank': {
            'type': int,
            'default': 0,
            'help': 'used in distributed training',
            'dest': 'local_rank'
        },
    },
    # data-specific
    'data_params': {
        'data': {
            'type': str,
            'default': 'data/text8',
            'help': 'data location '
                    '(must contain train.txt, valid.txt and test.txt)',
            'dest': 'data_path'
        },
    },
    # model-specific
    'model_params': {
        'hid-sz': {
            'type': int,
            'default': 256,
            'help': 'hidden size (i.e. model size)',
            'dest': 'hidden_size'
        },
        'inner-hid-sz': {
            'type': int,
            'default': 1024,
            'help': 'inner hidden size of FF layer',
            'dest': 'inner_hidden_size'
        },
        'nlayers': {
            'type': int,
            'default': 8,
            'help': 'number of layers',
            'dest': 'nb_layers'
        },
        'block-sz': {
            'type': int,
            'default': 64,
            'help': 'block size '
                    '(the length of sequence to process in parallel)',
            'dest': 'block_size'
        },
        'nheads': {
            'type': int,
            'default': 2,
            'help': 'number of self-attention heads',
            'dest': 'nb_heads'
        },
        'attn-span': {
            'type': int,
            'default': 32,
            'help': 'length of the attention span',
            'dest': 'attn_span'
        },
        'dropout': {
            'type': float,
            'default': 0.2,
            'help': 'dropout rate of ReLU and attention',
            'dest': 'dropout'
        },
    },
    # optimization-specific
    'optim_params': {
        'lr': {
            'type': float,
            'default': 0.03,
            'help': 'learning rate',
            'dest': 'lr'
        },
        'momentum': {
            'type': float,
            'default': 0.9,
            'help': 'SGD momentum',
            'dest': 'momentum'
        },
        'optim': {
            'type': str,
            'default': 'sgd',
            'help': 'optimization method: sgd | adagrad',
            'dest': 'optim'
        },
        'lr-warmup': {
            'type': int,
            'default': 0,
            'help': 'linearly increase LR from 0 '
                    'during first lr_warmup updates',
            'dest': 'lr_warmup'
        },
        'grad-clip': {
            'type': float,
            'default': 0,
            'help': '[only works with adagrad!] '
                    'clip gradient of each module parameters by a given '
                    'value',
            'dest': 'grad_clip'
        },
    },
    # trainer-specific
    'trainer_params': {
        'batch-sz': {
            'type': int,
            'default': 64,
            'help': 'batch size',
            'dest': 'batch_size'
        },
        'batch-split': {
            'type': int,
            'default': 1,
            'help': 'split a batch into smaller parts to fit in GPU memory',
            'dest': 'batch_split'
        },
        'nbatches': {
            'type': int,
            'default': 1000,
            'help': 'number of batches in each iteration',
            'dest': 'nb_batches_per_iter'
        },
        'niter': {
            'type': int,
            'default': 1000,
            'help': 'number of iterations to train',
            'dest': 'nb_iter'
        },
        'checkpoint': {
            'type': str,
            'default': '',
            'help': 'path to save/load model',
            'dest': 'checkpoint_path'
        },
        'full-eval-mode': {
            'action': 'store_true',
            'default': False,
            'help': 'do evaluation on the whole validation and the test data',
            'dest': 'full_eval_mode'
        },
    },
    # adaptive attention span specific params
    'adapt_span_params': {
        'adapt-span': {
            'action': 'store_true',
            'default': False,
            'help': 'enable adaptive attention span',
            'dest': 'adapt_span_enabled'
        },
        'adapt-span-loss': {
            'type': float,
            'default': 0,
            'help': 'the loss coefficient for span lengths',
            'dest': 'adapt_span_loss'
        },
        'adapt-span-ramp': {
            'type': int,
            'default': 32,
            'help': 'ramp length of the soft masking function',
            'dest': 'adapt_span_ramp'
        },
        'adapt-span-init': {
            'type': float,
            'default': 0,
            'help': 'initial attention span ratio',
            'dest': 'adapt_span_init'
        },
        'adapt-span-cache': {
            'action': 'store_true',
            'default': False,
            'help': 'adapt cache size as well to reduce memory usage',
            'dest': 'adapt_span_cache'
        },
    },
}


#### Дата токенчлох, корпус үүсгэх, дата хуваах функцүүд

In [0]:
def _tokenize(text_path, dictionary_to_update):
    """Tokenizes a text file."""
    print('Tokenizing {}'.format(text_path))
    assert os.path.exists(text_path)

    nb_tokens_in_dictionary = len(dictionary_to_update)

    # Count nb of tokens in text and update the dictionary
    with open(text_path, 'r', encoding="utf8") as f:
        for line in f:
            tokens = line.split() + ['<eos>']
            for token in tokens:
                if token not in dictionary_to_update:
                    dictionary_to_update[token] = nb_tokens_in_dictionary
                    nb_tokens_in_dictionary += 1

    # Assign to each token its identifier
    ids = []
    with open(text_path, 'r', encoding="utf8") as f:
        for line in f:
            tokens = line.split() + ['<eos>']
            for token in tokens:
                ids.append(dictionary_to_update[token])
    ids = torch.LongTensor(ids)

    return ids


class Corpus:
    """Token-id tensors of the train/valid/test splits with a shared vocabulary.

    ``data_path`` must contain ``train.txt``, ``valid.txt`` and ``test.txt``.
    The splits are tokenized in that order, all feeding the same
    token -> id dictionary.
    """
    def __init__(self, data_path):
        self._dictionary = {}
        # creates self.train, self.valid and self.test, in this order
        for split in ('train', 'valid', 'test'):
            split_file = os.path.join(data_path, split + '.txt')
            setattr(self, split, _tokenize(
                text_path=split_file,
                dictionary_to_update=self._dictionary))

    @property
    def vocab_size(self):
        """Number of distinct tokens seen across all three splits."""
        return len(self._dictionary)


def _batchify(data_tensor, batch_size):
    nb_batches = data_tensor.size(0) // batch_size
    # trim away some tokens to make whole batches
    data_tensor = data_tensor.narrow(0, 0, nb_batches * batch_size)
    data_tensor = data_tensor.view(batch_size, -1).contiguous()
    return data_tensor


def _build_corpus(data_path, env_params):
    # save the corpus to a file so that it's faster next time
    corpus_path = os.path.join(data_path, 'corpus.pt')
    if os.path.exists(corpus_path):
        print('Loading an existing corpus file from {}'.format(corpus_path))
        corpus = torch.load(corpus_path)
    else:
        print('Creating a corpus file at {}'.format(corpus_path))
        if env_params['distributed']:
            # only one process need to create a corpus file
            if env_params['rank'] == 0:
                corpus = Corpus(data_path)
                torch.save(corpus, corpus_path)
                # sync with other processes
                torch.distributed.broadcast(torch.zeros(1).cuda(), src=0)
            else:
                print('Waiting rank0 to create a corpus file.')
                # sync with rank0
                torch.distributed.broadcast(torch.zeros(1).cuda(), src=0)
                corpus = torch.load(corpus_path)
        else:
            corpus = Corpus(data_path)
            torch.save(corpus, corpus_path)
    return corpus


def _get_train_val_test_data(corpus, batch_size):
    """Batchify the train, valid and test streams of ``corpus``.

    Returns:
        A list ``[train, valid, test]`` of batchified tensors.
    """
    split_names = ('train', 'valid', 'test')
    return [_batchify(getattr(corpus, name), batch_size)
            for name in split_names]


def get_train_val_test_data(data_params, env_params, batch_size, device):
    """Build the corpus and return batchified train/valid/test tensors.

    Side effect: stores the vocabulary size in ``data_params['vocab_size']``.

    Args:
        data_params: dict containing ``'data_path'``.
        env_params: dict with ``'distributed'`` and, when distributed,
            ``'rank'`` and ``'world_size'``.
        batch_size: number of parallel rows over all processes.
        device: device the returned tensors are moved to.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.
    """
    # Pass data_path explicitly instead of **data_params: this function adds
    # 'vocab_size' to data_params below, so unpacking the dict would fail
    # with an unexpected-keyword error when the cell is run a second time.
    corpus = _build_corpus(data_path=data_params['data_path'],
                           env_params=env_params)
    data_params['vocab_size'] = corpus.vocab_size
    train_data, val_data, test_data = _get_train_val_test_data(
        corpus=corpus, batch_size=batch_size)

    if env_params['distributed']:
        # split the data into equal parts
        assert batch_size % env_params['world_size'] == 0
        device_batch_size = batch_size // env_params['world_size']
        slice_data = slice(
            device_batch_size * env_params['rank'],
            device_batch_size * (env_params['rank'] + 1))
        train_data = train_data[slice_data]
        val_data = val_data[slice_data]
        test_data = test_data[slice_data]

    train_data = train_data.to(device)
    val_data = val_data.to(device)
    test_data = test_data.to(device)
    return train_data, val_data, test_data


#### [TODO 1 - TODO 12]  Attention, Multi-head Attention, Feedforward, Transformer давхаргуудын классууд, Transformer-ын класс

In [0]:
# Size notations:
# B = batch_size, H = hidden_size, M = block_size, L = attn_span


def _skew(X, pad_value):
    """shift every row 1 step to right"""
    # X = B x M x L
    B, M, L = X.size()
    X = F.pad(X, (0, M + 1), value=pad_value)  # B x M x (L+M+1)
    X = X.view(B, -1)  # B x ML+MM+M
    X = X[:, :-M]  # B x ML+MM
    X = X.view(B, M, M + L)  # B x M x L+M
    return X


def _unskew(X):
    """reverse _skew operation"""
    # X = B x M x L+M
    B, M, L = X.size()
    L -= M
    X = X.view(B, -1)  # B x ML+MM
    X = F.pad(X, (0, M))  # B x ML+MM+M
    X = X.view(B, M, M + L + 1)  # B x M x L+M+1
    X = X[:, :, :L]  # B x M x L
    return X


class SeqAttention(nn.Module):
    """Sequential self-attention layer.

    Each query attends to a window of ``attn_span`` keys; the window for
    query ``i`` covers key positions ``i .. i+attn_span-1``.
    Note that attention doesn't include the current step itself.
    """
    def __init__(self, hidden_size, attn_span,
                 dropout, adapt_span_params, **kargs):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size  # size of a single head
        self.attn_span = attn_span
        # adaptive span is not part of this seminar version
        self.adapt_span_enabled = False

    def forward(self, query, key, value, key_pe):
        """Attend from ``query`` over ``key``/``value``.

        query is B x M x H; key and value are B x (M+L) x H.
        Returns a B x M x H tensor.
        """
        # content term: B x M x (M+L), aligned to B x M x L
        content_scores = _unskew(torch.matmul(query, key.transpose(-1, -2)))

        # position-embedding term: B x M x L_pos
        position_scores = torch.matmul(query, key_pe)

        # TODO 1: add the positional term to the content term
        # TODO 2: scale by sqrt(d_k)  -> B x M x L_pos
        scores = (content_scores + position_scores) \
            / math.sqrt(self.hidden_size)

        # TODO 3: softmax over the span
        # TODO 4: dropout on the attention weights -> B x M x L_pos
        weights = self.dropout(F.softmax(scores, dim=-1))

        # TODO 5: weighted sum of values; re-align the weights to
        # B x M x (L+M) first -> result is B x M x H
        return torch.matmul(_skew(weights, 0), value)

    def get_cache_size(self):
        """Number of past steps that must be cached for this layer."""
        return self.attn_span


class MultiHeadSeqAttention(nn.Module):
    """Multi-head wrapper around ``SeqAttention``.

    Inputs are projected, split into ``nb_heads`` heads that are folded into
    the batch dimension, attended, merged back and projected once more.
    """
    def __init__(self, hidden_size, nb_heads, **kargs):
        super().__init__()
        assert hidden_size % nb_heads == 0
        self.nb_heads = nb_heads
        self.head_dim = hidden_size // nb_heads
        self.attn = SeqAttention(
            hidden_size=self.head_dim, nb_heads=nb_heads, **kargs)
        # bias-free H x H projections (creation order is kept on purpose so
        # that seeded initialisation stays the same)
        for proj_name in ('proj_query', 'proj_out', 'proj_val', 'proj_key'):
            setattr(self, proj_name,
                    nn.Linear(hidden_size, hidden_size, bias=False))

    def head_reshape(self, x):
        """Split the last dim into heads and fold the heads into the batch.

        B x T x (K*D)  ->  (B*K) x T x D
        """
        split = x.view(x.size()[:-1] + (self.nb_heads, self.head_dim))
        per_head = split.transpose(1, 2).contiguous()  # B x K x T x D
        return per_head.view(-1, per_head.size(-2), per_head.size(-1))

    def forward(self, query, key, value, key_pe):
        """Return the B x M x H multi-head attention output."""
        batch, block = query.size(0), query.size(1)

        q = self.head_reshape(self.proj_query(query))
        v = self.head_reshape(self.proj_val(value))
        k = self.head_reshape(self.proj_key(key))

        ctx = self.attn(q, k, v, key_pe)  # B_K x M x D
        ctx = ctx.view(batch, self.nb_heads, block, self.head_dim)
        # B x K x M x D -> B x M x K x D -> B x M x K_D
        ctx = ctx.transpose(1, 2).contiguous().view(batch, block, -1)
        return self.proj_out(ctx)


class FeedForwardLayer(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear."""
    def __init__(self, hidden_size, inner_hidden_size, dropout, **kargs):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, inner_hidden_size)
        self.fc2 = nn.Linear(inner_hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, h):
        """Map ``h`` (last dim ``hidden_size``) to a tensor of the same shape."""
        # TODO 6: expand with fc1 and apply ReLU
        # TODO 7: regularize the activations with dropout
        hidden = self.dropout(F.relu(self.fc1(h)))
        # TODO 8: project back to the model size with fc2
        return self.fc2(hidden)


class TransformerSeqLayer(nn.Module):
    """One Transformer layer: multi-head attention and feed-forward,
    each followed by a residual connection and layer normalization."""
    def __init__(self, hidden_size, **kargs):
        super().__init__()
        self.attn = MultiHeadSeqAttention(hidden_size=hidden_size, **kargs)
        self.ff = FeedForwardLayer(hidden_size=hidden_size, **kargs)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, h, h_cache, key_pe):
        """Process one block.

        h is B x M x H, h_cache is B x L x H; returns B x M x H.
        """
        # TODO 9: keys/values are the cached steps followed by the current
        # block -> B x (M+L) x H
        context = torch.cat([h_cache, h], dim=1)

        # TODO 10: residual + layer norm around attention -> B x M x H
        h = self.norm1(h + self.attn(h, context, context, key_pe))

        # TODO 11: feed-forward network
        # TODO 12: residual + layer norm around it -> B x M x H
        return self.norm2(h + self.ff(h))


class TransformerSeq(nn.Module):
    """Stack of ``TransformerSeqLayer`` with token embeddings, a shared
    position embedding and a log-softmax output layer."""
    def __init__(self, vocab_size, hidden_size, nb_heads, nb_layers,
                 attn_span, **kargs):
        super().__init__()
        # token embeddings
        self.in_emb = nn.Embedding(vocab_size, hidden_size)
        self.out_emb = nn.Linear(hidden_size, vocab_size)
        # position embeddings, one vector of head size per span position
        head_dim = hidden_size // nb_heads
        self.key_pe = nn.Parameter(torch.randn(1, head_dim, attn_span))

        self.layers = nn.ModuleList([
            TransformerSeqLayer(
                hidden_size=hidden_size, nb_heads=nb_heads,
                attn_span=attn_span, **kargs)
            for _ in range(nb_layers)])

    def forward(self, x, h_cache):
        """Run one block of tokens through the model.

        Args:
            x: B x M tensor of token ids.
            h_cache: per-layer list of cached layer inputs.

        Returns:
            ``(out, h_cache_next)`` where ``out`` holds log-probabilities
            over the vocabulary and ``h_cache_next`` is the detached
            per-layer cache for the next block.
        """
        block_size = x.size(1)
        h = self.in_emb(x)  # B x M x H
        h_cache_next = []
        for idx in range(len(self.layers)):
            layer = self.layers[idx]
            span = layer.attn.attn.get_cache_size()
            if span > block_size:
                # keep the newest part of the old cache and append the block
                kept = h_cache[idx][:, -span + block_size:, :]
                next_cache = torch.cat([kept, h], dim=1).detach()
            else:
                # the block alone is long enough to fill the cache
                next_cache = h[:, -span:, :].detach()
            h_cache_next.append(next_cache)
            h = layer(h, h_cache[idx], self.key_pe)  # B x M x H

        log_probs = F.log_softmax(self.out_emb(h), dim=-1)
        return log_probs, h_cache_next



#### Сургалтад хэрэглэгдэх, сургасан моделийг үнэлэх функцүүд

In [0]:
def _train_step(model, X, Y, h_cache, eval_only, loss_div=1):
    """Single training step."""

    out, h_cache = model(X, h_cache)
    out = out.view(-1, out.size(-1))
    loss = torch.nn.functional.nll_loss(out, Y.view(-1))
    loss_value = loss.item() / loss_div

    if not eval_only:
        # loss term from adaptive-span
        if model.module.layers[0].attn.attn.adapt_span_enabled:
            loss += sum(layer.attn.attn.adaptive_span.get_loss()
                        for layer in model.module.layers)

        (loss / loss_div).backward()

    return loss_value, h_cache


def _train_batch(model, optimizer, scheduler, X, Y, h_cache,
                 eval_only, batch_split):
    """Train (or evaluate) on one batch, optionally in several pieces.

    Returns:
        ``(loss_value, h_cache)`` for the whole batch.
    """
    optimizer.zero_grad()

    if batch_split != 1:
        # process the batch in ``batch_split`` equal pieces that each fit in
        # memory; gradients accumulate over the pieces
        assert X.size(0) % batch_split == 0
        rows_per_piece = X.size(0) // batch_split
        loss_value = 0
        piece_caches = []
        for piece in range(batch_split):
            rows = slice(piece * rows_per_piece,
                         (piece + 1) * rows_per_piece)
            cache_in = [h[rows, :, :] for h in h_cache]
            piece_loss, cache_out = _train_step(
                model, X[rows, :], Y[rows],
                cache_in, eval_only, batch_split)
            loss_value += piece_loss
            piece_caches.append(cache_out)
        # stitch the per-piece caches back together along the batch axis
        h_cache = [
            torch.cat([piece_caches[piece][layer_idx]
                       for piece in range(batch_split)], dim=0)
            for layer_idx in range(len(h_cache))]
    else:
        # default behaviour: the whole batch in a single step
        loss_value, h_cache = _train_step(model, X, Y, h_cache, eval_only)

    if eval_only:
        return loss_value, h_cache

    # NOTE(review): the scheduler is stepped before the optimizer here;
    # the order is kept as-is to preserve the learning-rate schedule.
    if scheduler is not None:
        scheduler.step()
    optimizer.step()

    # make sure span parameters are in a correct range
    layers = model.module.layers
    if layers[0].attn.attn.adapt_span_enabled:
        for layer in layers:
            layer.attn.attn.adaptive_span.clamp_param()

    return loss_value, h_cache


def train_iteration(model, optimizer, scheduler, data, nb_batches_per_iter,
                    block_size, eval_only, train_pos, h_cache, batch_split):
    """Run one iteration (a fixed number of batches) over ``data``.

    Returns:
        ``(mean_loss, train_pos, h_cache)`` where ``train_pos`` is the
        position in ``data`` at which the next iteration should continue.
    """
    # switch the model to the matching mode
    (model.eval if eval_only else model.train)()

    nb_steps = nb_batches_per_iter
    if eval_only:
        # eval on fewer batches during training for speed-up
        nb_steps = min(max(1, nb_batches_per_iter // 10),
                       math.ceil(data.size(1) / block_size))

    total_loss = 0
    steps_done = 0
    for _ in range(nb_steps):
        steps_done += 1
        block_end = train_pos + block_size
        # targets are the inputs shifted by one token
        X = data[:, train_pos:block_end].contiguous()
        Y = data[:, train_pos + 1:block_end + 1].contiguous()

        batch_loss, h_cache = _train_batch(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            X=X, Y=Y,
            h_cache=h_cache,
            eval_only=eval_only,
            batch_split=batch_split)
        total_loss += batch_loss
        train_pos = block_end
        if train_pos >= data.size(1) - block_size:
            # reached the end. randomize the offset to reduce overfitting
            train_pos = random.randrange(block_size)
            # reset the cache in place
            for h in h_cache:
                h.fill_(0)

    return total_loss / steps_done, train_pos, h_cache


# do full evaluation
def full_eval(model, optimizer, scheduler, data, block_size, hidden_size):
    """Evaluate ``model`` on all whole blocks of ``data``.

    Args:
        model: wrapped model exposing its layers as ``model.module.layers``.
        optimizer, scheduler: forwarded to ``_train_batch``.
        data: tensor of token ids, one row per parallel stream.
        block_size: number of tokens processed per step.
        hidden_size: model size, used to allocate the initial zero cache.

    Returns:
        The loss averaged over the evaluated blocks.
    """
    model.eval()
    train_pos = 0
    nb_batches_per_iter_max = math.ceil(data.size(1) / block_size)
    h_cache = [
        torch.zeros(
            data.size(0),
            layer.attn.attn.get_cache_size(),
            hidden_size).to(data.device)
        for layer in model.module.layers]

    loss_all = 0
    actual_nb_batches_per_iter = 0
    # Nothing is back-propagated during evaluation, so do not record an
    # autograd graph while sweeping over the whole data set.
    with torch.no_grad():
        for _ in range(nb_batches_per_iter_max):
            actual_nb_batches_per_iter += 1
            X = data[:, train_pos: train_pos + block_size].contiguous()
            Y = data[:, train_pos + 1: train_pos + block_size + 1].contiguous()

            loss, h_cache = _train_batch(
                model=model,
                optimizer=optimizer,
                scheduler=scheduler,
                X=X, Y=Y,
                h_cache=h_cache,
                eval_only=True,
                batch_split=1)
            loss_all += loss
            train_pos += block_size
            if train_pos >= data.size(1) - block_size:
                # Skip the remaining tokens as it can't make a whole block.
                # The effect on the result should be negligible for large
                # data.
                break

    loss_all = loss_all / actual_nb_batches_per_iter
    return loss_all


#### Параметр унших, орчин тохируулах, оптимизацчиллын, сурах явцдаа модел хадгалах, унших, явц хэвлэх зэрэг хэрэглээний функцүүд

In [0]:
def get_params(args, params_config):
    """Resolve every configured parameter from ``args`` or its default.

    For each option in ``params_config`` the value is taken from
    ``args[option_name]``; if that lookup raises ``KeyError`` the option's
    ``'default'`` is used.  Values are stored under the option's ``'dest'``.

    Returns:
        Tuple ``(env_params, model_params, adapt_span_params, optim_params,
        data_params, trainer_params)`` of dicts.
    """
    params = {}
    for category, options in params_config.items():
        resolved = params[category] = {}
        for option_name in options:
            spec = options[option_name]
            try:
                value = args[option_name]
            except KeyError:
                value = spec['default']
            resolved[spec['dest']] = value

    ordered_categories = ("env_params", "model_params", "adapt_span_params",
                          "optim_params", "data_params", "trainer_params")
    return tuple(params[category] for category in ordered_categories)



##############################################################################
# ENVIRONMENT
##############################################################################

def _torch_distributed_init_process_group(local_rank):
    """Initialise the NCCL process group and bind this process to its GPU.

    Args:
        local_rank: index of the CUDA device this process should use.

    Returns:
        Dict with this process's ``'rank'`` and the group's ``'world_size'``.
    """
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    group_info = {
        'rank': torch.distributed.get_rank(),
        'world_size': torch.distributed.get_world_size(),
    }
    print('my rank={} local_rank={}'.format(group_info['rank'], local_rank))
    torch.cuda.set_device(local_rank)
    return group_info

def set_up_env(env_params):
    """Prepare the runtime environment; requires CUDA.

    Updates ``env_params`` in place: adds ``'rank'`` and ``'world_size'``
    when distributed training is enabled, and always sets ``'device'``.

    Raises:
        AssertionError: if CUDA is not available.
    """
    assert torch.cuda.is_available()
    if env_params['distributed']:
        group_info = _torch_distributed_init_process_group(
            local_rank=env_params['local_rank'])
        env_params.update(group_info)
    env_params['device'] = torch.device('cuda')


##############################################################################
# OPTIMIZER AND SCHEDULER
##############################################################################

def _get_grad_requiring_params(model):
    nb_parameters = 0
    grad_requiring_params = []
    for param in model.parameters():
        if param.requires_grad:
            nb_parameters += param.numel()
            grad_requiring_params.append(param)
    print('nb_parameters={:.2f}M'.format(nb_parameters / 1e6))
    return grad_requiring_params


def _get_optimizer(model,
                   optim,
                   lr: float,
                   momentum: float,
                   grad_clip: float):
    if optim == 'sgd':
        return torch.optim.SGD(_get_grad_requiring_params(model),
                               lr=lr,
                               momentum=momentum)
    elif optim == 'adagrad':
        return AdagradWithGradClip(_get_grad_requiring_params(model),
                                   lr=lr,
                                   grad_clip=grad_clip)
    else:
        raise RuntimeError("wrong type of optimizer "
                           "- must be 'sgd' or 'adagrad")


def _get_scheduler(optimizer, lr_warmup):
    if lr_warmup > 0:
        return torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda ep: min(1, ep / lr_warmup))
    return None


def get_optimizer_and_scheduler(model, optim_params):
    """Build the optimizer and the optional warm-up scheduler.

    Args:
        model: model to optimize.
        optim_params: dict with ``'optim'``, ``'lr'``, ``'momentum'``,
            ``'grad_clip'`` and ``'lr_warmup'``.

    Returns:
        ``(optimizer, scheduler)``; ``scheduler`` may be ``None``.
    """
    optimizer_args = {
        name: optim_params[name]
        for name in ('optim', 'lr', 'momentum', 'grad_clip')}
    optimizer = _get_optimizer(model=model, **optimizer_args)
    scheduler = _get_scheduler(optimizer=optimizer,
                               lr_warmup=optim_params['lr_warmup'])
    return optimizer, scheduler


##############################################################################
# CHECKPOINT
##############################################################################

def _load_checkpoint(checkpoint_path, model, optimizer, scheduler, logger,
                     distributed):
    print('loading from a checkpoint at {}'.format(checkpoint_path))
    if distributed:
        # the model is saved from gpu0 so we need to map it to CPU first
        checkpoint_state = torch.load(
            checkpoint_path, map_location=lambda storage, loc: storage)
    else:
        checkpoint_state = torch.load(checkpoint_path)
    iter_init = checkpoint_state['iter_no'] + 1  # next iteration
    model.load_state_dict(checkpoint_state['model'])
    optimizer.load_state_dict(checkpoint_state['optimizer'])
    logger.load_state_dict(checkpoint_state['logger'])
    if 'scheduler_iter' in checkpoint_state:
        # we only need the step count
        scheduler.step(checkpoint_state['scheduler_iter'])
    return iter_init


def load_checkpoint(checkpoint_path, model, optimizer, scheduler, logger,
                    distributed):
    """Restore from ``checkpoint_path`` if it is set and exists.

    Returns:
        The next iteration index, or 0 when there is nothing to load.
    """
    if not checkpoint_path or not os.path.exists(checkpoint_path):
        # nothing to resume from: start at the first iteration
        return 0
    return _load_checkpoint(checkpoint_path=checkpoint_path,
                            model=model,
                            optimizer=optimizer,
                            scheduler=scheduler,
                            logger=logger,
                            distributed=distributed)


def save_checkpoint(checkpoint_path, iter_no, model,
                    optimizer, scheduler, logger):
    """Write the current training state to ``checkpoint_path``.

    Args:
        checkpoint_path: destination file; when empty/``None`` nothing is
            saved.
        iter_no: index of the last completed iteration.
        model, optimizer, logger: objects exposing ``state_dict()``.
        scheduler: optional scheduler; when given, its ``last_epoch`` is
            stored under ``'scheduler_iter'``.
    """
    if not checkpoint_path:
        return

    snapshot = {'iter_no': iter_no}  # last completed iteration
    components = (('model', model),
                  ('logger', logger),
                  ('optimizer', optimizer))
    for key, component in components:
        snapshot[key] = component.state_dict()

    if scheduler is not None:
        snapshot['scheduler_iter'] = scheduler.last_epoch

    torch.save(snapshot, checkpoint_path)


##############################################################################
# LOGGER
##############################################################################

class Logger:
    """Collects per-iteration training statistics as named series.

    The history is a plain dict mapping a title to the list of values logged
    under it, which is what ``state_dict`` / ``load_state_dict`` exchange.
    """

    def __init__(self):
        self._state_dict = {}

    def load_state_dict(self, state_dict):
        """Adopt ``state_dict`` as the history (the object is not copied)."""
        self._state_dict = state_dict

    def state_dict(self):
        """Return the live history dict (not a copy)."""
        return self._state_dict

    def _log(self, title, value):
        """Append ``value`` to the series ``title``, creating it on first use."""
        self._state_dict.setdefault(title, []).append(value)

    def log_iter(self, iter_no, nb_batches_per_iter, loss_train, loss_val,
                 elapsed, model):
        """Record one iteration's statistics and print a one-line summary.

        Args:
            iter_no: zero-based iteration index.
            nb_batches_per_iter: batches per iteration; the logged step is
                ``(iter_no + 1) * nb_batches_per_iter``.
            loss_train, loss_val: losses, divided by ``ln 2`` before being
                logged as ``train_bpc`` / ``val_bpc``.
            elapsed: value printed as ``ms/batch`` (printed only, not stored).
            model: wrapped model; ``model.module.layers`` is inspected for
                adaptive-span statistics.
        """
        ln2 = math.log(2)
        step = (iter_no + 1) * nb_batches_per_iter
        train_bpc = float(loss_train / ln2)
        val_bpc = float(loss_val / ln2)

        # The summary line is assembled piecewise and joined at the end.
        parts = [
            'steps: {}'.format(step),
            '\ttrain: {:.3f}bpc\tval: {:.3f}bpc'.format(train_bpc, val_bpc),
            '\tms/batch: {:.1f}'.format(elapsed),
        ]
        for title, value in (('step', step),
                             ('train_bpc', train_bpc),
                             ('val_bpc', val_bpc)):
            self._log(title=title, value=value)

        layers = model.module.layers
        # The first layer's flag decides whether span statistics are reported.
        if layers[0].attn.attn.adapt_span_enabled:
            # (average span, maximum span) for every layer, in layer order.
            spans = [
                (layer.attn.attn.adaptive_span.get_current_avg_span(),
                 layer.attn.attn.adaptive_span.get_current_max_span())
                for layer in layers
            ]
            span_avg = float(sum(avg for avg, _ in spans)) / len(spans)
            span_max = float(max(peak for _, peak in spans))
            self._log('span_avg', span_avg)
            self._log('span_max', span_max)
            parts.append(
                "\tspan_avg: {:.0f}\tspan_max: {:.0f}".format(span_avg, span_max))

        print(''.join(parts))


#### Монгол хэл дээрх Библийн датаг татах, боловсруулах

In [0]:
def get_data(file_name):
    if file_name == 'data/mnbible':
        !mkdir -p data/mnbible
        %cd data/mnbible
        !echo "Downloading mnbible data ..."
        !wget https://github.com/dl-ub-summer-school/2019/raw/master/Seminar5/mnbible8/mnbible.zip
        !unzip mnbible.zip
        !wget https://raw.githubusercontent.com/dl-ub-summer-school/2019/master/Seminar5/mnbible8/prep_mnbible.py
        !python3 prep_mnbible.py
        %cd ../..
    else:
        !echo "data not found!"
        
#get_data('data/mnbible')

####  Сургалтын ерөнхий функц

In [0]:
def launch(env_params,
           model_params,
           adapt_span_params,
           optim_params,
           data_params,
           trainer_params):
    """Run training (or full evaluation) end to end for one configuration.

    Steps, in order: set up the environment, load the data, build and wrap
    the model, create optimizer/scheduler and logger, resume from the
    checkpoint if one exists, then either evaluate on the validation and
    test sets (``trainer_params['full_eval_mode']``) or run the
    train/validate loop, logging and checkpointing after every iteration.

    Args:
        env_params: dict passed to ``set_up_env``; afterwards ``'device'``
            and ``'distributed'`` are read from it, plus ``'rank'``,
            ``'local_rank'`` and ``'world_size'`` in distributed mode
            (presumably filled in by ``set_up_env`` - confirm there).
        model_params: keyword arguments for ``TransformerSeq``; the keys
            ``'block_size'`` and ``'hidden_size'`` are also read here.
        adapt_span_params: forwarded to ``TransformerSeq``.
        optim_params: forwarded to ``get_optimizer_and_scheduler``.
        data_params: forwarded to ``get_train_val_test_data``;
            ``'vocab_size'`` is read here for the model.
        trainer_params: reads ``'batch_size'``, ``'checkpoint_path'``,
            ``'full_eval_mode'``, ``'nb_batches_per_iter'``, ``'nb_iter'``
            and ``'batch_split'``.

    Returns:
        None. Results are printed, logged and written to the checkpoint.
    """
    # ENVIRONMENT (device, distributed, etc.)
    set_up_env(env_params)
    device = env_params['device']
    distributed = env_params['distributed']

    # print the configuration once: always when not distributed, otherwise
    # only on rank 0
    if distributed == False or env_params['rank'] == 0:
        print('model_params:\t', model_params)
        print('optim_params:\t', optim_params)
        print('data_params:\t', data_params)
        print('trainer_params:\t', trainer_params)
        print('adapt_span_params:\t', adapt_span_params)

    # DATA
    train_data, val_data, test_data = get_train_val_test_data(
        data_params=data_params,
        env_params=env_params,
        batch_size=trainer_params['batch_size'],
        device=device)

    # MODEL
    model = TransformerSeq(
        vocab_size=data_params['vocab_size'], **model_params,
        adapt_span_params=adapt_span_params)
    # Either wrapper exposes the underlying network as ``model.module``,
    # which the cache setup below and Logger.log_iter rely on.
    if distributed:
        local_rank = env_params['local_rank']
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    else:
        model = torch.nn.DataParallel(model)
        model = model.to(device)

    # OPTIMIZER AND SCHEDULER
    optimizer, scheduler = get_optimizer_and_scheduler(
        model=model, optim_params=optim_params)

    # create logger
    logger = Logger()

    # resume training from last checkpoint if exists
    # (iter_init is 0 when there is no checkpoint to load)
    iter_init = load_checkpoint(
        trainer_params['checkpoint_path'], model, optimizer, scheduler,
        logger, distributed)

    if trainer_params['full_eval_mode']:
        # evaluate the model on test data
        with torch.no_grad():
            loss_val = full_eval(model, optimizer, scheduler, val_data,
                                 model_params['block_size'],
                                 model_params['hidden_size'])
            loss_test = full_eval(model, optimizer, scheduler, test_data,
                                  model_params['block_size'],
                                  model_params['hidden_size'])
            if distributed:
                # collect results into rank0
                # reduce() with its default op sums the per-rank values onto
                # rank 0; dividing by world_size turns the sum into a mean
                stats = torch.tensor(
                    [loss_val, loss_test]).to(device)
                torch.distributed.reduce(stats, 0)
                if env_params['rank'] == 0:
                    loss_val = stats[0] / env_params['world_size']
                    loss_test = stats[1] / env_params['world_size']
                else:
                    # only rank 0 reports the results
                    return

            # losses are divided by ln 2 for the "bpc" printout
            print('val: {:.3f}bpc'.format(loss_val / math.log(2)))
            print('test: {:.3f}bpc'.format(loss_test / math.log(2)))
        # evaluation-only run: no training loop
        return

    # position of current batch
    # (index 0 is used with train_data, index 1 with val_data)
    data_pos = [0] * 2
    # initialize caches for train and valid
    # one zero tensor per layer, shaped
    # (train_data.size(0), layer cache size, hidden_size)
    # NOTE(review): the validation cache is also sized from train_data -
    # assumes val_data has the same first dimension; confirm.
    hid_cache = [[
        torch.zeros(
            train_data.size(0),
            layer.attn.attn.get_cache_size(),
            model_params['hidden_size']).to(device)
        for layer in model.module.layers] for _ in range(2)]

    nb_batches_per_iter = trainer_params['nb_batches_per_iter']
    # starts at iter_init, so a resumed run skips completed iterations
    for iter_no in range(iter_init, trainer_params['nb_iter']):
        t_sta = time.time()
        # the positional False/True below is presumably train_iteration's
        # eval-only flag - confirm against its signature
        loss_train, data_pos[0], hid_cache[0] = train_iteration(
            model, optimizer, scheduler, train_data, nb_batches_per_iter,
            model_params['block_size'], False, data_pos[0], hid_cache[0],
            trainer_params['batch_split'])
        # wall-clock milliseconds per training batch (validation not timed)
        elapsed = 1000 * (time.time() - t_sta) / nb_batches_per_iter
        with torch.no_grad():
            loss_val, data_pos[1], hid_cache[1] = train_iteration(
                model, optimizer, scheduler, val_data, nb_batches_per_iter,
                model_params['block_size'], True, data_pos[1], hid_cache[1],
                trainer_params['batch_split'])

        if distributed:
            # collect results into rank0
            stats = torch.tensor(
                [loss_train, loss_val]).to(device)
            torch.distributed.reduce(stats, 0)
            if env_params['rank'] == 0:
                loss_train = stats[0] / env_params['world_size']
                loss_val = stats[1] / env_params['world_size']
            else:
                # non-zero ranks neither log nor write the checkpoint
                continue

        logger.log_iter(iter_no, nb_batches_per_iter, loss_train,
                        loss_val, elapsed, model)
        save_checkpoint(trainer_params['checkpoint_path'],
                        iter_no, model, optimizer, scheduler, logger)

def main(args):
    """Resolve ``args`` into the six parameter groups and start ``launch``.

    Args:
        args: run arguments, passed to ``get_params`` together with
            ``PARAMS_CONFIG``.
    """
    (env_params,
     model_params,
     adapt_span_params,
     optim_params,
     data_params,
     trainer_params) = get_params(args, PARAMS_CONFIG)

    launch(env_params=env_params,
           model_params=model_params,
           adapt_span_params=adapt_span_params,
           optim_params=optim_params,
           data_params=data_params,
           trainer_params=trainer_params)

#### Монгол хэл дээрх Библийн датан дээрх үсгэн түвшний хэлний моделийн сургалтын функц, үүнийгээ ашиглаж сургалт эхлүүлэх нь

In [0]:
def mn_lm():
    # If run out of GPU memory, increase "--batch-split" argument.

    # get the data
    get_data('data/mnbible')
    !mkdir -p checkpoints

    args={"data": "data/mnbible",
          "nlayers": 8,
          "hid-sz": 256,
          "inner-hid-sz": 1024,
          "nheads": 1,
          "block-sz": 256,
          "batch-sz": 64,
          "lr": 0.07,
          "momentum": 0,
          "dropout": 0,
          "optim": "adagrad",
          "lr-warmup": 8000,
          "grad-clip": 0.03,
          "niter": 1,
          "nbatches": 1000,
          "checkpoint": "checkpoints/mnbible.pt"}
    
    !echo "Training ..."
    # using the pytorch distributed launching
    main(args)


    !echo "Evaluation ..."
    # use a smaller batch size to reduce tokens without context and omitted tokens.
    args["full-eval-mode"] = True
    args["batch-sz"] = 8
    main(args)

mn_lm()