# 作業：調整 ELMo 模型的訓練參數
- ELMo: [Deep contextualized word representations](https://arxiv.org/abs/1802.05365)
- 參考莫煩Python - [ELMo 一詞多義](https://mofanpy.com/tutorials/machine-learning/nlp/elmo/)，並將 TensorFlow 版本改用 PyTorch 實作
***
## [作業目標]
- 調整 ELMo 模型的參數, 觀察 loss 與數據比較

## [參數說明]
- UNITS : 詞嵌入 (embedding) 與 LSTM 隱藏層的特徵維度
- N_LAYERS : LSTM 層數
- BATCH_SIZE : 訓練批次大小
- LEARNING_RATE : 學習速率，影響收斂的快慢，須配合 BATCH_SIZE 調整

In [1]:
import os
import time
import torch
from torch import nn

import utils  # this refers to utils.py in https://github.com/MorvanZhou/NLP-Tutorials

In [13]:
class ELMo(nn.Module):
    """Bidirectional LSTM language model in the style of ELMo.

    Two independent stacks of unidirectional LSTMs share one embedding
    table. The forward stack reads tokens ``0 .. T-2`` and is trained to
    predict tokens ``1 .. T-1``; the backward stack reads tokens
    ``1 .. T-1`` in reversed order and is trained to predict tokens
    ``0 .. T-2``.

    Args:
        vocab_dim: Vocabulary size (rows of the embedding table and width
            of the output logits).
        embed_dim: Width of the token embeddings.
        hidden_dim: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers per direction.
        pad_idx: Token id used for padding; it is masked out of the LSTM
            outputs and ignored by the loss.
    """

    def __init__(self, vocab_dim, embed_dim, hidden_dim, num_layers, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx

        self.embed = nn.Embedding(vocab_dim, embed_dim, padding_idx=pad_idx)
        nn.init.normal_(self.embed.weight, 0, 0.001)

        # Only the first layer consumes embeddings; every later layer consumes
        # the previous layer's hidden states, so its input width is hidden_dim.
        in_dims = [embed_dim if i == 0 else hidden_dim for i in range(num_layers)]

        # forward lstm
        self.fs = nn.ModuleList([
            nn.LSTM(in_dim, hidden_dim, batch_first=True) for in_dim in in_dims
        ])
        self.f_logits = nn.Linear(hidden_dim, vocab_dim)

        # backward lstm
        self.bs = nn.ModuleList([
            nn.LSTM(in_dim, hidden_dim, batch_first=True) for in_dim in in_dims
        ])
        self.b_logits = nn.Linear(hidden_dim, vocab_dim)

        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    def forward(self, inputs):
        """Run both LSTM stacks over a batch of token ids.

        Args:
            inputs: Integer tensor of token ids, (batch_size, time_step).

        Returns:
            ``(fxs, bxs)``: two lists of ``num_layers + 1`` tensors each.
            Index 0 holds the embedding slice fed to the first layer,
            (batch_size, time_step - 1, embed_dim); index ``i >= 1`` holds
            the masked output of layer ``i``,
            (batch_size, time_step - 1, hidden_dim).
        """
        # True where the token is real, False where it is padding.
        mask = (inputs != self.pad_idx).unsqueeze(-1)
        embedded = self.embed(inputs)  # (batch_size, time_step, embed_dim)

        fxs, bxs = [embedded[:, :-1]], [embedded[:, 1:]]
        for fl, bl in zip(self.fs, self.bs):
            fx, _ = fl(fxs[-1])  # (batch_size, time_step - 1, hidden_dim)
            fx = fx * mask[:, :-1]  # zero the outputs at padded input positions
            fxs.append(fx)

            # The backward direction is a normal LSTM run on the time-reversed
            # sequence; flip the result back so positions line up with inputs.
            bx, _ = bl(torch.flip(bxs[-1], [1]))  # (batch_size, time_step - 1, hidden_dim)
            bx = torch.flip(bx, [1]) * mask[:, 1:]
            bxs.append(bx)

        return fxs, bxs

    def cal_loss(self, inputs):
        """Compute the averaged forward/backward language-model loss.

        Args:
            inputs: Integer tensor of token ids, (batch_size, time_step).

        Returns:
            ``(loss, (fo, bo))`` where ``loss`` is the mean of the two
            cross-entropy terms and ``fo`` / ``bo`` are the forward and
            backward logits, each (batch_size, time_step - 1, vocab_dim).
        """
        fxs, bxs = self.forward(inputs)
        fo, bo = self.f_logits(fxs[-1]), self.b_logits(bxs[-1])
        # Forward logits are scored against the next token, backward logits
        # against the previous token.
        loss = (
            self.criterion(fo.reshape(-1, fo.size(-1)), inputs[:, 1:].reshape(-1)) +
            self.criterion(bo.reshape(-1, bo.size(-1)), inputs[:, :-1].reshape(-1))
        ) / 2

        return loss, (fo, bo)

    def get_embed(self, inputs):
        """Return per-layer contextual embeddings as NumPy arrays.

        For every entry returned by ``forward`` the forward and backward
        representations are aligned on the interior tokens and concatenated
        along the feature axis.

        Args:
            inputs: Integer tensor of token ids, (batch_size, time_step).

        Returns:
            List of ``num_layers + 1`` NumPy arrays, each
            (batch_size, time_step - 2, 2 * feature_dim) where feature_dim is
            embed_dim for index 0 and hidden_dim for the LSTM layers.
        """
        # Inference only: without no_grad the tensors require grad and
        # Tensor.numpy() would raise.
        with torch.no_grad():
            fxs, bxs = self.forward(inputs)
            xs = [
                torch.cat((f[:, :-1], b[:, 1:]), dim=-1).cpu().numpy()
                for f, b in zip(fxs, bxs)
            ]

        for x in xs:
            # x is a NumPy array here, so use the .shape attribute
            # (ndarray.size is an int, not a method).
            print("layers shape=", x.shape)

        return xs

In [3]:
def train(model, data, step, optimizer, device):
    """Train ``model`` for ``step`` iterations and save it to ``models/elmo.pt``.

    Each iteration samples ``BATCH_SIZE`` sequences (read from the
    module-level constant) from ``data``, computes the loss with
    ``model.cal_loss`` and applies one optimizer update. Every 100 steps the
    loss, the elapsed time since the previous report, and the first sample of
    the batch with its forward/backward predictions are printed.

    Args:
        model: Model exposing ``cal_loss(inputs) -> (loss, (fo, bo))``.
        data: Dataset object providing ``sample(n)``, ``i2v`` and ``pad_id``.
        step: Number of training iterations.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the sampled batch is moved to.
    """

    def to_words(ids):
        # Map token ids back to words, dropping padding.
        return ' '.join([data.i2v[i] for i in ids if i != data.pad_id])

    last_report = time.time()
    for t in range(step):
        batch = data.sample(BATCH_SIZE)
        inputs = torch.LongTensor(batch).to(device)
        loss, (fo, bo) = model.cal_loss(inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if t % 100 != 0:
            continue

        # Greedy predictions for the first sequence of the batch.
        fp = fo[0].argmax(axis=1)
        bp = bo[0].argmax(axis=1)
        now = time.time()
        print(
            f"step: {t} | time: {now - last_report:.2f} | loss: {loss.item():.3f}\n",
            f"| tgt: {to_words(inputs[0].tolist())}\n",
            f"| f_prd: {to_words(fp.tolist())}\n",
            f"| b_prd: {to_words(bp.tolist())}\n\n"
        )
        last_report = now

    # Persist the whole model object (not just the state dict).
    save_dir = 'models'
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model, os.path.join(save_dir, 'elmo.pt'))

In [11]:
def export_w2v(model, data, device):
    """Load the saved model and print embeddings for four sampled sequences.

    NOTE: the ``model`` argument is not used; it is immediately replaced by
    the checkpoint loaded from ``models/elmo.pt``.

    Args:
        model: Ignored (see note above).
        data: Dataset object providing ``sample(n)``.
        device: Device the checkpoint is mapped to and the batch is moved to.
    """
    checkpoint_path = os.path.join('models', 'elmo.pt')
    model = torch.load(checkpoint_path, map_location=device)

    batch = torch.LongTensor(data.sample(4)).to(device)
    emb = model.get_embed(batch)
    print(emb)

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters to tune for this exercise.
BATCH_SIZE = 16  # number of sequences sampled per training step
UNITS = 256  # used below as both the embedding width and the LSTM hidden size
N_LAYERS = 2  # number of stacked LSTM layers per direction
LEARNING_RATE = 2e-3  # learning rate passed to the optimizer

# Load the MRPC data through the tutorial's utils module.
# NOTE(review): `rows=2000` presumably caps how many rows are read — confirm in utils.py.
data = utils.MRPCSingle('MRPC', rows=2000)
print('num word:', data.num_word)

downloading from https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt
completed
downloading from https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt
completed
num word: 12880


In [6]:
# Build the model; UNITS is used for both the embedding and the hidden width,
# and the vocabulary size / padding id come from the dataset.
model = ELMo(
    data.num_word, 
    embed_dim=UNITS, 
    hidden_dim=UNITS, 
    num_layers=N_LAYERS, 
    pad_idx=data.pad_id
).to(device)
# Bare expression: in a notebook this displays the module structure.
model

ELMo(
  (embed): Embedding(12880, 256, padding_idx=0)
  (fs): ModuleList(
    (0): LSTM(256, 256, batch_first=True)
    (1): LSTM(256, 256, batch_first=True)
  )
  (f_logits): Linear(in_features=256, out_features=12880, bias=True)
  (bs): ModuleList(
    (0): LSTM(256, 256, batch_first=True)
    (1): LSTM(256, 256, batch_first=True)
  )
  (b_logits): Linear(in_features=256, out_features=12880, bias=True)
  (criterion): CrossEntropyLoss()
)

In [7]:
# Adam over all model parameters, then 10000 training steps
# (train() also saves the model to models/elmo.pt when it finishes).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
train(model, data, 10000, optimizer, device)

step: 0 | time: 0.08 | loss: 9.466
 | tgt: <GO> <quote> during the investigation , bryant was cooperative with investigators and remains cooperative with authorities , <quote> the sheriff 's office said . <SEP>
 | f_prd: ala ala ala ala ala warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant warrant broaching broaching broaching broaching broaching broaching broaching broaching broaching broaching broaching broaching
 | b_prd: affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation affiliation grow tito tito tito tito tito tito tito tito tito tito tito tito tito


step: 100 | time: 6.20 | loss: 7.164
 | tgt: <GO> pressure also came last night from religious circles as three anglican 