<a href="https://colab.research.google.com/github/gauss5930/Natural-Language-Processing/blob/main/ELMo/ELMo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import List, Tuple

import torch
import torch.nn as nn

from char_cnn import CharEmbedding

class ELMo(nn.Module):
  """Character-aware bidirectional language model (ELMo).

  Pipeline: ``CharEmbedding`` -> ``BidirectionalLanguageModel`` -> a linear
  layer that scores each position against the word vocabulary.
  """

  def __init__(self, vocab_size, output_dim, emb_dim, hid_dim, prj_dim, kernel_sizes,
               seq_len, n_layers=2, dropout=0.0):
    """Build the embedding, the biLM and the prediction layer.

    Args:
      vocab_size, emb_dim, kernel_sizes, seq_len: forwarded unchanged to
        ``CharEmbedding`` (imported from ``char_cnn``).
      output_dim: size of the word vocabulary; output width of ``self.predict``.
      hid_dim: hidden size of each LSTM direction. Also used as the input
        feature size of the biLM's first LSTM.
      prj_dim: projection size, given to ``CharEmbedding`` and used as the
        width of the projection between the two LSTM layers.
      n_layers: number of LSTM layers. Accepted for interface compatibility
        only; ``BidirectionalLanguageModel`` always builds two layers.
      dropout: dropout rate forwarded to the biLM.
    """
    super().__init__()

    self.embedding = CharEmbedding(vocab_size, emb_dim, prj_dim, kernel_sizes, seq_len)
    # The third positional parameter of BidirectionalLanguageModel is the
    # projection width (``prj_emb``), not a layer count, so ``prj_dim`` is
    # passed here instead of ``n_layers``.
    # NOTE(review): the biLM input size is ``hid_dim``, which assumes that
    # CharEmbedding emits ``hid_dim`` features per token -- TODO confirm
    # against char_cnn.
    self.bilms = BidirectionalLanguageModel(hid_dim, hid_dim, prj_dim, dropout)

    self.predict = nn.Linear(hid_dim, output_dim)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return vocabulary scores computed from the last biLM layer only.

    This is the training path: the first-layer output is discarded.

    Args:
      x: batch of sentences; the original notes give its shape as
        ``[batch, seq_len]`` (exact layout is defined by ``CharEmbedding``).

    Returns:
      The last biLM layer's output with its final dimension mapped from
      ``hid_dim`` to ``output_dim`` by ``self.predict``.
    """
    emb = self.embedding(x)
    _, last_output = self.bilms(emb)
    return self.predict(last_output)

  def get_embed_layer(
      self, x: torch.Tensor
  ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Run the same pass as ``forward`` but return every layer's representation.

    Args:
      x: batch of character-encoded sentences (same input as ``forward``).

    Returns:
      A tuple ``(emb, (first_output, last_output))``: the embedding output
      followed by the outputs of the first and second biLM layers. The
      prediction layer is not applied.
    """
    emb = self.embedding(x)
    first_output, last_output = self.bilms(emb)

    return emb, (first_output, last_output)

  def init_weights(self):
    """Xavier-initialise all matrices and set LSTM forget-gate biases to 1."""
    for p in self.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)

    # nn.LSTM stores each bias vector as four equal chunks, one per gate, in
    # the order (input, forget, cell, output); the second quarter is therefore
    # the forget gate. Both ``bias_ih`` and ``bias_hh`` are set, for both
    # directions.
    with torch.no_grad():
      for lstm in self.bilms.lstms:
        for name, bias in lstm.named_parameters():
          if 'bias' not in name:
            continue
          n = bias.size(0)
          bias[n // 4:n // 2].fill_(1.)

class BidirectionalLanguageModel(nn.Module):
  """Two single-layer bidirectional LSTMs joined by a linear projection."""

  def __init__(self, emb_dim, hid_dim, prj_emb, dropout):
    """Create the two LSTMs and the projection between them.

    Args:
      emb_dim: feature size of the input to the first LSTM.
      hid_dim: hidden size of each direction in both LSTMs.
      prj_emb: output size of the projection, i.e. the input size of the
        second LSTM.
      dropout: passed to ``nn.LSTM``. NOTE: ``nn.LSTM`` applies dropout only
        between stacked layers, and each LSTM here has a single layer, so a
        non-zero value has no effect (PyTorch emits a warning). No separate
        dropout module is applied by this class.
    """
    super().__init__()
    lstm_options = dict(bidirectional=True, dropout=dropout, batch_first=True)
    self.lstms = nn.ModuleList([
      nn.LSTM(emb_dim, hid_dim, **lstm_options),
      nn.LSTM(prj_emb, hid_dim, **lstm_options),
    ])
    # Maps the concatenated forward/backward states of the first LSTM
    # (2 * hid_dim) down to the input size of the second LSTM.
    self.projection_layer = nn.Linear(2 * hid_dim, prj_emb)

  def forward(self, x, hidden=None):
    """Run both LSTM layers and return the output of each.

    Args:
      x: embedded sentences, ``[batch, seq_len, emb_dim]``.
      hidden: optional ``(h_0, c_0)`` initial state for the first LSTM, each
        ``[2, batch, hid_dim]`` (one layer, two directions). ``None`` lets
        the LSTM start from zeros.

    Returns:
      ``(first_output, second_output)`` where ``first_output`` is
      ``[batch, seq_len, 2 * hid_dim]`` (directions concatenated) and
      ``second_output`` is ``[batch, seq_len, hid_dim]`` (directions summed).
    """
    first_output, state = self.lstms[0](x, hidden)

    # The second LSTM starts from the final (hidden, cell) state of the first.
    # No residual connection is added between the two layers.
    projected = self.projection_layer(first_output)
    second_output, _ = self.lstms[1](projected, state)

    # The last dimension holds [forward states | backward states]; sum the
    # two halves so the result has ``hid_dim`` features.
    forward_half, backward_half = second_output.chunk(2, dim=-1)

    return first_output, forward_half + backward_half