<a href="https://colab.research.google.com/github/gauss5930/Natural-Language-Processing/blob/main/ELECTRA/ELECTRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#https://github.com/google-research/electra/blob/master/model/modeling.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re

import numpy as np
import six
import tensorflow.compat.v1 as tf
from tensorflow.contrib import layers as contrib_layers

class BertConfig(object):
  """Container for the hyperparameters consumed by `BertModel`."""

  def __init__(self, vocab_size, hidden_size = 768, num_hidden_layers = 12,
               num_attention_heads = 12, intermediate_size = 3072, hidden_act = 'gelu',
               hidden_dropout_prob = 0.1, attention_probs_dropout_prob = 0.1,
               max_position_embeddings = 512, type_vocab_size = 2, initializer_range = 0.02):
    """Builds a BertConfig.

    Args:
      vocab_size: Vocabulary size of `input_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: Size of the intermediate (feed-forward) layer in the
        Transformer encoder.
      hidden_act: Non-linear activation function (name or callable) used in
        the encoder and pooler.
      hidden_dropout_prob: Dropout probability for all fully connected layers
        in the embeddings, encoder and pooler.
      attention_probs_dropout_prob: Dropout ratio for the attention
        probabilities.
      max_position_embeddings: Maximum sequence length the model can be used
        with.
      type_vocab_size: Vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: Standard deviation of the truncated_normal_initializer
        used to initialize all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Builds a config from a Python dictionary of parameters.

    Every key in `json_object` becomes an attribute, including keys that are
    not constructor arguments. Parameters missing from the dictionary keep
    their constructor defaults (`vocab_size` defaults to None here).
    """
    # Instantiate through `cls` so that subclasses get an instance of their
    # own type rather than a plain BertConfig.
    config = cls(vocab_size=None)
    for key, value in json_object.items():
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Builds a config from a JSON file of parameters."""
    with tf.io.gfile.GFile(json_file, 'r') as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Returns a deep copy of this instance's attributes as a dictionary."""
    return copy.deepcopy(self.__dict__)

  def to_json_string(self):
    """Serializes this instance to a key-sorted, indented JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + '\n'

class BertModel(object):
  """BERT encoder. ELECTRA trains it differently but uses the same BERT model."""

  def __init__(self, bert_config, is_training, input_ids, input_mask = None,
               token_type_ids = None, use_one_hot_embeddings = True, scope = None,
               embedding_size = None, input_embeddings = None, input_reprs = None,
               update_embeddings = True, untied_embeddings = False,
               ltr = False, rtl = False):
    """Builds the model graph.

    Args:
      bert_config: `BertConfig` instance. It is deep-copied, so the caller's
        object is never modified.
      is_training: True for a training model, False for an eval model.
        When False both dropout probabilities are forced to 0.0.
      input_ids: Tensor of shape [batch_size, seq_length].
      input_mask: Optional tensor of shape [batch_size, seq_length]. Defaults
        to all ones.
      token_type_ids: Tensor of shape [batch_size, seq_length]. Required
        despite the None default: batch size and sequence length are read
        from it.
      use_one_hot_embeddings: Whether the word embedding lookup uses a one-hot
        matmul or tf.nn.embedding_lookup().
      scope: Variable scope. Defaults to 'electra'.
      embedding_size: Width of the word embeddings. Defaults to
        `bert_config.hidden_size`.
      input_embeddings: Optional precomputed token embeddings; when given the
        word embedding lookup is skipped.
      input_reprs: Optional precomputed embedding output; when given both the
        lookup and the embedding post-processing are skipped.
      update_embeddings: If False, gradients are stopped at the embedding
        output.
      untied_embeddings: If True the embedding variables are created under
        `scope` instead of under 'electra'.
      ltr: If True, restrict attention with a lower-triangular mask.
      rtl: If True (and `ltr` is False), restrict attention with an
        upper-triangular mask.

    Note:
      `self.embedding_table` is only set when this constructor performs the
      word embedding lookup itself (no `input_embeddings` / `input_reprs`).
    """
    bert_config = copy.deepcopy(bert_config)
    if not is_training:
      bert_config.hidden_dropout_prob = 0.0
      bert_config.attention_probs_dropout_prob = 0.0

    # Must be checked before the shape is read from it below.
    assert token_type_ids is not None

    input_shape = get_shape_list(token_type_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if input_reprs is None:
      if input_embeddings is None:
        with tf.variable_scope(
            (scope if untied_embeddings else 'electra') + '/embeddings',
            reuse=tf.AUTO_REUSE):
          # Perform the embedding lookup on the word ids.
          if embedding_size is None:
            embedding_size = bert_config.hidden_size
          (self.token_embeddings, self.embedding_table) = embedding_lookup(
              input_ids=input_ids,
              vocab_size=bert_config.vocab_size,
              embedding_size=embedding_size,
              initializer_range=bert_config.initializer_range,
              word_embedding_name='word_embeddings',
              use_one_hot_embeddings=use_one_hot_embeddings
          )
      else:
        self.token_embeddings = input_embeddings

      with tf.variable_scope(
          (scope if untied_embeddings else 'electra') + '/embeddings',
          reuse=tf.AUTO_REUSE):
        # Add positional and token type embeddings, then layer-normalize and
        # apply dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.token_embeddings,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=bert_config.type_vocab_size,
            token_type_embedding_name='token_type_embeddings',
            use_position_embeddings=True,
            position_embedding_name='position_embeddings',
            initializer_range=bert_config.initializer_range,
            max_position_embeddings=bert_config.max_position_embeddings,
            dropout_prob=bert_config.hidden_dropout_prob
        )
    else:
      self.embedding_output = input_reprs

    if not update_embeddings:
      self.embedding_output = tf.stop_gradient(self.embedding_output)

    with tf.variable_scope(scope, default_name='electra'):
      # Project the embeddings up/down when their width differs from the
      # encoder's hidden size.
      if self.embedding_output.shape[-1] != bert_config.hidden_size:
        self.embedding_output = tf.layers.dense(
            self.embedding_output, bert_config.hidden_size,
            name='embeddings_project'
        )

      with tf.variable_scope('encoder'):
        # Convert the 2D mask of shape [batch_size, seq_length] into a 3D mask
        # of shape [batch_size, seq_length, seq_length] that is applied to
        # the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            token_type_ids, input_mask
        )

        # Add causal masking to the attention to run a left-to-right or
        # right-to-left Transformer.
        if ltr or rtl:
          causal_mask = tf.ones((seq_length, seq_length))
          if ltr:
            causal_mask = tf.matrix_band_part(causal_mask, -1, 0)
          else:
            causal_mask = tf.matrix_band_part(causal_mask, 0, -1)
          attention_mask *= tf.expand_dims(causal_mask, 0)

        # Run the stacked Transformer. Output shapes:
        # sequence_output: [batch_size, seq_length, hidden_size]
        # pooled_output: [batch_size, hidden_size]
        # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size]
        # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
        (self.all_layer_outputs, self.attn_maps) = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=bert_config.hidden_size,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            intermediate_act_fn=get_activation(bert_config.hidden_act),
            hidden_dropout_prob=bert_config.hidden_dropout_prob,
            attention_probs_dropout_prob=
            bert_config.attention_probs_dropout_prob,
            initializer_range=bert_config.initializer_range,
            do_return_all_layers=True
        )
        self.sequence_output = self.all_layer_outputs[-1]
        # The "pooled" output is simply the hidden state of the first token.
        self.pooled_output = self.sequence_output[:, 0]

  def get_pooled_output(self):
    """Returns the first-token slice of the final encoder layer."""
    return self.pooled_output

  def get_sequence_output(self):
    """Returns the final hidden layer of the encoder."""
    return self.sequence_output

  def get_all_encoder_layers(self):
    """Returns the stacked outputs of every encoder layer."""
    return self.all_layer_outputs

  def get_embedding_output(self):
    """Returns the output of the embedding stage that is fed to the encoder."""
    return self.embedding_output

  def get_embedding_table(self):
    """Returns the word embedding table created by the embedding lookup."""
    return self.embedding_table

def gelu(input_tensor):
  """Gaussian Error Linear Unit.

  Computes `x * cdf(x)` where `cdf` is the standard normal cumulative
  distribution function, expressed through the error function.

  Args:
    input_tensor: Float tensor to apply the activation to.

  Returns:
    `input_tensor` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.math.erf(input_tensor / tf.sqrt(2.0)))
  return input_tensor * cdf

def get_activation(activation_string):
  """Resolves an activation name to the function implementing it.

  Example: 'relu' -> `tf.nn.relu`.

  Args:
    activation_string: Name of the activation ('linear', 'relu', 'gelu' or
      'tanh', case-insensitive). A value that is not a string is returned
      unchanged.

  Returns:
    The matching function, or None for 'linear' and for an empty string.

  Raises:
    ValueError: If the name is not one of the supported activations.
  """
  if isinstance(activation_string, six.string_types):
    if not activation_string:
      return None

    normalized = activation_string.lower()
    if normalized == 'linear':
      return None
    if normalized == 'relu':
      return tf.nn.relu
    if normalized == 'gelu':
      return gelu
    if normalized == 'tanh':
      return tf.tanh
    raise ValueError('Unsupported activation: %s' % normalized)

  # Not a string: hand the value back as-is.
  return activation_string

def get_assingment_map_from_checkpoint(tvars, init_checkpoint, prefix = ""):
  """Computes the overlap between the current variables and a checkpoint.

  Args:
    tvars: Iterable of variables of the current model; each must expose
      `.name`.
    init_checkpoint: Path of the checkpoint whose variables are listed.
    prefix: String prepended to each checkpoint variable name before it is
      looked up among the current variables.

  Returns:
    A tuple `(assignment_map, initialized_variable_names)`:
      assignment_map: OrderedDict mapping checkpoint variable name ->
        `prefix + name` for every checkpoint variable found in `tvars`.
      initialized_variable_names: Dict whose keys are the matched names, both
        with and without the ':0' suffix.
  """
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    # Strip the trailing ":<output index>" that TensorFlow appends.
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  initialized_variable_names = {}
  assignment_map = collections.OrderedDict()
  for x in tf.train.list_variables(init_checkpoint):
    name = x[0]
    if prefix + name not in name_to_variable:
      continue
    assignment_map[name] = prefix + name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ':0'] = 1

  return assignment_map, initialized_variable_names


# Correctly spelled alias; the misspelled name above is kept so existing
# callers continue to work.
get_assignment_map_from_checkpoint = get_assingment_map_from_checkpoint

def dropout(input_tensor, dropout_prob):
  """Applies dropout to a tensor.

  Args:
    input_tensor: Tensor to apply dropout to.
    dropout_prob: Probability of dropping a value (NOT the keep probability).
      When None or 0.0 the input is returned untouched.

  Returns:
    `input_tensor` itself, or the result of `tf.nn.dropout` on it.
  """
  dropout_enabled = dropout_prob is not None and dropout_prob != 0.0
  if dropout_enabled:
    # The second positional argument is passed as the complement of the
    # drop probability.
    return tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return input_tensor

def layer_norm(input_tensor, name = None):
  """Runs layer normalization over the last dimension of the tensor.

  Args:
    input_tensor: Tensor to normalize.
    name: Optional scope passed through to `contrib_layers.layer_norm`.

  Returns:
    The normalized tensor.
  """
  normalized = contrib_layers.layer_norm(
      inputs=input_tensor,
      begin_norm_axis=-1,
      begin_params_axis=-1,
      scope=name,
  )
  return normalized

def layer_norm_and_dropout(input_tensor, dropout_prob, name = None):
  """Runs layer normalization followed by dropout.

  Args:
    input_tensor: Tensor to process.
    dropout_prob: Drop probability forwarded to `dropout`.
    name: Optional scope forwarded to `layer_norm`.

  Returns:
    The normalized tensor after dropout.
  """
  return dropout(layer_norm(input_tensor, name), dropout_prob)

def create_initializer(initializer_range = 0.02):
  """Creates a truncated-normal initializer with the given standard deviation.

  Args:
    initializer_range: Value used as the initializer's `stddev`.

  Returns:
    A `tf.truncated_normal_initializer`.
  """
  stddev = initializer_range
  return tf.truncated_normal_initializer(stddev=stddev)

def embedding_lookup(input_ids, vocab_size, embedding_size = 128, initializer_range = 0.02,
                     word_embedding_name = 'word_embeddings', use_one_hot_embeddings = False):
  """Looks up word embeddings for an id tensor.

  Args:
    input_ids: Tensor of shape [batch_size, seq_length] containing word ids.
      A rank-3 input is instead multiplied directly with the embedding table
      (presumably a per-token distribution over the vocabulary -- confirm
      with the caller).
    vocab_size: Size of the embedding vocabulary.
    embedding_size: Width of the word embeddings.
    initializer_range: Embedding initialization range.
    word_embedding_name: Name of the embedding table variable.
    use_one_hot_embeddings: If True, use the one-hot method for the word
      embeddings. If False, use `tf.nn.embedding_lookup()`.

  Returns:
    A tuple `(output, embedding_table)` where `output` holds the embeddings
    and `embedding_table` is the [vocab_size, embedding_size] variable.
  """
  # This function assumes the input has shape
  # [batch_size, seq_length, num_inputs]. A 2D input of shape
  # [batch_size, seq_length] is reshaped to [batch_size, seq_length, 1].
  original_dims = input_ids.shape.ndims
  if original_dims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  if original_dims == 3:
    input_shape = get_shape_list(input_ids)
    # Flatten to a matrix so a plain 2D matmul with the table applies; the
    # result of this reshape was previously discarded.
    flat_input = tf.reshape(input_ids, [-1, input_shape[-1]])
    output = tf.matmul(flat_input, embedding_table)
    output = tf.reshape(output, [input_shape[0], input_shape[1], embedding_size])
  else:
    if use_one_hot_embeddings:
      flat_input_ids = tf.reshape(input_ids, [-1])
      one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
      output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
      output = tf.nn.embedding_lookup(embedding_table, input_ids)

    input_shape = get_shape_list(input_ids)

    output = tf.reshape(output,
                        input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return output, embedding_table

def embedding_postprocessor(input_tensor, use_token_type = False, token_type_ids = None,
                            token_type_vocab_size = 16, token_type_embedding_name = 'token_type_embeddings',
                            use_position_embeddings = True, position_embedding_name = 'position_embeddings',
                            initializer_range = 0.02, max_position_embeddings = 512, dropout_prob = 0.1):
  """Adds token type and position embeddings, then layer norm and dropout.

  Args:
    input_tensor: Tensor of shape [batch_size, seq_length, embedding_size].
    use_token_type: Whether to add embeddings for `token_type_ids`.
    token_type_ids: Tensor of shape [batch_size, seq_length]. Must be given
      when `use_token_type` is True.
    token_type_vocab_size: Vocabulary size of `token_type_ids`.
    token_type_embedding_name: Name of the embedding table variable for the
      token type ids.
    use_position_embeddings: Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: Name of the embedding table variable for the
      positional embeddings.
    initializer_range: Range of the weight initialization.
    max_position_embeddings: Maximum sequence length the model can be used
      with.
    dropout_prob: Dropout probability applied to the final output tensor.

  Returns:
    Tensor with the same shape as `input_tensor`.

  Raises:
    ValueError: If `use_token_type` is True and `token_type_ids` is None.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range)
    )
    # This vocabulary is small, so always use the one-hot approach here; it
    # is always faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings, [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range)
      )
      # The position embedding table is a learned variable, so it is created
      # with length `max_position_embeddings`. The actual sequence length may
      # be shorter, which allows faster training on shorter sequences.
      #
      # `full_position_embeddings` is effectively an embedding table for
      # positions [0, 1, 2, ..., max_position_embeddings-1] and the current
      # sequence has positions [0, 1, 2, ..., seq_length-1], so a slice is
      # all that is needed.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so broadcast across the leading dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
      position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output

def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Creates a 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D tensor of shape [batch_size, from_seq_length, ...].
    to_mask: Tensor of shape [batch_size, to_seq_length].

  Returns:
    Float tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  batch_size, from_seq_length = get_shape_list(
      from_tensor, expected_rank=[2, 3])[:2]
  to_seq_length = get_shape_list(to_mask, expected_rank=2)[1]

  # [batch_size, 1, to_seq_length]
  to_mask_3d = tf.reshape(to_mask, [batch_size, 1, to_seq_length])
  to_mask_3d = tf.cast(to_mask_3d, tf.float32)

  # `from_tensor` is not assumed to be a mask itself. Whether a *from* token
  # is padding does not matter (only *to* padding does), so a tensor of all
  # ones of shape [batch_size, from_seq_length, 1] is used for that side.
  ones_column = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Broadcasting along the two size-1 dimensions yields the full mask.
  return ones_column * to_mask_3d

def attention_layer(from_tensor, to_tensor, attention_mask = None, num_attention_heads = 1,
                    size_per_head = 512, query_act = None, key_act = None, value_act = None,
                    attention_probs_dropout_prob = 0.0, initializer_range = 0.02,
                    do_return_2d_tensor = False, batch_size = None, from_seq_length = None,
                    to_seq_length = None):
  """Performs multi-head attention from `from_tensor` to `to_tensor`.

  Args:
    from_tensor: Rank 2 or rank 3 tensor the queries are computed from.
    to_tensor: Tensor of the same rank the keys and values are computed from.
    attention_mask: Optional tensor of shape
      [batch_size, from_seq_length, to_seq_length] holding 1 for positions
      that may be attended to and 0 for masked positions.
    num_attention_heads: Number of attention heads.
    size_per_head: Size of each attention head.
    query_act: Optional activation for the query projection.
    key_act: Optional activation for the key projection.
    value_act: Optional activation for the value projection.
    attention_probs_dropout_prob: Dropout probability applied to the
      attention probabilities.
    initializer_range: Range of the weight initializer.
    do_return_2d_tensor: If True the context is returned with shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head],
      otherwise [batch_size, from_seq_length,
      num_attention_heads * size_per_head].
    batch_size: Batch size; required when the inputs are rank 2.
    from_seq_length: Sequence length of `from_tensor`; required when the
      inputs are rank 2.
    to_seq_length: Sequence length of `to_tensor`; required when the inputs
      are rank 2.

  Returns:
    A tuple `(context_layer, attention_probs)`; `attention_probs` has shape
    [batch_size, num_attention_heads, from_seq_length, to_seq_length].

  Raises:
    ValueError: If the ranks of the inputs differ, or the inputs are rank 2
      and any of the size arguments is missing.
  """
  def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width):
    # [B*S, N*H] -> [B, S, N, H] -> [B, N, S, H]
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width]
    )
    return tf.transpose(output_tensor, [0, 2, 1, 3])

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if batch_size is None or from_seq_length is None or to_seq_length is None:
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced below:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # query_layer = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d, num_attention_heads * size_per_head,
      activation=query_act,
      name='query',
      kernel_initializer=create_initializer(initializer_range)
  )

  # key_layer = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d, num_attention_heads * size_per_head,
      activation=key_act, name='key',
      kernel_initializer=create_initializer(initializer_range)
  )

  # value_layer = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d, num_attention_heads * size_per_head,
      activation=value_act, name='value',
      kernel_initializer=create_initializer(initializer_range)
  )

  # query_layer = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size, num_attention_heads,
                                     from_seq_length, size_per_head)

  # key_layer = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # The dot product of query and key gives the raw attention scores.
  # attention_scores = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # attention_mask = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # attention_mask is 1.0 for positions to attend to and 0.0 for masked
    # positions, so this yields 0.0 for the former and -10000.0 for the
    # latter.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Added to the raw scores before the softmax, this is effectively the
    # same as removing the masked positions entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # attention_probs = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This drops out entire tokens to attend to.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # value_layer = [B, T, N, H] -> [B, N, T, H]
  value_layer = transpose_for_scores(value_layer, batch_size, num_attention_heads,
                                     to_seq_length, size_per_head)

  # context_layer = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # context_layer = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # context_layer = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer, [batch_size * from_seq_length, num_attention_heads * size_per_head]
    )
  else:
    # context_layer = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer, [batch_size, from_seq_length, num_attention_heads * size_per_head]
    )

  return context_layer, attention_probs

def transformer_model(input_tensor, attention_mask = None, hidden_size = 768,
                      num_hidden_layers = 12, num_attention_heads = 12,
                      intermediate_size = 3072, intermediate_act_fn = gelu,
                      hidden_dropout_prob = 0.1, attention_probs_dropout_prob = 0.1,
                      initializer_range = 0.02, do_return_all_layers = False):
  """Multi-headed, multi-layer Transformer encoder.

  Args:
    input_tensor: Float tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: Optional tensor of shape
      [batch_size, seq_length, seq_length] with 1 for positions that may be
      attended to and 0 for masked positions.
    hidden_size: Hidden size of the Transformer.
    num_hidden_layers: Number of layers (blocks) in the Transformer.
    num_attention_heads: Number of attention heads per layer.
    intermediate_size: Size of the intermediate (feed-forward) layer.
    intermediate_act_fn: Activation applied to the output of the
      intermediate layer.
    hidden_dropout_prob: Dropout probability for the hidden layers.
    attention_probs_dropout_prob: Dropout probability for the attention
      probabilities.
    initializer_range: Range of the weight initializer.
    do_return_all_layers: Whether to return the output of every layer or
      only that of the final layer.

  Returns:
    A tuple `(outputs, attn_maps)`. `outputs` is the final layer output of
    shape [batch_size, seq_length, hidden_size], or all layer outputs stacked
    along a new leading axis when `do_return_all_layers` is True.
    `attn_maps` stacks the attention probabilities of every layer.

  Raises:
    ValueError: If `hidden_size` is not a multiple of `num_attention_heads`,
      or the width of `input_tensor` differs from `hidden_size`.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = hidden_size // num_attention_heads
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # Every layer adds a residual connection, so the input width has to equal
  # the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # The representation is kept as a 2D tensor to avoid reshaping back and
  # forth between 3D and 2D. Reshapes are free on GPU/CPU but may not be on
  # TPU, so they are minimized to help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  attn_maps = []
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope('layer_%d' % layer_idx):
      with tf.variable_scope('attention'):
        with tf.variable_scope('self'):
          attention_output, probs = attention_layer(
              from_tensor=prev_output, to_tensor=prev_output,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length
          )
          attn_maps.append(probs)

        # Linear projection to `hidden_size`, then the residual connection
        # with the layer input.
        with tf.variable_scope('output'):
          attention_output = tf.layers.dense(
              attention_output, hidden_size, kernel_initializer=create_initializer(initializer_range)
          )
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + prev_output)

      # The activation is only applied to the intermediate hidden layer.
      with tf.variable_scope('intermediate'):
        intermediate_output = tf.layers.dense(
            attention_output, intermediate_size, activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range)
        )

      # Project back down to `hidden_size`, then add the residual.
      with tf.variable_scope('output'):
        prev_output = tf.layers.dense(
            intermediate_output, hidden_size, kernel_initializer=create_initializer(initializer_range)
        )
        prev_output = dropout(prev_output, hidden_dropout_prob)
        prev_output = layer_norm(prev_output + attention_output)
        all_layer_outputs.append(prev_output)

  attn_maps = tf.stack(attn_maps, 0)
  if do_return_all_layers:
    return tf.stack([reshape_from_matrix(layer, input_shape) for layer in all_layer_outputs], 0), attn_maps
  else:
    return reshape_from_matrix(prev_output, input_shape), attn_maps

def get_shape_list(tensor, expected_rank = None, name = None):
  """Returns the shape of a tensor, preferring static dimensions.

  Args:
    tensor: A tensor, or a `np.ndarray` / Python list.
    expected_rank: Optional int or collection of ints with the accepted
      rank(s) of `tensor`.
    name: Optional name used in the rank error message; defaults to
      `tensor.name`.

  Returns:
    For an ndarray or list: the shape tuple of `np.array(tensor)`.
    For a tensor: a list of dimensions in which statically known dimensions
    are Python ints and unknown ones are scalar entries of `tf.shape(tensor)`.

  Raises:
    AssertionError: If an ndarray/list input does not have an expected rank.
    ValueError: If a tensor input does not have an expected rank (raised by
      `assert_rank`).
  """
  if isinstance(tensor, (np.ndarray, list)):
    shape = np.array(tensor).shape
    if expected_rank is not None:
      if isinstance(expected_rank, six.integer_types):
        assert len(shape) == expected_rank
      else:
        assert len(shape) in expected_rank
    return shape

  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  # Indexes of the dimensions that are not known statically.
  non_static_indexes = [index for index, dim in enumerate(shape) if dim is None]
  if not non_static_indexes:
    return shape

  # Fill the unknown dimensions with their dynamic (graph-time) values.
  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape

def reshape_to_matrix(input_tensor):
  """Reshapes a tensor of rank >= 2 into a rank 2 tensor (a matrix).

  Args:
    input_tensor: Tensor of rank 2 or higher.

  Returns:
    `input_tensor` itself when it is already rank 2, otherwise a tensor of
    shape [-1, width] where `width` is the size of the last dimension.

  Raises:
    ValueError: If the rank of `input_tensor` is below 2.
  """
  rank = input_tensor.shape.ndims
  if rank == 2:
    return input_tensor
  if rank < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))

  # Collapse every leading dimension and keep the last one.
  last_dim = input_tensor.shape[-1]
  return tf.reshape(input_tensor, [-1, last_dim])

def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 shape.

  Args:
    output_tensor: Rank 2 tensor to reshape.
    orig_shape_list: Shape of the tensor before it was flattened.

  Returns:
    `output_tensor` itself when the original shape has two entries,
    otherwise a tensor with the original leading dimensions and the current
    last dimension of `output_tensor`.
  """
  if len(orig_shape_list) != 2:
    matrix_shape = get_shape_list(output_tensor)
    leading_dims = orig_shape_list[0:-1]
    return tf.reshape(output_tensor, leading_dims + [matrix_shape[-1]])
  return output_tensor

def assert_rank(tensor, expected_rank, name = None):
  """Raises an exception if the tensor rank is not an expected rank.

  Args:
    tensor: Tensor whose rank is checked.
    expected_rank: Int, or collection of ints, with the accepted rank(s).
    name: Optional name of the tensor for the error message; defaults to
      `tensor.name`.

  Raises:
    ValueError: If the rank of `tensor` is not among the expected ranks.
  """
  if name is None:
    name = tensor.name

  if isinstance(expected_rank, six.integer_types):
    allowed_ranks = {expected_rank}
  else:
    allowed_ranks = set(expected_rank)

  actual_rank = tensor.shape.ndims
  if actual_rank not in allowed_ranks:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))