<a href="https://colab.research.google.com/github/gauss5930/Natural-Language-Processing/blob/main/BERT/BERT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Main BERT model
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re

import numpy as np
import six
import tensorflow as tf

class BertConfig(object):
  """Hyper-parameter container consumed by `BertModel`.

  Every constructor argument is stored as an instance attribute. All of them
  keep their argument name except `max_position_embedding`, which is stored
  as `max_position_embeddings` (the attribute name `BertModel` reads).
  """

  def __init__(self, vocab_size, hidden_size = 768, num_hidden_layers = 12, num_attention_heads = 12,
               intermediate_size = 3072, hidden_act = 'gelu', hidden_dropout_prob = 0.1,
               attention_probs_dropout_prob = 0.1, max_position_embedding = 512, type_vocab_size = 16,
               initializer_range = 0.02):
    """Store the configuration values.

    Args:
      vocab_size: Size of the word-id vocabulary.
      hidden_size: Width of the embeddings and of every encoder layer.
      num_hidden_layers: Number of stacked Transformer layers.
      num_attention_heads: Number of attention heads per layer.
      intermediate_size: Width of the feed-forward ("intermediate") layer.
      hidden_act: Activation name (or callable) resolved with
        `get_activation`.
      hidden_dropout_prob: Dropout probability used for embeddings and
        layer outputs.
      attention_probs_dropout_prob: Dropout probability applied to the
        attention probabilities.
      max_position_embedding: Number of rows in the position-embedding
        table; stored as `self.max_position_embeddings`.
      type_vocab_size: Vocabulary size of `token_type_ids`.
      initializer_range: Stddev of the truncated-normal weight initializer.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    # The keyword is singular but the attribute is plural; the original code
    # read an undefined plural local here and raised NameError.
    self.max_position_embeddings = max_position_embedding
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_objects):
    """Build a config from a dictionary of attribute names to values.

    Keys are written straight into the instance `__dict__`, so any key that
    is absent keeps the constructor default, and unknown keys become extra
    attributes.
    """
    config = cls(vocab_size = None)
    for (key, value) in json_objects.items():
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Build a config from the JSON file at path `json_file`."""
    with tf.gfile.GFile(json_file, 'r') as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  @classmethod
  def from_json_string(cls, json_object):
    """Build a config from a JSON file; `json_object` is the file path.

    Despite its name this method has always read from a file. It is kept
    for backward compatibility and delegates to `from_json_file`.
    """
    return cls.from_json_file(json_object)

  def to_dict(self):
    """Return a deep copy of the instance attributes as a dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serialize the config to indented, key-sorted JSON plus a newline."""
    return json.dumps(self.to_dict(), indent = 2, sort_keys = True) + '\n'


class BertModel(object):
  """BERT encoder: embeddings, a stack of Transformer layers and a pooler.

  Example usage:

  ```python
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=8, intermediate_size=1024)
  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ```
  """

  def __init__(self, config, is_training, input_ids, input_mask = None, token_type_ids = None,
               use_one_hot_embeddings = False, scope = None):
    """Build the BERT graph.

    Args:
      config: `BertConfig` instance. It is deep-copied, so the caller's
        object is never modified.
      is_training: True for a training model, False for an eval model.
        When False both dropout probabilities are forced to 0.0.
      input_ids: int Tensor of rank 2, `[batch_size, seq_length]`.
      input_mask: Optional Tensor `[batch_size, seq_length]`; defaults to
        all ones.
      token_type_ids: Optional Tensor `[batch_size, seq_length]`; defaults
        to all zeros.
      use_one_hot_embeddings: Forwarded to `embedding_lookup`; selects a
        one-hot matmul instead of a gather for the word embeddings.
      scope: Optional variable scope. Defaults to 'bert'.

    Raises:
      ValueError: If the config or a tensor shape is rejected by one of the
        helper functions.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank = 2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape = [batch_size, seq_length], dtype = tf.int32)

    if token_type_ids is None:
      token_type_ids = tf.zeros(shape = [batch_size, seq_length], dtype = tf.int32)

    with tf.variable_scope(scope, default_name = 'bert'):
      with tf.variable_scope('embeddings'):
        # Look up the word embeddings for the input ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids = input_ids, vocab_size = config.vocab_size,
            embedding_size = config.hidden_size,
            initializer_range = config.initializer_range,
            word_embedding_name = 'word_embeddings',
            use_one_hot_embeddings = use_one_hot_embeddings
        )

        # Add token-type and position embeddings, then apply layer
        # normalization and dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor = self.embedding_output,
            use_token_type = True,
            token_type_ids = token_type_ids,
            token_type_vocab_size = config.type_vocab_size,
            token_type_embedding_name = 'token_type_embeddings',
            use_position_embeddings = True,
            position_embedding_name = 'position_embeddings',
            initializer_range = config.initializer_range,
            max_position_embeddings = config.max_position_embeddings,
            dropout_prob = config.hidden_dropout_prob
        )

      # 'encoder' and 'pooler' are siblings of 'embeddings', not children:
      # nesting them would prefix every encoder variable with 'embeddings/'
      # and the names would no longer match standard BERT checkpoints.
      with tf.variable_scope('encoder'):
        # Turn the 2D mask [batch_size, seq_length] into the 3D mask
        # [batch_size, seq_length, seq_length] used for attention scores.
        attention_mask = create_attention_mask_from_input_mask(input_ids, input_mask)

        # Run the stacked Transformer layers.
        self.all_encoder_layers = transformer_model(
            input_tensor = self.embedding_output,
            attention_mask = attention_mask,
            hidden_size = config.hidden_size,
            num_hidden_layers = config.num_hidden_layers,
            num_attention_heads = config.num_attention_heads,
            intermediate_size = config.intermediate_size,
            intermediate_act_fn = get_activation(config.hidden_act),
            hidden_dropout_prob = config.hidden_dropout_prob,
            attention_probs_dropout_prob = config.attention_probs_dropout_prob,
            initializer_range = config.initializer_range,
            do_return_all_layers = True
        )

      self.sequence_output = self.all_encoder_layers[-1]
      with tf.variable_scope('pooler'):
        # "Pool" the model by taking the hidden state of the first token and
        # passing it through a tanh dense layer.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis = 1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor, config.hidden_size,
            activation = tf.tanh,
            kernel_initializer = create_initializer(config.initializer_range)
        )

  def get_pooled_output(self):
    """Return the pooler output built from the first token's hidden state."""
    return self.pooled_output

  def get_sequence_output(self):
    """Return the output of the last encoder layer."""
    return self.sequence_output

  def get_all_encoder_layers(self):
    """Return the list with the output of every encoder layer."""
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Return the embedding output after post-processing.

    This is the sum of word, token-type and position embeddings after layer
    normalization and dropout, i.e. the input to the first encoder layer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    """Return the word-embedding table variable."""
    return self.embedding_table

def gelu(x):
  """Gaussian Error Linear Unit, computed with the tanh approximation.

  Args:
    x: Tensor to activate.

  Returns:
    `x` multiplied element-wise by the approximated Gaussian CDF of `x`.
  """
  scale = np.sqrt(2 / np.pi)
  inner = scale * (x + 0.044715 * tf.pow(x, 3))
  return x * (0.5 * (1.0 + tf.tanh(inner)))

def get_activation(activation_string):
  """Map an activation name to the function that implements it.

  Args:
    activation_string: Name of the activation ('linear', 'relu', 'gelu' or
      'tanh', case-insensitive). Anything that is not a string is assumed
      to already be a callable and is returned unchanged.

  Returns:
    The activation function, or None for an empty name or 'linear'.

  Raises:
    ValueError: If the name is not one of the supported activations.
  """
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == 'linear':
    return None
  elif act == 'relu':
    return tf.nn.relu
  elif act == 'gelu':
    return gelu
  elif act == 'tanh':
    return tf.tanh
  else:
    raise ValueError('Unsupported activation: %s' % act)

def get_assingment_map_from_chekcpoint(tvars, init_checkpoint):
  """Match graph variables against the variables stored in a checkpoint.

  Args:
    tvars: Iterable of variables; only their `.name` is read.
    init_checkpoint: Checkpoint path passed to `tf.train.list_variables`.

  Returns:
    A tuple `(assignment_map, initialized_variable_names)`.
    `assignment_map` is an OrderedDict mapping each checkpoint variable name
    that also exists in `tvars` to itself. `initialized_variable_names` is a
    dict whose keys are the matched names, with and without the ':0' suffix.
  """
  initialized_variable_names = {}

  # Index the graph variables by name with the ":<output index>" suffix
  # stripped, because checkpoint names carry no such suffix.
  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match(r"^(.*):\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    name = x[0]
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ':0'] = 1

  return (assignment_map, initialized_variable_names)


# Correctly spelled alias; the misspelled name above is kept so existing
# callers continue to work.
get_assignment_map_from_checkpoint = get_assingment_map_from_chekcpoint

def dropout(input_tensor, dropout_prob):
  """Apply dropout to a tensor.

  Args:
    input_tensor: Tensor to apply dropout to.
    dropout_prob: Probability of dropping a value (not of keeping it).
      None or 0.0 disables dropout.

  Returns:
    `input_tensor` itself when dropout is disabled, otherwise the result of
    `tf.nn.dropout` called with a keep probability of `1.0 - dropout_prob`.
  """
  is_disabled = dropout_prob is None or dropout_prob == 0.0
  if is_disabled:
    return input_tensor

  keep_prob = 1.0 - dropout_prob
  return tf.nn.dropout(input_tensor, keep_prob)

def layer_norm(input_tensor, name=None):
  """Run layer normalization over the last dimension of a tensor.

  Args:
    input_tensor: Tensor to normalize.
    name: Optional scope forwarded to `tf.contrib.layers.layer_norm`.

  Returns:
    The normalized tensor.
  """
  normalized = tf.contrib.layers.layer_norm(
      inputs=input_tensor,
      begin_norm_axis=-1,
      begin_params_axis=-1,
      scope=name)
  return normalized
  
def layer_norm_and_dropout(input_tensor, dropout_prob, name = None):
  """Run layer normalization followed by dropout.

  Args:
    input_tensor: Tensor to process.
    dropout_prob: Probability of dropping a value; passed to `dropout`.
    name: Optional scope name passed to `layer_norm`.

  Returns:
    The normalized tensor with dropout applied.
  """
  output_tensor = layer_norm(input_tensor, name)
  # The dropout result must be assigned; the original used `-` here and
  # silently discarded it.
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor

def create_initializer(initializer_range=0.02):
  """Create a truncated-normal initializer.

  Args:
    initializer_range: Standard deviation of the distribution.

  Returns:
    A `tf.truncated_normal_initializer` with the given stddev.
  """
  initializer = tf.truncated_normal_initializer(stddev=initializer_range)
  return initializer

def embedding_lookup(input_ids, vocab_size, embedding_size=128, initializer_range=0.02,
                     word_embedding_name='word_embeddings', use_one_hot_embeddings=False):
  """Look up word embeddings for a tensor of ids.

  Args:
    input_ids: Tensor of word ids. A rank-2 tensor gets a trailing axis of
      size 1 appended before the lookup.
    vocab_size: Number of rows in the embedding table.
    embedding_size: Width of each embedding vector.
    initializer_range: Stddev used to initialize the table.
    word_embedding_name: Name of the embedding-table variable.
    use_one_hot_embeddings: If True the lookup is a one-hot matmul,
      otherwise it is a `tf.gather`.

  Returns:
    A tuple `(output, embedding_table)`. `output` keeps the leading
    dimensions of the expanded ids, and its last dimension is the last id
    dimension multiplied by `embedding_size`.
  """
  ids = input_ids
  if ids.shape.ndims == 2:
    ids = tf.expand_dims(ids, axis=[-1])

  table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  flat_ids = tf.reshape(ids, [-1])
  if not use_one_hot_embeddings:
    embedded = tf.gather(table, flat_ids)
  else:
    embedded = tf.matmul(tf.one_hot(flat_ids, depth=vocab_size), table)

  ids_shape = get_shape_list(ids)
  leading_dims = ids_shape[0:-1]
  flattened_width = ids_shape[-1] * embedding_size

  embedded = tf.reshape(embedded, leading_dims + [flattened_width])
  return (embedded, table)

def embedding_postprocessor(input_tensor, use_token_type = False, token_type_ids = None,
                            token_type_vocab_size = 16, token_type_embedding_name = 'token_type_embedding',
                            use_position_embeddings = True, position_embedding_name = 'position_embeddings',
                            initializer_range = 0.02, max_position_embeddings = 512, dropout_prob = 0.1):
  """Add token-type and position embeddings, then normalize and drop out.

  Args:
    input_tensor: Rank-3 Tensor `[batch_size, seq_length, width]`.
    use_token_type: Whether to add embeddings for `token_type_ids`.
    token_type_ids: Tensor reshaped to `[batch_size, seq_length]`. Required
      when `use_token_type` is True.
    token_type_vocab_size: Number of rows in the token-type table.
    token_type_embedding_name: Variable name of the token-type table.
      NOTE: the default is singular, while `BertModel` passes
      'token_type_embeddings' explicitly.
    use_position_embeddings: Whether to add learned position embeddings.
    position_embedding_name: Variable name of the position table.
    initializer_range: Stddev used to initialize both tables.
    max_position_embeddings: Number of rows in the position table;
      `seq_length` must not exceed it.
    dropout_prob: Dropout probability applied to the final output.

  Returns:
    Tensor with the same shape as `input_tensor`.

  Raises:
    ValueError: If `use_token_type` is True but `token_type_ids` is None, or
      if `input_tensor` is not rank 3.
  """
  input_shape = get_shape_list(input_tensor, expected_rank = 3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name = token_type_embedding_name,
        shape = [token_type_vocab_size, width],
        initializer = create_initializer(initializer_range)
    )
    # The token-type vocabulary is small, so the lookup is always done as a
    # one-hot matmul.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth = token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings, [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(name = position_embedding_name,
                                                 shape = [max_position_embeddings, width],
                                                 initializer = create_initializer(initializer_range))
      # The table is learned for `max_position_embeddings` positions, but
      # the actual sequence may be shorter, so only the first `seq_length`
      # rows are used.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions matter; prepend 1s so the position
      # embeddings broadcast over the leading (batch) dimensions.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output

def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create a 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: Rank-2 or rank-3 Tensor; only its first two dimensions
      (`batch_size`, `from_seq_length`) are read.
    to_mask: Rank-2 Tensor `[batch_size, to_seq_length]`.

  Returns:
    float32 Tensor `[batch_size, from_seq_length, to_seq_length]` in which
    every "from" row of a batch entry is a copy of that entry's `to_mask`.
  """
  from_shape = get_shape_list(from_tensor, expected_rank = [2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank = 2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  broadcast_ones = tf.ones(shape = [batch_size, from_seq_length, 1], dtype = tf.float32)

  # Broadcasting [B, F, 1] * [B, 1, T] produces the [B, F, T] mask.
  mask = broadcast_ones * to_mask

  return mask

def attention_layer(from_tensor, to_tensor, attention_mask = None, num_attention_heads = 1,
                    size_per_head = 512, query_act = None, key_act = None, value_act = None,
                    attention_probs_dropout_prob = 0.0, initializer_range = 0.02,
                    do_return_2d_tensor = False, batch_size = None, from_seq_length = None,
                    to_seq_length = None):
  """Perform multi-headed attention from `from_tensor` to `to_tensor`.

  `from_tensor` is projected into a "query" tensor, and `to_tensor` into
  "key" and "value" tensors. Query and key are dot-producted and scaled, a
  softmax turns the scores into attention probabilities, and these weight
  the value tensor. The per-head results are concatenated into one tensor.
  When `from_tensor` and `to_tensor` are the same this is self-attention.

  Args:
    from_tensor: Rank-2 or rank-3 Tensor.
    to_tensor: Tensor of the same rank as `from_tensor`.
    attention_mask: Optional Tensor `[B, F, T]` with 1 for positions that
      may be attended to and 0 for masked positions.
    num_attention_heads: Number of attention heads (N).
    size_per_head: Width of each head (H).
    query_act: Optional activation for the query projection.
    key_act: Optional activation for the key projection.
    value_act: Optional activation for the value projection.
    attention_probs_dropout_prob: Dropout probability for the attention
      probabilities.
    initializer_range: Stddev of the weight initializer.
    do_return_2d_tensor: If True return `[B*F, N*H]`, otherwise
      `[B, F, N*H]`.
    batch_size: Required when the inputs are rank 2.
    from_seq_length: Required when the inputs are rank 2.
    to_seq_length: Required when the inputs are rank 2.

  Returns:
    The context tensor, `[B*F, N*H]` or `[B, F, N*H]`.

  Raises:
    ValueError: If the input ranks differ, or if the inputs are rank 2 and
      any of the three size arguments is missing.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads, seq_length, width):
    # [B*S, N*H] -> [B, S, N, H] -> [B, N, S, H]
    output_tensor = tf.reshape(input_tensor, [batch_size, seq_length, num_attention_heads, width])

    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank = [2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank = [2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError('The rank of "from_tensor" must match the rank of "to_tensor".')

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced in the comments below:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # query_layer = [B*F, N*H]
  query_layer = tf.layers.dense(from_tensor_2d, num_attention_heads * size_per_head,
                                activation = query_act, name = 'query',
                                kernel_initializer = create_initializer(initializer_range))

  # key_layer = [B*T, N*H]; keys are projected from `to_tensor`.
  key_layer = tf.layers.dense(to_tensor_2d, num_attention_heads * size_per_head,
                              activation = key_act, name = 'key',
                              kernel_initializer = create_initializer(initializer_range))

  # value_layer = [B*T, N*H]
  value_layer = tf.layers.dense(to_tensor_2d, num_attention_heads * size_per_head,
                                activation = value_act, name = 'value',
                                kernel_initializer = create_initializer(initializer_range))

  # query_layer = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size, num_attention_heads,
                                     from_seq_length, size_per_head)

  # key_layer = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Dot product between query and key gives the raw attention scores.
  # attention_scores = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b = True)
  attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # attention_mask = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis = [1])

    # The mask is 1.0 for positions to attend to and 0.0 for masked ones,
    # so this yields 0.0 for attended positions and -10000.0 for masked ones.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Adding this to the raw scores before the softmax is effectively the
    # same as removing the masked positions entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # attention_probs = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # value_layer = [B, T, N, H]
  value_layer = tf.reshape(value_layer, [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # value_layer = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # context_layer = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # context_layer = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # context_layer = [B*F, N*H]
    context_layer = tf.reshape(context_layer, [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # context_layer = [B, F, N*H]
    context_layer = tf.reshape(context_layer, [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer

def transformer_model(input_tensor,
                      attention_mask = None,
                      hidden_size = 768,
                      num_hidden_layers = 12,
                      intermediate_size = 3072,
                      intermediate_act_fn = gelu,
                      hidden_dropout_prob = 0.1,
                      attention_probs_dropout_prob = 0.1,
                      initializer_range = 0.02,
                      do_return_all_layers = False,
                      num_attention_heads = 12):
  """Stack of Transformer encoder layers with multi-headed self-attention.

  Args:
    input_tensor: Rank-3 Tensor `[batch_size, seq_length, hidden_size]`.
    attention_mask: Optional mask `[batch_size, seq_length, seq_length]`
      forwarded to `attention_layer`.
    hidden_size: Width of every layer; must equal the input width because
      each layer adds a residual connection.
    num_hidden_layers: Number of layers to stack.
    intermediate_size: Width of the feed-forward ("intermediate") layer.
    intermediate_act_fn: Activation applied to the intermediate layer only.
    hidden_dropout_prob: Dropout probability for the layer outputs.
    attention_probs_dropout_prob: Dropout probability for the attention
      probabilities.
    initializer_range: Stddev of the weight initializer.
    do_return_all_layers: If True return the outputs of all layers as a
      list, otherwise only the final layer's output.
    num_attention_heads: Number of attention heads. Appended at the end of
      the signature so existing positional calls keep their meaning.

  Returns:
    A Tensor shaped like `input_tensor`, or a list of such tensors (one per
    layer) when `do_return_all_layers` is True.

  Raises:
    ValueError: If `hidden_size` is not a multiple of
      `num_attention_heads`, or if the input width differs from
      `hidden_size`.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank = 3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # Every layer adds a residual, so the input width must equal hidden_size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # Work on a 2D [batch_size * seq_length, hidden_size] matrix inside the
  # loop; the 3D shape is restored when the outputs are returned.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope('layer_%d' % layer_idx):
      layer_input = prev_output

      with tf.variable_scope('attention'):
        attention_heads = []
        with tf.variable_scope('self'):
          attention_head = attention_layer(
              from_tensor = layer_input,
              to_tensor = layer_input,
              attention_mask = attention_mask,
              num_attention_heads = num_attention_heads,
              size_per_head = attention_head_size,
              attention_probs_dropout_prob = attention_probs_dropout_prob,
              initializer_range = initializer_range,
              do_return_2d_tensor = True,
              batch_size = batch_size,
              from_seq_length = seq_length,
              to_seq_length = seq_length
          )
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # With more than one attention result, concatenate them before
          # the output projection.
          attention_output = tf.concat(attention_heads, axis = -1)

        # Project linearly to hidden_size and add the residual.
        with tf.variable_scope('output'):
          attention_output = tf.layers.dense(attention_output, hidden_size,
                                             kernel_initializer = create_initializer(initializer_range))

          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation function is only applied to the intermediate layer.
      with tf.variable_scope('intermediate'):
        intermediate_output = tf.layers.dense(attention_output, intermediate_size,
                                              activation = intermediate_act_fn,
                                              kernel_initializer = create_initializer(initializer_range))

      # Project back down to hidden_size and add the residual.
      with tf.variable_scope('output'):
        layer_output = tf.layers.dense(intermediate_output, hidden_size,
                                       kernel_initializer = create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output

def get_shape_list(tensor, expected_rank=None, name=None):
  """Return the shape of a tensor as a list, preferring static sizes.

  Args:
    tensor: Tensor whose shape is wanted.
    expected_rank: Optional int or iterable of ints. When given, the rank is
      validated with `assert_rank` first.
    name: Optional name used in the rank error message; defaults to
      `tensor.name`.

  Returns:
    A list with one entry per dimension. Statically known dimensions are
    Python integers; unknown ones are scalar slices of `tf.shape(tensor)`.
  """
  tensor_name = tensor.name if name is None else name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, tensor_name)

  dims = tensor.shape.as_list()
  dynamic_axes = [axis for axis, size in enumerate(dims) if size is None]
  if len(dynamic_axes) == 0:
    return dims

  # Fill the statically unknown dimensions from the runtime shape.
  runtime_shape = tf.shape(tensor)
  for axis in dynamic_axes:
    dims[axis] = runtime_shape[axis]
  return dims

def reshape_to_matrix(input_tensor):
  """Reshape a tensor of rank >= 2 into a rank-2 tensor (a matrix).

  Args:
    input_tensor: Tensor of rank 2 or higher.

  Returns:
    `input_tensor` itself when it is already rank 2, otherwise a reshaped
    `[-1, width]` tensor where `width` is the size of the last dimension.

  Raises:
    ValueError: If the tensor has rank lower than 2.
  """
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     str(input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  return tf.reshape(input_tensor, [-1, width])


def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshape a rank-2 tensor back to its original rank >= 2 shape.

  This is the inverse of `reshape_to_matrix`; `transformer_model` uses it to
  restore the 3D shape of the layer outputs.

  Args:
    output_tensor: Rank-2 Tensor.
    orig_shape_list: Shape list of the tensor before it was flattened.

  Returns:
    `output_tensor` itself when the original shape was rank 2, otherwise the
    tensor reshaped to the leading dimensions of `orig_shape_list` followed
    by the width of `output_tensor`.
  """
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])

def assert_rank(tensor, expected_rank, name=None):
  """Raise an error if the rank of a tensor is not an expected rank.

  Args:
    tensor: Tensor whose rank is checked.
    expected_rank: An int, or an iterable of ints, listing accepted ranks.
    name: Optional name used in the error message; defaults to
      `tensor.name`.

  Raises:
    ValueError: If the actual rank is not one of the expected ranks.
  """
  if name is None:
    name = tensor.name

  if isinstance(expected_rank, six.integer_types):
    allowed_ranks = {expected_rank}
  else:
    allowed_ranks = set(expected_rank)

  actual_rank = tensor.shape.ndims
  if actual_rank in allowed_ranks:
    return

  scope_name = tf.get_variable_scope().name
  raise ValueError(
      "For the tensor `%s` in scope `%s`, the actual rank "
      "`%d` (shape = %s) is not equal to the expected rank `%s`" %
      (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))