## 스팀 리뷰 데이터 감성 분석 모델(이진 분류)
- 사용 데이터 : 스팀 보유자 수 기준 상위 5000개 + a 게임의 한국어 리뷰 데이터, 총 194263개
  - 각 리뷰는 최대 8000글자를 담을 수 있음
- 모델 : [krBERT](https://github.com/snunlp/KR-BERT)을 Fine tuning함


## tf 2.x 버전 실행


In [1]:
!pip install transformers
!pip install tf-models-official
!pip install keras-tuner

import os
os.chdir('drive/MyDrive/Now/KR-BERT/krbert_tensorflow')
from transformers import BertConfig, TFBertModel, load_tf_weights_in_bert
import logging
import pandas as pd
import tensorflow as tf
import tensorflow_models as tfm
import numpy as np
from sklearn.model_selection import train_test_split
import re
import csv
from google.colab import auth
from google.cloud import storage
import subprocess
import collections
import matplotlib.pyplot as plt
from tensorflow.keras import backend as K
import keras_tuner

# krbert의 `.py` 파일
import modeling
import tokenization_ranked as tokenization
import optimization

tf.get_logger().setLevel(logging.INFO)

real_init_checkpoint = 'gs://steam-project-bucket/krbert_tensorflow/MyModel/'
real_bert_config_file = 'bert_config_char16424.json'
real_data_dir = './data/steam/'
real_vocab_file = 'vocab_char_16424.txt'

# 모델 훈련 체크포인트 & 결과가 저장되는 디렉토리
# output_dir = "gs://steam-project-bucket/krbert_tensorflow/ModelTrain/"


class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Constructs a InputExample.

    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.label = label

class DataProcessor(object):
  """Base class for data converters for sequence classification data sets."""

  def get_train_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the train set."""
    raise NotImplementedError()

  def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    raise NotImplementedError()

  def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    raise NotImplementedError()

  def get_labels(self):
    """Gets the list of labels for this data set."""
    raise NotImplementedError()

  @classmethod
  def _read_tsv(cls, input_file, quotechar=None):
    """Reads a tab separated value file."""
    with tf.io.gfile.GFile(input_file, "r") as f:
      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
      lines = []
      for line in reader:
        lines.append(line)
      return lines

class SteamProcessor(DataProcessor):
  """
  직접 수정 : 상속 받아서 메서드 오버라이드함
  """
  def get_train_examples(self, data_dir, csv_name):
    """See base class."""
    return self._create_examples(
        self._read_csv(os.path.join(data_dir, csv_name)), "train")

  def get_dev_examples(self, data_dir, csv_name):
    """See base class."""
    return self._create_examples(
        self._read_csv(os.path.join(data_dir, csv_name)), "dev")

  def get_test_examples(self, data_dir, csv_name):
    """See base class."""
    return self._create_examples(
        self._read_csv(os.path.join(data_dir, csv_name)), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0: continue
      guid = "%s-%s" % (set_type, i)

      # 230831 수정
      # text_a는 모든 문장을 가져옴
      # 또한, 일부 문장의 양 끝에 "이 나타나므로 이를 제거함
      entire_text = "".join(line[1:-1]) # 문장 합침
      entire_text = self.preprocess_text(entire_text)

      text_a = tokenization.convert_to_unicode(entire_text)
      label = tokenization.convert_to_unicode(line[-1])

      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
      # if i == 10:
      #   break
    return examples

  # 230831 : 텍스트 가공(필요 없는 문자 제거, 문장 통합..)
  def preprocess_text(self, text):

    processed_text = text.strip('"')

    # http:// 로 시작하는 링크 제거
    http_pattern = r"https?://\S+" # http://나 https://로 시작하며 공백문자가 아닌 패턴
    processed_text = re.sub(http_pattern, '', processed_text)

    # HTML 태그 제거
    html_pattern = r"\[.*?\]"
    processed_text = re.sub(html_pattern, '', processed_text)

    return processed_text

  @classmethod
  def _read_csv(cls, input_file, quotechar = None):
    with tf.io.gfile.GFile(input_file, "r") as f:
      reader = csv.reader(f, quotechar=quotechar)
      lines = []
      for line in reader:
        lines.append(line)
      return lines

def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.compat.v1.python_io.TFRecordWriter(output_file)
  # writer = tf.io.TFRecordWriter(output_file)

  for (ex_index, example) in enumerate(examples):

    if ex_index % 10000 == 0:
      tf.compat.v1.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)

    def create_int_feature(values):
      f = tf.compat.v1.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.compat.v1.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()


# 사용 함수 모음

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

def preprocess_text(text):
  """
  http:// 제거 및 HTML 태그 제거
  """

  processed_text = text.strip('"')

  # http:// 로 시작하는 링크 제거
  http_pattern = r"https?://\S+" # http://나 https://로 시작하며 공백문자가 아닌 패턴
  processed_text = re.sub(http_pattern, '', processed_text)

  # HTML 태그 제거
  html_pattern = r"\[.*?\]"
  processed_text = re.sub(html_pattern, '', processed_text)

  return processed_text


def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  # 230918 : 토큰화 전, 필요없어 보이는 텍스트 제거
  text_a = preprocess_text(example.text_a)

  tokens_a = tokenizer.tokenize(text_a)
  tokens_b = None
  if example.text_b:
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
  else:
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
  # (b) For single sequences:
  #  tokens:   [CLS] the dog is hairy . [SEP]
  #  type_ids: 0     0   0   0  0     0 0
  #
  # Where "type_ids" are used to indicate whether this is the first
  # sequence or the second sequence. The embedding vectors for `type=0` and
  # `type=1` were learned during pre-training and are added to the wordpiece
  # embedding vector (and position vector). This is not *strictly* necessary
  # since the [SEP] token unambiguously separates the sequences, but it makes
  # it easier for the model to learn the concept of sequences.
  #
  # For classification tasks, the first vector (corresponding to [CLS]) is
  # used as the "sentence vector". Note that this only makes sense because
  # the entire model is fine-tuned.
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")
  segment_ids.append(0)
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)
  tokens.append("[SEP]")
  segment_ids.append(0)

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length.
  while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    tf.compat.v1.logging.info("*** Example ***")
    tf.compat.v1.logging.info("guid: %s" % (example.guid))
    tf.compat.v1.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(x) for x in tokens]))
    tf.compat.v1.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    tf.compat.v1.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
    tf.compat.v1.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
    tf.compat.v1.logging.info("label: %s (id = %d)" % (example.label, label_id))

  feature = InputFeatures(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      label_id=label_id,
      is_real_example=True)

  return feature

class InputExample(object):
  """A single training/test example for simple sequence classification."""

  def __init__(self, guid, text_a, text_b=None, label=None):
    """Constructs a InputExample.

    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.label = label

class PaddingInputExample(object):
  """Fake example so the num input examples is a multiple of the batch size.

  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.

  We use this class instead of `None` because treating `None` as padding
  batches could cause silent errors.
  """

class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self,
               input_ids,
               input_mask,
               segment_ids,
               label_id,
               is_real_example=True):
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.label_id = label_id
    self.is_real_example = is_real_example

def _decode_record(record, name_to_features):
  """Decodes a record to a TensorFlow example."""
  example = tf.compat.v1.io.parse_single_example(record, name_to_features)

  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
  # So cast all int64 to int32.
  for name in list(example.keys()):
    t = example[name]
    if t.dtype == tf.compat.v1.int64:
      t = tf.compat.v1.to_int32(t)
    example[name] = t

  return example


def create_krbert_model(bert_config, num_labels, dropout_rate = 0.1):
    """
    Creates a classification model.
    230919 : BERT Model 자체는 Hugging Face로 구축하지만 손실함수 구축 때문에 가져온다.
    + 텐서플로우 2버전에 맞게 코드를 변경한다.
    """
    bert_model = TFBertModel(config=bert_config)

    input_ids_input = tf.keras.layers.Input(shape=(None, ), dtype=tf.int32, name='input_ids')
    input_mask_input = tf.keras.layers.Input(shape=(None, ), dtype=tf.int32, name='input_mask')
    segment_ids_input = tf.keras.layers.Input(shape=(None, ), dtype=tf.int32, name='segment_ids')

    outputs = bert_model(
        input_ids=input_ids_input,
        attention_mask=input_mask_input
    )

    output_layer = outputs[1]
    dropout_layer = tf.keras.layers.Dropout(dropout_rate)(output_layer)

    classification_layer = tf.keras.layers.Dense(1,
                                         activation = 'sigmoid',
                                         name = 'output_layer')(dropout_layer)


    model = tf.keras.models.Model(inputs = [input_ids_input,
                                         input_mask_input,
                                         segment_ids_input],
                               outputs = classification_layer)

    return model

def parse_record(record):
  return _decode_record(record, name_to_features)

def split_data_and_label(dataset):
  features = {
      'input_ids' : dataset['input_ids'],
      'input_mask' : dataset['input_mask'],
      'is_real_example' : dataset['is_real_example'],
      'segment_ids' : dataset['segment_ids']
  }
  target = dataset['label_ids']
  return features, target


# def lr_warmup(learning_rate, warmup_steps_ratio, global_steps, total_steps):

#   """
#   warmup_step 동안 학습률이 선형적으로 증가하고, 이후는 일정한 값을 유지
#   KrBERT는 AdamW를 이용하므로 이후에 학습률은 Decay됨

#   <혼동 방지>
#   global_steps : 모든 에포크를 통틀어 한 배치 당 1스텝
#   total_steps : 한 에포크 당 스텝(=전체 데이터를 배치 수로 나눈 값) * 에포크 수
#   """

#   warmup_steps = total_steps * warmup_steps_ratio
#   warmup_steps = tf.constant(num_warmup_steps, dtype=tf.float32)
#   global_steps = tf.cast(global_steps, tf.float32)

#   warmup_percent_done = global_steps / warmup_steps
#   warmup_learning_rate = learning_rate * warmup_percent_done

#   is_warmup = tf.cast(global_steps < warmup_steps, tf.float32)
#   learning_rate = (
#       (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

#   return learning_rate

class WarmupCallback(tf.keras.callbacks.Callback):
    def __init__(self, total_steps=0, warmup_ratio = 0.0):

        super(WarmupCallback, self).__init__()
        # self.learning_rate = learning_rate
        self.total_steps = total_steps
        self.warmup_ratio = warmup_ratio
        self.global_step = 0
        self.lrs = []

    def on_train_batch_begin(self, batch, logs=None):
        previous_lr = (self.lrs[-1] if self.lrs
                                    else K.get_value(self.model.optimizer.learning_rate))

        lr = self.lr_warmup(learning_rate = previous_lr,
                        global_steps=self.global_step,
                        total_steps=self.total_steps,
                        warmup_steps_ratio=self.warmup_ratio,
                       )
        # print('step start :', lr)
        K.set_value(self.model.optimizer.lr, lr)

    def on_train_batch_end(self, batch, logs=None):
        self.global_step = self.global_step + 1

        # 학습률 저장
        # lr = K.get_value(self.model.optimizer.lr)
        lr = K.get_value(self.model.optimizer.learning_rate)
        # print('step end : ', lr)

        self.lrs.append(lr)

    def lr_warmup(self, learning_rate, warmup_steps_ratio, global_steps, total_steps):

      """
      warmup_step 동안 학습률이 선형적으로 증가하고, 이후는 일정한 값을 유지
      KrBERT는 AdamW를 이용하므로 이후에 학습률은 Decay됨

      <혼동 방지>
      global_steps : 모든 에포크를 통틀어 한 배치 당 1스텝
      total_steps : 한 에포크 당 스텝(=전체 데이터를 배치 수로 나눈 값) * 에포크 수
      """

      warmup_steps = total_steps * warmup_steps_ratio
      warmup_steps = tf.constant(warmup_steps, dtype=tf.float32)
      global_steps = tf.cast(global_steps, tf.float32)

      warmup_percent_done = global_steps / warmup_steps
      warmup_learning_rate = learning_rate * warmup_percent_done

      is_warmup = tf.cast(global_steps < warmup_steps, tf.float32)
      learning_rate = (
          (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

      return learning_rate

# 230927 : 학습률 변화 추적 콜백함수
class LearningRateMonitor(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs = None):
    lr = self.model.optimizer.learning_rate
    print(f"Epoch {epoch + 1} : Learning Rate : {lr.numpy()}")

# 230927 : 웜업 스텝 추적 콜백함수
class WarmupLrPrintCallback(tf.keras.callbacks.Callback):
    def __init__(self, warmup_steps):
        super(WarmupLrPrintCallback, self).__init__()
        self.warmup_steps = warmup_steps
        self.current_step = 1

    def on_train_batch_end(self, batch, logs=None):
        # 웜업 스텝 범위 내에서만 실행
        if self.current_step % 500 == 0 and self.current_step < self.warmup_steps:
          learning_rate = self.model.optimizer.learning_rate.numpy()
          print(f"Now Warm-up Step : {self.current_step}, Learning Rate = {learning_rate:.6f}")

        if self.current_step == self.warmup_steps:
            end_warmup_lr = self.model.optimizer.learning_rate.numpy()
            print(f'End Warm-up Step :{self.current_step}: Learning Rate = {end_warmup_lr:.6f}')
        self.current_step += 1

# 하이퍼파라미터 튜닝
EPOCHS = 15

def build_model_hpo(hp, hpo = True):

  if hpo:
    # freeze_body = hp.Boolean('freeze_body')
    dropout_rate = hp.Float('dropout_rate', min_value = 0, max_value = 0.5, step = 0.1)
    weight_decay = hp.Float('weight_decay', min_value = 0.01, max_value = 0.1, step = 0.01)
    learning_rate = hp.Float('Learning_rate', min_value = 5e-6, max_value = 5e-5, step = 5e-6)

    # 모델 훈련 파라미터
    # warmup_ratio = 0.1
    # warmup_steps = int(train_total_steps * warmup_ratio)
  # {'dropout_rate': 0.0,
  # 'learning_rate': 1.5000000000000002e-05,
  # 'warmup_steps': 1500,
  # 'end_learning_rate': 1e-07,
  # 'weight_decay': 0.05}
  else:
    droput_rate = 0.1
    weight_decay = 5e-2

  # learning_rate = 1.5e-5
  warmup_steps = 1500
  end_learning_rate = 1e-7

  epochs = EPOCHS
  train_steps_per_epoch = len(train_ex) // train_batch_size
  train_total_steps = train_steps_per_epoch * epochs
  decay_steps = train_total_steps - warmup_steps

  # 모델 정의 및 훈련
  config = BertConfig.from_json_file(real_bert_config_file)

  with strategy.scope():
    model = create_krbert_model(
        bert_config = config,
        num_labels = 2,
        dropout_rate = dropout_rate
    )
    load_tf_weights_in_bert(model,
                          config,
                          "./MyModel_raw/model.ckpt-2000000.index")

  loss = tf.keras.losses.BinaryCrossentropy()

  lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      learning_rate,
      decay_steps,
      end_learning_rate = end_learning_rate,
      power = 1.0,
      cycle = False
  )

  warmup_lr_schedule = tfm.optimization.PolynomialWarmUp(
      after_warmup_lr_sched=lr_schedule,
      warmup_steps=warmup_steps,
      power=1.0
  )

  optimizer = tf.keras.optimizers.AdamW(
      learning_rate = warmup_lr_schedule,
      weight_decay = weight_decay,
      epsilon = 1e-6,
      clipnorm = 1.0
  )

  model.compile(
      optimizer = optimizer,
      loss = loss,
      metrics = ['accuracy']
  )

  return model

# GCS 연동, TPU 설정, csv 파일에서 데이터를 나눠서 다시 저장

# GCS 연동 과정
auth.authenticate_user()

# Google Cloud 프로젝트 ID
project_id = 'copying-book'
# project_id = 'service-copying-book@cloud-tpu.iam.gserviceaccount.com'

!gcloud config set account dowrave@gmail.com

# 이거 자동화는 무리일 듯 : 2개의 확인을 거쳐 들어가야 하고, 보안 키는 계속 달라짐
!gcloud auth login --no-launch-browser --quiet --force
!gcloud config set project 'copying-book'
# !gsutil acl ch -u dowrave@gmail.com:WRITE gs://steam-project-bucket

# GCS 클라이언트
client = storage.Client()

# GCS 버킷 이름
bucket_name = 'steam-project-bucket'

# GCS 버킷 객체
bucket = client.get_bucket(bucket_name)
blob = bucket.blob('krbert_tensorflow/ModelTrain')

!echo $COLAB_TPU_ADDR # TPU의 IP와 포트 확인

try:
  TPU_PATH = f"grpc://{os.environ['COLAB_TPU_ADDR']}"
  use_tpu = True
except:
  use_tpu = False

record_dir = "gs://steam-project-bucket/krbert_tensorflow/ModelTrain/"
tf.io.gfile.makedirs(record_dir)


if use_tpu:
  # TPU와 런타임 연결

  tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_PATH)

  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)

else:
  strategy = tf.distribute.get_strategy()

# 토크나이저
vocab_file = './vocab_char_16424.txt'
tokenizer = tokenization.FullTokenizer(vocab_file = real_vocab_file,
                                      do_lower_case = False)
max_seq_length = 512 # 고정

train_batch_size = 64
# val_batch_size = train_batch_size // 8 # 231021 : 데이터 사이즈에 맞춤
val_batch_size = 32

project_name = f"TEST1_steam_krbert_BO_seq{max_seq_length}_{train_batch_size}_{val_batch_size}/"
project_exists = bucket.blob(f'krbert_tensorflow/ModelTrain/' + project_name).exists()

# tfrecord 파일 경로
train_tfrecord_name = f'REAL_LAST_train{train_batch_size}_seq{max_seq_length}.tfrecord'
val_tfrecord_name = f'REAL_LAST_val{train_batch_size}_seq{max_seq_length}.tfrecord'
test_tfrecord_name = f'REAL_LAST_test{train_batch_size}_seq{max_seq_length}.tfrecord'

train_exists = bucket.blob(f'krbert_tensorflow/ModelTrain/' + train_tfrecord_name).exists()
val_exists = bucket.blob(f'krbert_tensorflow/ModelTrain/' + val_tfrecord_name).exists()
test_exists = bucket.blob(f'krbert_tensorflow/ModelTrain/' + test_tfrecord_name).exists()

record_dir = "gs://steam-project-bucket/krbert_tensorflow/ModelTrain/"
train_file_dir = record_dir + train_tfrecord_name
val_file_dir = record_dir + val_tfrecord_name
test_file_dir = record_dir + test_tfrecord_name

train_csv_name = f'train_data_{train_batch_size}.csv'
val_csv_name = f'val_data_{train_batch_size}.csv'
test_csv_name = f'test_data_{train_batch_size}.csv'

# tfrecord가 없으면 데이터 분리 = csv 파일을 다시 만든다.
# if train_exists == False or val_exists == False:

# 프로젝트명의 폴더가 없으면 데이터 분리부터 작업 진행.
if project_exists == False:
  print("데이터 재생성 및 분리")

  # 인풋 데이터 훈련, 검증, 테스트 데이터로 분리
  raw_df = pd.read_csv('../../korean_review_raw.csv', index_col = 0)
  df = raw_df[['id', 'review', 'recommend']]
  df = df.dropna(subset = ['review'])


  # 비율은 8 : 1 : 1
  train_data, temp_data = train_test_split(df,
                                          test_size = 0.2,
                                          stratify = df['recommend'])

  val_data, test_data = train_test_split(temp_data,
                                          test_size = 0.5,
                                          stratify = temp_data['recommend'])

  train_data.to_csv(real_data_dir + train_csv_name, index=False)
  val_data.to_csv(real_data_dir + val_csv_name, index=False)
  test_data.to_csv(real_data_dir + test_csv_name, index=False)

  processor = SteamProcessor()

  train_ex = processor.get_train_examples(real_data_dir, train_csv_name)
  val_ex = processor.get_dev_examples(real_data_dir, val_csv_name)
  test_ex = processor.get_test_examples(real_data_dir, test_csv_name)
  label_list = processor.get_labels()

  # train
  file_based_convert_examples_to_features(train_ex,
                                          label_list,
                                          max_seq_length,
                                          tokenizer,
                                          train_file_dir)
  # val
  file_based_convert_examples_to_features(val_ex,
                                          label_list,
                                          max_seq_length,
                                          tokenizer,
                                          val_file_dir)

  # test
  file_based_convert_examples_to_features(test_ex,
                                          label_list,
                                          max_seq_length,
                                          tokenizer,
                                          test_file_dir)

else:
  print("이미 tfrecord 파일이 있음")
  processor = SteamProcessor()
  train_ex = processor.get_train_examples(real_data_dir, train_csv_name)
  val_ex = processor.get_dev_examples(real_data_dir, val_csv_name)
  test_ex = processor.get_test_examples(real_data_dir, test_csv_name)

buffer_size = 100
name_to_features = {
    "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.io.FixedLenFeature([], tf.int64),
    "is_real_example": tf.io.FixedLenFeature([], tf.int64),
}


# tfrecord -> dataset
train_ds = (tf.data.TFRecordDataset(train_file_dir).shuffle(buffer_size = buffer_size)
                                                .map(parse_record)
                                                .batch(train_batch_size,
                                                      drop_remainder = True)
                                                # .repeat()
                                                )

val_ds = (tf.data.TFRecordDataset(val_file_dir).shuffle(buffer_size = buffer_size)
                                                .map(parse_record)
                                                .batch(val_batch_size,
                                                      drop_remainder = True)
                                                )
# test_ds = (tf.data.TFRecordDataset(test_file_dir).shuffle(buffer_size = 100)
#                                                 .map(parse_record)
#                                                 .batch(test_batch_size,
#                                                       drop_remainder = True)
#                                                 # .repeat()
#                                                 )

# 데이터셋에서 타겟 데이터 분리
train_ds = train_ds.map(lambda x : split_data_and_label(x))
val_ds = val_ds.map(lambda x : split_data_and_label(x))
# test_ds = test_ds.map(lambda x : split_data_and_label(x))
# ----------------------------------------------------------------------------

# 하이퍼파라미터 튜닝 모델 생성
hp = keras_tuner.HyperParameters()
# model = build_model_hpo(hp)

# 튜너 설정
use_tuner = 'BO'

# Bayesian Optimzation
if use_tuner == 'BO':
  tuner = keras_tuner.BayesianOptimization(
      hypermodel = build_model_hpo,
      objective = 'val_loss',
      max_trials = 15,
      executions_per_trial = 3,
      overwrite = False,
      directory = 'gs://steam-project-bucket/krbert_tensorflow/ModelTrain',
      # directory = 'ModelTrain',
      project_name= project_name,
  )

# Hyper Band
elif use_tuner == 'HB':
  tuner = keras_tuner.Hyperband(
    hypermodel=build_model_hpo,
    objective="val_loss",
    max_epochs = EPOCHS, # 최종적으로 가장 많이 수행할 1개의 모델의 최대 에포크 수
                    # factor 값을 3으로 잡음
    factor = 3, # eta값 - bracket 별로 증가되는 에포크 배수 (= 줄어드는 하이퍼파라미터 쌍 비율)
    hyperband_iterations = 1, # 전체 하이퍼밴드 알고리즘 반복 횟수
    distribution_strategy = strategy,
    overwrite = False,
      # directory = 'gs://steam-project-bucket/krbert_tensorflow/ModelTrain',
      directory = 'ModelTrain',
      project_name= project_name,
  )

# 콜백함수 지정
earlystopping_cb = tf.keras.callbacks.EarlyStopping(
  monitor = 'val_loss',
  min_delta = 0.001, # "향상"의 기준치
  patience = 3,
  restore_best_weights = True
)
lr_monitor_cb = LearningRateMonitor()
csv_logger_cb = tf.keras.callbacks.CSVLogger(f"steam_krbert_hpo_seq{max_seq_length}_{train_batch_size}_{val_batch_size}.csv", append=True)
tb_cb = tf.keras.callbacks.TensorBoard('gs://steam-project-bucket/krbert_tensorflow/ModelTrain/logs')

tuner.search(train_ds,
              epochs = EPOCHS,
              validation_data = val_ds,
              callbacks = [earlystopping_cb,
                           lr_monitor_cb,
                           csv_logger_cb,
                           tb_cb])

best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]

Updated property [core/account].
Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=d4NbSQDMtKVLYJcRkALJi9MeUL9CNS&prompt=consent&access_type=offline&code_challenge=1FFcEkZgj5AGiDvrEydsNBPY0lxrXlXNHJMqrpjLGzA&code_challenge_method=S256

Enter authorization code: 4/0AfJohXnQs_TNCRF9IVsWcuk2IIh-r5UKMQbYSvpIbD-9VV12YTl84cQfW67flPRMRMFGSQ

You are now logged in as [dowrave@gmail.com].
Your current project is [copying-book].  You can change this setting by running:
  $ gcl

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: grpc://10.15.121.146:8470
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3

이미 tfrecord 파일이 있음


Instructions for updating:
Use `tf.cast` instead.


Reloading Tuner from gs://steam-project-bucket/krbert_tensorflow/ModelTrain/TEST1_steam_krbert_BO_seq512_64_32/tuner0.json


In [2]:
print("best hp combinations : ", best_hps.values)
print("best_loss : ", tuner.oracle.get_best_trials(1)[0].score)

model = tuner.hypermodel.build(best_hps)
model.fit(train_ds, epochs = 10,
          validation_data = (val_ds),
          callbacks = [earlystopping_cb,
                           lr_monitor_cb,
                           csv_logger_cb,
                           tb_cb])



best hp combinations :  {'dropout_rate': 0.1, 'weight_decay': 0.01, 'Learning_rate': 1.5000000000000002e-05}
best_loss :  0.34772299726804096
Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


      6/Unknown - 62s 241ms/step - loss: 0.5871 - accuracy: 0.7474



   2426/Unknown - 633s 236ms/step - loss: 0.4570 - accuracy: 0.8011Epoch 1 : Learning Rate : 1.3964388017484453e-05
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x7e1eda0ee020>

In [11]:
model.save('gs://steam-project-bucket/krbert_tensorflow/realLastModel.h5')
model.save('gs://steam-project-bucket/krbert_tensorflow/realLastModel.keras')

In [25]:
!pip install google-cloud-storage

from google.cloud import storage
client = storage.Client()
bucket = client.bucket('steam-project-bucket')


local_directory = 'RealLastModel/'
os.makedirs(local_directory, exist_ok=True)

# print(blobs)


# print(f'폴더 다운로드 완료: {local_directory}')



In [47]:
blobs = bucket.list_blobs(prefix='krbert_tensorflow/mymodel/mymodel')
local_directory = './231208_RealLastModel/'

if os.path.exists(local_directory) == False:
  os.mkdir(local_directory)

for blob in blobs:
    # 로컬 디렉토리에 파일 다운로드

    relative_path = os.path.relpath(blob.name, 'krbert_tensorflow/mymodel/mymodel')
    if relative_path == '.':
      continue
    # print(blob.name)
    print(relative_path)

    # "원하는 폴더 이름"이 포함된 파일 또는 폴더인 경우에 다운로드
    if relative_path == 'assets' or relative_path == 'variables':
        # 로컬 디렉토리에 상대 경로에 해당하는 폴더 생성
        os.mkdir(relative_path)
        continue

    # 로컬 디렉토리에 파일 다운로드
    local_filename = os.path.join(local_directory, relative_path)
    blob.download_to_filename(local_filename)
    print(f'다운로드 완료: {local_filename}')

assets
fingerprint.pb
다운로드 완료: ./231208_RealLastModel/fingerprint.pb
keras_metadata.pb
다운로드 완료: ./231208_RealLastModel/keras_metadata.pb
saved_model.pb
다운로드 완료: ./231208_RealLastModel/saved_model.pb
variables
variables/variables.data-00000-of-00001


FileNotFoundError: ignored

In [None]:

%tensorboard --logdir="/ModelTrain/logs"

# tf 1.x 버전으로 시도
- 에포크를 돌려도 학습률이 개선되지 않는 문제가 있다. `try_tf1.py`에 저장함.
- `estimator`를 어떻게 사용할지 감이 오지 않아서 익숙한 텐서플로우 2.x 버전을 사용하기로 했다. `ckpt`에 저장된 가중치를 로드하는 방법을 알게 되기도 했고.