# Implementation of Improving Language Understanding by Generative Pre-Training

## GPT from Transformer Decoder
### 3.1 Unsupervised pre-training
  - We use **a multi-layer Transformer decoder** for the language model, which is a variant of the transformer.
  - This model applies a multi-headed self-attention operation over the input context tokens followed by **position-wise feedforward layers** to produce an output distribution over target tokens.
  - h0 = UWe + Wp
  - hl = transformer_block(hl−1)∀i ∈ [1, n]
  - P(u) = softmax(hn WeT)
  
#### Todos:
  ![transformer_removed](./images/transformer_removed.png)
  1. Take only decoder part code.
  2. Modify from positional encoding to text & position embedding.
  3. Modify the linked part code between encoder and decoder.
  4. Remove the padding mask. (Auto-Regressive)
  
#### Completed architecture form
  ![gpt](./images/gpt.png)

In [118]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import re
import numpy as np
import matplotlib.pyplot as plt

In [169]:
# 스케일드 닷 프로덕트 어텐션 함수
def scaled_dot_product_attention(query, key, value, mask):
  # 어텐션 가중치는 Q와 K의 닷 프로덕트
  matmul_qk = tf.matmul(query, key, transpose_b=True)

  # 가중치를 정규화
  depth = tf.cast(tf.shape(key)[-1], tf.float32)
  logits = matmul_qk / tf.math.sqrt(depth)

  # 패딩에 마스크 추가
  if mask is not None:
    logits += (mask * -1e9)

  # softmax적용
  attention_weights = tf.nn.softmax(logits, axis=-1)

  # 최종 어텐션은 가중치와 V의 닷 프로덕트
  output = tf.matmul(attention_weights, value)
  return output

# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):

  def __init__(self, d_model, num_heads, name="multi_head_attention"):
    super(MultiHeadAttention, self).__init__(name=name)
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    self.query_dense = tf.keras.layers.Dense(units=d_model)
    self.key_dense = tf.keras.layers.Dense(units=d_model)
    self.value_dense = tf.keras.layers.Dense(units=d_model)

    self.dense = tf.keras.layers.Dense(units=d_model)

  def split_heads(self, inputs, batch_size):
    inputs = tf.reshape(
        inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(inputs, perm=[0, 2, 1, 3])

  def call(self, inputs):
    query, key, value, mask = inputs['query'], inputs['key'], inputs[
        'value'], inputs['mask']
    batch_size = tf.shape(query)[0]

    # Q, K, V에 각각 Dense를 적용합니다
    # 1. WQ, WK, WV에 해당하는 밀집층 지나기
    # q : (batch_size, query의 문장 길이, d_model)
    # k : (batch_size, key의 문장 길이, d_model)
    # v : (batch_size, value의 문장 길이, d_model)
    # 참고) 인코더(k, v)-디코더(q) 어텐션에서는 query 길이와 key, value의 길이는 다를 수 있다.
    query = self.query_dense(query)
    key = self.key_dense(key)
    value = self.value_dense(value)

    # 병렬 연산을 위한 머리를 여러 개 만듭니다
    # 2. 헤드 나누기
    # q : (batch_size, num_heads, query의 문장 길이, d_model/num_heads)
    # k : (batch_size, num_heads, key의 문장 길이, d_model/num_heads)
    # v : (batch_size, num_heads, value의 문장 길이, d_model/num_heads)
    query = self.split_heads(query, batch_size)
    key = self.split_heads(key, batch_size)
    value = self.split_heads(value, batch_size)

    # 스케일드 닷 프로덕트 어텐션 함수
    scaled_attention = scaled_dot_product_attention(query, key, value, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

    # 어텐션 연산 후에 각 결과를 다시 연결(concatenate)합니다
    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))

    # 최종 결과에도 Dense를 한 번 더 적용합니다
    outputs = self.dense(concat_attention)

    return outputs

In [170]:
#########################
# 2. 포지셔널 인코딩 레이어 -> Text & position embedding Layer
#########################
# class PositionalEncoding(tf.keras.layers.Layer):
class TextPostionEmbeddingLayer(tf.keras.layers.Layer):

  def __init__(self, vocab_size, d_model, max_position=512):
    super(TextPostionEmbeddingLayer, self).__init__()
    self.token_embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    self.position_embedding = tf.keras.layers.Embedding(max_position, d_model)  # ✅ 학습 가능한 위치 임베딩
    self.d_model = d_model

  def call(self, inputs):
    seq_length = tf.shape(inputs)[1]
    positions = tf.range(start=0, limit=seq_length, delta=1)
    
    position_embeddings = self.position_embedding(positions)
    token_embeddings = self.token_embedding(inputs)
    
    return token_embeddings + position_embeddings

In [171]:
#########################
# Decoder
#########################
# 디코더 하나의 레이어를 함수로 구현.
# 이 하나의 레이어 안에는 세 개의 서브 레이어가 존재합니다.
def decoder_layer(units, d_model, num_heads, dropout, name="decoder_layer"):
  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
  # 마스크
  #########################
  # 4. Remove the padding mask. (Auto-Regressive)
  #########################
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name="look_ahead_mask")

  # 첫 번째 서브 레이어 : 멀티 헤드 어텐션 수행 (셀프 어텐션)
  attention = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': look_ahead_mask
      })

  # 멀티 헤드 어텐션의 결과는 LayerNormalization이라는 훈련을 돕는 테크닉을 수행
  attention = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(attention + inputs)

  #########################
  # 3. Modify the linked part code between encoder and decoder.
  #########################
  # 두 번째 서브 레이어 : 마스크드 멀티 헤드 어텐션 수행 (인코더-디코더 어텐션)
  # --> 디코더만 사용하기 때문에 인코더와 연결 부분인 해당 두 번째 서브 레이어를 삭제

  # 세 번째 서브 레이어 : 2개의 완전연결층 --> Feed Forward Network (FFN)
  outputs = tf.keras.layers.Dense(units=units, activation='relu')(attention)
  outputs = tf.keras.layers.Dense(units=d_model)(outputs)

  # 완전연결층의 결과는 Dropout과 LayerNormalization 수행
  outputs = tf.keras.layers.Dropout(rate=dropout)(outputs)
  outputs = tf.keras.layers.LayerNormalization(
      epsilon=1e-6)(outputs + attention)

  return tf.keras.Model(
      inputs=[inputs, look_ahead_mask],
      outputs=outputs,
      name=name)

In [172]:
def decoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout,
            max_position=512,
            name='decoder'):
  inputs = tf.keras.Input(shape=(None,), name='inputs')

  # 마스크
  #########################
  # 4. Remove the padding mask. (Auto-Regressive)
  #########################
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')

  #########################
  # 2. Modify from positional encoding to text & position embedding.
  #########################
  embeddings = TextPostionEmbeddingLayer(
      vocab_size, 
      d_model, 
      max_position=max_position
  )(inputs)

  # Dropout이라는 훈련을 돕는 테크닉을 수행
  outputs = tf.keras.layers.Dropout(rate=dropout)(embeddings)

  for i in range(num_layers):
    outputs = decoder_layer(
        units=units,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
        name='decoder_layer_{}'.format(i),
    )(inputs=[outputs, look_ahead_mask])

  return tf.keras.Model(
      inputs=[inputs, look_ahead_mask],
      outputs=outputs,
      name=name)

## GPT Pretraining
#### Loss function
  - The inputs are passed through our pre-trained model to obtain the final transformer block’s activation hml , which is then fed into an added linear output layer with parameters Wy to predict y:
    - P(y|x1,...,xm) = softmax(hml Wy)
  - We optimize the following objective (with weight λ)
    - L3(C) = L2(C) + λ ∗ L1(C)
    - L1(U) = Sigma(logP(ui|ui−k,...,ui−1;Θ))
    - L2(C) = Sigma(logP(y|x1,...,xm))
    
#### Todos:
1. Add pretraining.
2. Add output.
3. Modify Look ahead mask function.

In [173]:
#########################
# 3. Modify Look ahead mask function.
#########################
def create_look_ahead_mask(x):
  seq_len = tf.shape(x)[1]
  # 룩어헤드 마스크
  # return 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  return tf.linalg.band_part(tf.ones((1, 1, seq_len, seq_len)), -1, 0)

In [213]:
class GPTPretraining(tf.keras.Model):
  def __init__(self, decoder, vocab_size, lambda_weight=1.0):
    super(GPTPretraining, self).__init__()
    self.decoder = decoder
    self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    self.vocab_size = vocab_size
    self.lambda_weight = lambda_weight

  def call(self, inputs):
    # tf.print(inputs)
    # input_tokens = inputs[:, :-1]
    # target_tokens = inputs[:, 1:]
    input_tokens, target_tokens = inputs
    # Input : 마지막 토큰 제외
    input_tokens = input_tokens[:, :-1]
    # Target : 첫 번째 토큰 제외
    target_tokens = target_tokens[:, 1:]

    look_ahead_mask = create_look_ahead_mask(input_tokens)
    
    # Predict
    logits = self.decoder([input_tokens, look_ahead_mask])
    # logits = tf.clip_by_value(logits, -10, 10)
    
    # Loss
    pretrain_loss = self.loss_fn(target_tokens, logits)
    fine_tune_loss = self.loss_fn(target_tokens, logits)
    loss = fine_tune_loss + self.lambda_weight * pretrain_loss
    
    return logits, loss

  def train_step(self, data):
    with tf.GradientTape() as tape:
      logits, loss = self(data)

    gradients = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
    return {"loss": loss}

## Data preprocessing

In [125]:
import os
import pandas as pd

datapath = os.getenv('HOME')+'/aiffel/transformer_chatbot/data/ChatbotData.csv'
data = pd.read_csv(datapath)
data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [126]:
import re

# 전처리 함수
def preprocess_sentence(sentence):
  # 단어와 구두점(punctuation) 사이의 공백 추가
  sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
  # 연속된 공백을 하나의 공백으로 전환
  sentence = re.sub(r'[" "]+', " ", sentence)
  sentence = sentence.strip()
  return sentence

In [127]:
data['clean_q'] = data['Q'].apply(lambda x : preprocess_sentence(x))
data['clean_a'] = data['A'].apply(lambda x : preprocess_sentence(x))

In [128]:
data.head()

Unnamed: 0,Q,A,label,clean_q,clean_a
0,12시 땡!,하루가 또 가네요.,0,12시 땡 !,하루가 또 가네요 .
1,1지망 학교 떨어졌어,위로해 드립니다.,0,1지망 학교 떨어졌어,위로해 드립니다 .
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 놀러가고 싶다,여행은 언제나 좋죠 .
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠 .
4,PPL 심하네,눈살이 찌푸려지죠.,0,PPL 심하네,눈살이 찌푸려지죠 .


In [129]:
questions = data['clean_q']
answers = data['clean_a']

In [215]:
from tokenizers import ByteLevelBPETokenizer

# 질문(Q)과 답변(A)을 합쳐서 BPE 학습용 파일 생성
sentences = data['clean_q'].tolist() + data['clean_a'].tolist()
corpus_path = os.getenv('HOME') + "/aiffel/transformer_chatbot/data/chatbot_corpus.txt"

with open(corpus_path, "w", encoding="utf-8") as f:
    for sentence in sentences:
        f.write(sentence + "\n")

# BPE 토크나이저 학습
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[corpus_path], vocab_size=32000, min_frequency=2, special_tokens=[
    "<s>", "<pad>", "</s>", "<unk>", "<mask>", "$", "<sos>", "<eos>"
])

# 학습된 BPE 모델 저장
bpe_path = os.getenv('HOME') + "/aiffel/transformer_chatbot/data/bpe_tokenizer"
tokenizer.save_model(bpe_path)

# 학습된 토크나이저 불러오기
tokenizer = ByteLevelBPETokenizer(bpe_path + "/vocab.json", bpe_path + "/merges.txt")






In [219]:
df = pd.read_fwf(bpe_path + "/merges.txt", header=None)
df

Unnamed: 0,0,1,2,3,4
0,#version: 0.2,-,Trained,by,`huggingface/tokenizers`
1,Ġ ì,,,,
2,Ġ ë,,,,
3,ì Ŀ,,,,
4,Ġ .,,,,
...,...,...,...,...,...
17212,Ġê²¬ê³¼ë¥ ĺëĤ,,,,
17213,ìĬ¤íĥĢë²ħ ìĬ¤,,,,
17214,ĠíĮĢìĽĮíģ¬ ê°,,,,
17215,ê¸°ì´ĪëĮĢìĤ¬ë,ī,ìĿĦ,,


In [131]:
# 토큰화 함수 정의
def encode_text(text):
    return tokenizer.encode(text).ids  # 텍스트 → 토큰 ID 변환

def decode_tokens(token_ids):
    return tokenizer.decode(token_ids)  # 토큰 ID → 텍스트 변환

In [152]:
# Q + "$" + A 형태로 변환
data['q_tokens'] = data['clean_q'].apply(lambda x: [tokenizer.token_to_id("<sos>")] + encode_text(x) + [tokenizer.token_to_id("$")])
data['a_tokens'] = data['clean_a'].apply(lambda x: encode_text(x) + [tokenizer.token_to_id("<eos>")])

In [153]:
data.head()

Unnamed: 0,Q,A,label,clean_q,clean_a,q_tokens,a_tokens
0,12시 땡!,하루가 또 가네요.,0,12시 땡 !,하루가 또 가네요 .,"[6, 23, 24, 393, 4961, 568, 5]","[5415, 977, 7506, 266, 7]"
1,1지망 학교 떨어졌어,위로해 드립니다.,0,1지망 학교 떨어졌어,위로해 드립니다 .,"[6, 23, 11343, 3750, 3741, 5]","[15017, 7753, 266, 7]"
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]"
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 1891, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]"
4,PPL 심하네,눈살이 찌푸려지죠.,0,PPL 심하네,눈살이 찌푸려지죠 .,"[6, 54, 54, 50, 13863, 5]","[13821, 15655, 266, 7]"


In [154]:
data['tokens'] = data['tokens'] = data.apply(lambda row: row['q_tokens'] + row['a_tokens'], axis=1)
data.head()

Unnamed: 0,Q,A,label,clean_q,clean_a,q_tokens,a_tokens,tokens
0,12시 땡!,하루가 또 가네요.,0,12시 땡 !,하루가 또 가네요 .,"[6, 23, 24, 393, 4961, 568, 5]","[5415, 977, 7506, 266, 7]","[6, 23, 24, 393, 4961, 568, 5, 5415, 977, 7506..."
1,1지망 학교 떨어졌어,위로해 드립니다.,0,1지망 학교 떨어졌어,위로해 드립니다 .,"[6, 23, 11343, 3750, 3741, 5]","[15017, 7753, 266, 7]","[6, 23, 11343, 3750, 3741, 5, 15017, 7753, 266..."
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 6145, 720, 5, 8208, 240..."
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 1891, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 1891, 6145, 720, 5, 820..."
4,PPL 심하네,눈살이 찌푸려지죠.,0,PPL 심하네,눈살이 찌푸려지죠 .,"[6, 54, 54, 50, 13863, 5]","[13821, 15655, 266, 7]","[6, 54, 54, 50, 13863, 5, 13821, 15655, 266, 7]"


In [155]:
max_len = max(data['tokens'].apply(len))
max_len

38

In [156]:
def pad_sequence(tokens, max_len, pad_token_id):
    return tokens + [pad_token_id] * (max_len - len(tokens))

In [157]:
pad_token_id = tokenizer.token_to_id("<pad>")
data['tokens'] = data['tokens'].apply(lambda x: pad_sequence(x, max_len, pad_token_id))
data.head()

Unnamed: 0,Q,A,label,clean_q,clean_a,q_tokens,a_tokens,tokens
0,12시 땡!,하루가 또 가네요.,0,12시 땡 !,하루가 또 가네요 .,"[6, 23, 24, 393, 4961, 568, 5]","[5415, 977, 7506, 266, 7]","[6, 23, 24, 393, 4961, 568, 5, 5415, 977, 7506..."
1,1지망 학교 떨어졌어,위로해 드립니다.,0,1지망 학교 떨어졌어,위로해 드립니다 .,"[6, 23, 11343, 3750, 3741, 5]","[15017, 7753, 266, 7]","[6, 23, 11343, 3750, 3741, 5, 15017, 7753, 266..."
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 6145, 720, 5, 8208, 240..."
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 1891, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 1891, 6145, 720, 5, 820..."
4,PPL 심하네,눈살이 찌푸려지죠.,0,PPL 심하네,눈살이 찌푸려지죠 .,"[6, 54, 54, 50, 13863, 5]","[13821, 15655, 266, 7]","[6, 54, 54, 50, 13863, 5, 13821, 15655, 266, 7..."


In [159]:
data['tokens'][0]

[6,
 23,
 24,
 393,
 4961,
 568,
 5,
 5415,
 977,
 7506,
 266,
 7,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [214]:
data.head()

Unnamed: 0,Q,A,label,clean_q,clean_a,q_tokens,a_tokens,tokens
0,12시 땡!,하루가 또 가네요.,0,12시 땡 !,하루가 또 가네요 .,"[6, 23, 24, 393, 4961, 568, 5]","[5415, 977, 7506, 266, 7]","[6, 23, 24, 393, 4961, 568, 5, 5415, 977, 7506..."
1,1지망 학교 떨어졌어,위로해 드립니다.,0,1지망 학교 떨어졌어,위로해 드립니다 .,"[6, 23, 11343, 3750, 3741, 5]","[15017, 7753, 266, 7]","[6, 23, 11343, 3750, 3741, 5, 15017, 7753, 266..."
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 6145, 720, 5, 8208, 240..."
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠 .,"[6, 25, 2966, 26, 409, 1891, 6145, 720, 5]","[8208, 2409, 1028, 266, 7]","[6, 25, 2966, 26, 409, 1891, 6145, 720, 5, 820..."
4,PPL 심하네,눈살이 찌푸려지죠.,0,PPL 심하네,눈살이 찌푸려지죠 .,"[6, 54, 54, 50, 13863, 5]","[13821, 15655, 266, 7]","[6, 54, 54, 50, 13863, 5, 13821, 15655, 266, 7..."


In [None]:
from transformers import AutoTokenizer


### Models

In [175]:
tf.keras.backend.clear_session()

# 하이퍼파라미터
VOCAB_SIZE = 32000
# NUM_LAYERS = 2 # 인코더와 디코더의 층의 개수
NUM_LAYERS = 6
# D_MODEL = 256 # 인코더와 디코더 내부의 입, 출력의 고정 차원
D_MODEL = 768
NUM_HEADS = 8 # 멀티 헤드 어텐션에서의 헤드 수
UNITS = 512 # 피드 포워드 신경망의 은닉층의 크기
DROPOUT = 0.1 # 드롭아웃의 비율

decoder = decoder(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)

decoder.summary()

Model: "decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, None)]       0                                            
__________________________________________________________________________________________________
text_postion_embedding_layer (T (None, None, 768)    24969216    inputs[0][0]                     
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 768)    0           text_postion_embedding_layer[0][0
__________________________________________________________________________________________________
look_ahead_mask (InputLayer)    [(None, 1, None, Non 0                                            
____________________________________________________________________________________________

In [141]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps**-1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [210]:
gpt_model = GPTPretraining(decoder, vocab_size=VOCAB_SIZE)

In [211]:
# learning_rate = CustomSchedule(D_MODEL)
learning_rate=3e-5
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

gpt_model.compile(optimizer=optimizer)

In [194]:
checkpoint_path = os.getenv('HOME') + "/aiffel/transformer_chatbot/data/gpt1_checkpoint"
os.makedirs(checkpoint_path, exist_ok=True)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_best_only=True,
    monitor="loss",
    mode="min"
)

In [206]:
x_train = np.array(data['tokens'].tolist(), dtype=np.int32)
x_train = tf.convert_to_tensor(x_train, dtype=tf.int32)

In [202]:
x_train

<tf.Tensor: shape=(11823, 38), dtype=int32, numpy=
array([[    6,    23,    24, ...,     1,     1,     1],
       [    6,    23, 11343, ...,     1,     1,     1],
       [    6,    25,  2966, ...,     1,     1,     1],
       ...,
       [    6,  1320,   246, ...,     1,     1,     1],
       [    6,  1701,   820, ...,     1,     1,     1],
       [    6, 13899,  1175, ...,     1,     1,     1]], dtype=int32)>

In [212]:
history = gpt_model.fit(
    x_train,
    x_train,
    batch_size=32,  # 배치 크기 설정
    epochs=10,  # 학습 epoch 설정
    callbacks=[checkpoint_callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [166]:
print(history.history["loss"])

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
