In [None]:
import os
import re
import json
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm
import tensorflow as tf
from transformers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [2]:
tf.random.set_seed(111)
np.random.seed(111)

BATCH_SIZE =32
NUM_EPOCHS = 3
VALID_SPLIT = 0.2
MAX_LEN = 39

In [None]:
import urllib.request

train_file = urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt")
test_file = urllib.request.urlopen("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt")
print(type(train_file))
train_data = pd.read_table(train_file)
test_data = pd.read_table(test_file)

train_data = train_data.dropna()
test_data = test_data.dropna()

In [None]:
train_data.head()

## BERTTokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', cached_dir='bert_ckpt', do_lower_case =False)

In [None]:
help(tokenizer.encode_plus)

In [7]:
def bert_tokenizer(sentence, MAX_LEN):
    encoded_dict = tokenizer.encode_plus(text = sentence, add_special_tokens = True, max_length = MAX_LEN, pad_to_max_length = True, return_attention_mask = True)
    print(encoded_dict)
    print(encoded_dict.keys())
    input_id = encoded_dict['input_ids']
    print(f"input_id : {input_id}")
    attention_mask = encoded_dict['attention_mask']
    print(f"attention_mask : {attention_mask}")
    token_type_id = encoded_dict['token_type_ids']
    print(f"token_type_id : {token_type_id}")


    return input_id, attention_mask, token_type_id

In [None]:

input_ids = []
attention_masks, token_type_ids = [], []
train_data_labels = []

for train_sentence, train_label in tqdm(zip(train_data['document'], train_data['label']), total = len(train_data)):

    
    try:
        input_id, attention_mask, token_type_id = bert_tokenizer(train_sentence, MAX_LEN)
        
        input_ids.append(input_id)
        
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        train_data_labels.append(train_label)

        print(len(input_ids))
    except Exception as e:

        print(e)
        pass

train_movie_input_ids = np.array(input_ids, dtype=int)


train_movie_attention_masks = np.array(attention_masks, dtype=int)
train_movie_token_type_ids = np.array(token_type_ids, dtype=int)
train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_token_type_ids)
train_data_labels = np.array(train_data_labels, dtype=int)

print(f"Sentences : {len(train_movie_input_ids), len(train_data_labels)}")

In [None]:
idx = 5
input_id = train_movie_input_ids[idx]
attentionb_mask = train_movie_attention_masks[idx]
token_type_id = train_movie_token_type_ids[idx]


print(input_id)
print(attentionb_mask)
print(token_type_id)

print(tokenizer.decode(input_id))


In [None]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir = dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class, kernel_initializer = tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range,),name='classifier')

    def call(self, inputs, attention_mask=None, token_type_ids=None, training= False):
        outputs = self.bert(inputs, attention_mask = attention_mask, token_type_ids = token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training = training)
        logits = self.classifier(pooled_output)

        return logits

cls_model = TFBertClassifier(model_name= 'bert-base-multilingual-cased', dir_path='bert_ckpt', num_class=2)

## 모델 학습

In [11]:
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
cls_model.compile(optimizer = optimizer, loss =loss, metrics= [metric])

In [12]:
model_name = "tf2_bert_naver_movie"

es_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)

checkpoint_path = os.path.join("./", model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print(f"{checkpoint_path}Directory already")
else:
    os.makedirs(checkpoint_dir, exist_ok = True)
    print(f"{checkpoint_path}Directory create")

cp_callback = ModelCheckpoint(checkpoint_path, monitor = 'val_accuracy', verbose = 1, save_best_only = True, save_weights_only = True)

history = cls_model.fit(train_movie_inputs, train_data_labels, epochs = NUM_EPOCHS, batch_size=BATCH_SIZE, validation_split=VALID_SPLIT, callbacks=[es_callback, cp_callback])

print(history.history)

./tf2_bert_naver_movie\weights.h5Directory already
Epoch 1/3


AttributeError: in user code:

    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "c:\Python311\Lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "c:\Python311\Lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\NCC_AD~1\AppData\Local\Temp\__autograph_generated_filev2b4o76f.py", line 13, in tf__call
        logits = ag__.converted_call(ag__.ld(self).calssifier, (ag__.ld(pooled_output),), None, fscope)

    AttributeError: Exception encountered when calling layer 'tf_bert_classifier' (type TFBertClassifier).
    
    in user code:
    
        File "C:\Users\ncc_admin\AppData\Local\Temp\ipykernel_7968\877456597.py", line 13, in call  *
            logits = self.calssifier(pooled_output)
    
        AttributeError: 'TFBertClassifier' object has no attribute 'calssifier'
    
    
    Call arguments received by layer 'tf_bert_classifier' (type TFBertClassifier):
      • inputs=('tf.Tensor(shape=(None, 39), dtype=int32)', 'tf.Tensor(shape=(None, 39), dtype=int32)', 'tf.Tensor(shape=(None, 39), dtype=int32)')
      • attention_mask=None
      • token_type_ids=None
      • training=True
