In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import transformers

from sklearn.metrics import accuracy_score

# os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Seed NumPy and TensorFlow with one shared value.
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Directory that holds the input CSV files, relative to the notebook.
data_path = '../data/'

# Read the four CSV files in a fixed order and bind each to its *_origin name.
train_origin, test_origin, topic_dict_origin, sample_submission_origin = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'train_data.csv',
        'test_data.csv',
        'topic_dict.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Work on copies so the frames read from disk are not modified.
train, test, topic_dict, sample_submission = (
    frame.copy()
    for frame in (
        train_origin,
        test_origin,
        topic_dict_origin,
        sample_submission_origin,
    )
)

In [4]:
# Fast BERT tokenizer for the 'kykim/bert-kor-base' checkpoint. Downloaded
# files are cached under ../tokenizer/, and the tokenizer's own lower-casing
# option is switched off.
tokenizer_options = dict(
    do_lower_case=False,
    cache_dir='../tokenizer/kykim/bert-kor-base',
)
tokenizer = transformers.BertTokenizerFast.from_pretrained(
    'kykim/bert-kor-base', **tokenizer_options
)

In [5]:
# Preprocessing settings. 'max_length' is the token length that titles are
# padded or truncated to when they are tokenized.
data_config = {'max_length': 44}

In [6]:
def cleaning_text(text):
    """Normalize a title to lowercase Latin letters and Hangul syllables.

    The text is lowercased, every character outside ``a-z`` / ``가-힣`` is
    replaced by a space, and each run of whitespace is collapsed into a
    single space. Leading and trailing spaces are NOT stripped.

    Args:
        text (str): Raw title text.

    Returns:
        str: The cleaned text.
    """
    # Raw-string patterns: '\s' inside a plain string literal is an invalid
    # escape sequence, which newer Python versions warn about.
    text = re.sub(r'[^a-z가-힣]', ' ', text.lower())
    text = re.sub(r'\s+', ' ', text)
    return text

In [7]:
# Clean every training title and hand the tokenizer a plain Python list.
X_train = train['title'].apply(cleaning_text).values.tolist()

# Pad or truncate each title to data_config['max_length'] tokens and return
# TensorFlow tensors.
X_train_encoded = tokenizer(
    X_train,
    max_length=data_config['max_length'],
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

In [8]:
# Training targets: the underlying array of the 'topic_idx' column.
y_train = train.loc[:, 'topic_idx'].values

In [9]:
# Pretrained 'kykim/bert-kor-base' checkpoint with a sequence-classification
# head configured for 64 outputs (num_labels). Downloaded files are cached
# under ../model/. Hidden states and attention maps are not returned.
bert_options = dict(
    num_labels=64,
    use_cache=False,
    output_attentions=False,
    output_hidden_states=False,
    cache_dir='../model/kykim/bert-kor-base/cache/',
)
bert_model = transformers.TFBertForSequenceClassification.from_pretrained(
    'kykim/bert-kor-base', **bert_options
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def set_model(learning_rate=3e-5, print_summary=False, num_classes=7):
    """Build and compile the classifier stacked on the module-level BERT model.

    Architecture: two int32 inputs (token ids and attention mask, each
    ``data_config['max_length']`` wide) -> ``bert_model`` 'logits' ->
    Dropout(0.1) -> Dense(32, relu) -> Dropout(0.1) ->
    Dense(num_classes, softmax).

    Every model returned by this function wraps the same module-level
    ``bert_model`` object, so the BERT part is shared between all of them.

    Args:
        learning_rate (float): Learning rate for the Adam optimizer.
        print_summary (bool): If True, print the Keras model summary.
        num_classes (int): Width of the softmax output layer. Defaults to 7,
            the value that used to be hard-coded.

    Returns:
        tensorflow.keras.Model: The model, compiled with
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    seq_len = data_config['max_length']

    input_ids = Input(batch_shape=(None, seq_len), dtype=tf.int32, name='input_ids')
    input_masks = Input(batch_shape=(None, seq_len), dtype=tf.int32, name='attention_masks')

    # The logits of BERT's own classification head serve as the feature
    # vector for the small dense head below.
    bert_outputs = bert_model([input_ids, input_masks])['logits']
    dropout_0 = Dropout(0.1, name='dropout_0')(bert_outputs)
    dense_0 = Dense(32, activation='relu', name='dense_0')(dropout_0)
    dropout_1 = Dropout(0.1, name='dropout_1')(dense_0)
    outputs = Dense(num_classes, activation='softmax', name='outputs')(dropout_1)

    model = Model(
        inputs = [input_ids, input_masks],
        outputs = outputs,
        name = 'Bert_Classification'
    )

    # Labels are passed as class indices, hence the sparse loss variant.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        loss = 'sparse_categorical_crossentropy',
        optimizer = optimizer,
        metrics = ['accuracy']
    )

    if print_summary:
        model.summary(line_length=150)

    return model

In [11]:
# Warm-up setup: only the new dense head is trained while BERT stays frozen.
warmup_lr = 9e-5
model = set_model(learning_rate=warmup_lr)

# Freeze BERT through the object itself instead of the positional
# `model.layers[2]` lookup, which would silently target a different layer if
# the architecture in set_model() changed.
bert_model.trainable = False

# set_model() compiled the model before the flag was changed. Keras only
# guarantees that a changed `trainable` attribute is honoured after compile()
# is called again, so recompile with the same settings.
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = keras.optimizers.Adam(learning_rate=warmup_lr),
    metrics = ['accuracy']
)
model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f3c57dd9110> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f3c57dd9110> is not a module, class, method, function, traceback, frame, or code object

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "Bert_Classification"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 44)]         0          

In [12]:
# NOTE(review): this cell repeats the first three statements of set_model() at
# module level. It binds the globals `input_ids`, `input_masks` and
# `bert_outputs`; no other visible cell reads them, so this looks like
# leftover scratch work — confirm before removing.
input_ids = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='input_ids')
input_masks = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='attention_masks')
bert_outputs = bert_model([input_ids, input_masks])['logits']



In [13]:
# One epoch over the full training set (no validation split) with batches of 64.
history = model.fit(
    x=[X_train_encoded[key] for key in ('input_ids', 'attention_mask')],
    y=y_train,
    batch_size=64,
    epochs=1,
)



In [14]:
# Unfreeze BERT for end-to-end fine-tuning. The layer is addressed through
# `bert_model` (the object set_model() wires into the graph) rather than the
# positional `model.layers[2]`, which depends on the exact layer order.
bert_model.trainable = True

# Recompile so the new trainable state takes effect, using a fresh Adam
# optimizer with a smaller learning rate for the full network.
optimizer = keras.optimizers.Adam(learning_rate=3e-5)
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = optimizer,
    metrics = ['accuracy']
)

In [15]:
def scheduler(epoch, lr):
    """Return 95% of the current learning rate; the epoch index is unused."""
    return lr * 0.95


# Multiply the learning rate by 0.95 each time the scheduler callback runs.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Stop once val_loss has gone 5 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=5)

# Keep only the weights of the epoch with the best val_accuracy.
model_check_point = ModelCheckpoint(
    filepath='../model/bert_kor_base_for_classification_fine_tuning_0.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

fine_tune_callbacks = [early_stop, model_check_point, lr_scheduler]

# Fine-tune for up to 15 epochs, holding out 20% of the data for validation.
history = model.fit(
    x=[X_train_encoded[key] for key in ('input_ids', 'attention_mask')],
    y=y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=15,
    callbacks=fine_tune_callbacks,
)

Epoch 1/15

Epoch 00001: val_accuracy improved from -inf to 0.90242, saving model to ../model/bert_kor_base_for_classification_fine_tuning_0.h5
Epoch 2/15

Epoch 00002: val_accuracy did not improve from 0.90242
Epoch 3/15

Epoch 00003: val_accuracy did not improve from 0.90242
Epoch 4/15

Epoch 00004: val_accuracy did not improve from 0.90242
Epoch 5/15

Epoch 00005: val_accuracy did not improve from 0.90242
Epoch 6/15

Epoch 00006: val_accuracy did not improve from 0.90242


In [16]:
# Rebuild the architecture with set_model() defaults and load the
# checkpointed weights into it.
best_weights_path = '../model/bert_kor_base_for_classification_fine_tuning_0.h5'
best_model = set_model()
best_model.load_weights(best_weights_path)



In [17]:
# Predict class probabilities for every training title and keep the index of
# the most probable class.
y_train_pred = np.argmax(
    best_model.predict([X_train_encoded[key] for key in ('input_ids', 'attention_mask')]),
    axis=1,
)

# Accuracy over the whole training set, including the samples fitted on.
print(f'train accuracy: {accuracy_score(y_train, y_train_pred):.04f}')

train accuracy: 0.9304


In [18]:
# Apply the same cleaning to the test titles as to the training titles.
X_test = test['title'].apply(cleaning_text).values.tolist()

# Same tokenizer settings as for the training set: fixed-length padding and
# truncation, TensorFlow tensors as output.
X_test_encoded = tokenizer(
    X_test,
    max_length=data_config['max_length'],
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

In [19]:
# Predicted class index for each test title.
y_test_pred = np.argmax(
    best_model.predict([X_test_encoded[key] for key in ('input_ids', 'attention_mask')]),
    axis=1,
)

# Write the predictions into the submission frame.
sample_submission['topic_idx'] = y_test_pred

In [20]:
# sample_submission.to_csv('../submit/bert_kor_base_for_classification_0.csv', index=False)