In [1]:
import ast
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import transformers

from sklearn.metrics import accuracy_score

# os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Pin the TensorFlow and NumPy global random generators to one fixed seed.
SEED = 0
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [2]:
# Folder holding the input CSVs; the trailing slash matters because paths are
# built by plain string concatenation.
data_path = '../data/'

# Read the four input tables in a fixed order; the *_origin frames are the
# pristine copies that later cells duplicate before modifying anything.
train_origin, test_origin, topic_dict_origin, sample_submission_origin = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'train_preprocessed.csv',
        'test_preprocessed.csv',
        'topic_dict.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Work on copies so the frames read from disk stay unmodified.
train, test, topic_dict, sample_submission = (
    origin_frame.copy()
    for origin_frame in (
        train_origin,
        test_origin,
        topic_dict_origin,
        sample_submission_origin,
    )
)

In [4]:
def list_to_sentence(stem_noun):
    """Turn the string form of a token list into one marked-up sentence.

    Args:
        stem_noun: Text of a Python list literal of strings, e.g. "['a', 'b']".

    Returns:
        The tokens joined by single spaces, wrapped as '[CLS] ... [SEP]'.

    Raises:
        ValueError, SyntaxError: If ``stem_noun`` is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text read from the CSV can never
    # execute code (the previous eval() would run any expression it was given).
    tokens = ast.literal_eval(stem_noun)
    return '[CLS] ' + ' '.join(tokens) + ' [SEP]'
# Replace the stringified token lists with '[CLS] ... [SEP]' sentences in both
# frames. This overwrites the column, so the cell is not meant to be re-run
# without first recreating `train` and `test`.
for frame in (train, test):
    frame['stem_noun'] = frame['stem_noun'].apply(list_to_sentence)

In [5]:
def txt_to_seq(train_clean, test_clean, max_len=None, padding='post'):
    """Encode two text collections as padded integer sequences.

    A Keras ``Tokenizer`` is fitted on ``train_clean`` only and then used to
    encode both collections, so both share the training word index.

    Args:
        train_clean: Texts used to fit the tokenizer and to build ``X_train``.
        test_clean: Texts encoded with the fitted tokenizer into ``X_test``.
        max_len: Forwarded to ``pad_sequences`` as ``maxlen``.
        padding: Forwarded to ``pad_sequences`` as ``padding``.

    Returns:
        ``(X_train, X_test, vocabulary, tokenizer)`` where ``vocabulary`` is the
        tokenizer's ``word_index`` and ``tokenizer`` is the fitted instance.

    NOTE(review): the ids produced here come from a vocabulary fitted on this
    corpus; later cells feed them to a pretrained BERT checkpoint as
    ``input_ids`` -- confirm that this mapping is intended.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_clean)

    def encode(texts):
        # Text -> id lists -> fixed-width array.
        return pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=max_len,
            padding=padding,
        )

    X_train, X_test = encode(train_clean), encode(test_clean)
    return X_train, X_test, tokenizer.word_index, tokenizer

In [6]:
# Shared preprocessing/model settings; 'max_length' is the padded sequence length.
data_config = {'max_length': 50}

In [7]:
# Containers for the model inputs of each split; later cells fill in the
# 'input_ids' and 'attention_mask' entries.
X_train_encoded, X_test_encoded = {}, {}

In [8]:
# Encode both splits to fixed-length id arrays, keeping the fitted tokenizer
# (as `vectorizer`) and its word index (as `vocabulary`).
(
    X_train_encoded['input_ids'],
    X_test_encoded['input_ids'],
    vocabulary,
    vectorizer,
) = txt_to_seq(
    train['stem_noun'],
    test['stem_noun'],
    max_len=data_config['max_length'],
    padding='post',
)
X_train_encoded['input_ids'].shape, X_test_encoded['input_ids'].shape

((45654, 50), (9131, 50))

In [9]:
# Attention mask: 1 wherever input_ids holds a non-zero id, 0 elsewhere.
# Computed with one vectorized comparison per split instead of a Python loop
# over every row, and stored as int32 to match the dtype declared on the
# model's mask Input layer.
for encoded in (X_train_encoded, X_test_encoded):
    encoded['attention_mask'] = (encoded['input_ids'] != 0).astype(np.int32)

In [10]:
# Training labels as a NumPy array, taken from the 'topic_idx' column.
y_train = train['topic_idx'].to_numpy()

In [11]:
# Load the pretrained 'kykim/bert-kor-base' checkpoint with a
# sequence-classification head of 64 outputs. set_model() feeds this head's
# 'logits' into its own dense layers, so the 64 values are used as features.
bert_model = transformers.TFBertForSequenceClassification.from_pretrained(
    'kykim/bert-kor-base',
    num_labels=64,
    output_attentions=False,
    output_hidden_states=False,
    use_cache=False,
    cache_dir='../model/kykim/bert-kor-base/cache/',
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def set_model(learning_rate=3e-5, print_summary=False, num_classes=7):
    """Build and compile the BERT-based classifier.

    The module-level ``bert_model`` is called on the two inputs and its
    'logits' output is passed through dropout -> Dense(32, relu) -> dropout ->
    softmax. Every call reuses the same ``bert_model`` instance, so all models
    built by this function share that layer's weights.

    Args:
        learning_rate: Learning rate given to the Adam optimizer.
        print_summary: When true, print the model summary at 150 columns.
        num_classes: Width of the softmax output layer. Defaults to 7, the
            value that used to be hard-coded.

    Returns:
        A compiled Keras ``Model`` named 'Bert_Classification' whose inputs are
        ``[input_ids, attention_masks]``, each of length
        ``data_config['max_length']``.
    """
    seq_len = data_config['max_length']

    input_ids = Input(batch_shape=(None, seq_len), dtype=tf.int32, name='input_ids')
    input_masks = Input(batch_shape=(None, seq_len), dtype=tf.int32, name='attention_masks')

    # The list is positional: token ids first, attention mask second.
    bert_outputs = bert_model([input_ids, input_masks])['logits']
    dropout_0 = Dropout(0.1, name='dropout_0')(bert_outputs)
    dense_0 = Dense(32, activation='relu', name='dense_0')(dropout_0)
    dropout_1 = Dropout(0.1, name='dropout_1')(dense_0)
    outputs = Dense(num_classes, activation='softmax', name='outputs')(dropout_1)

    model = Model(
        inputs = [input_ids, input_masks],
        outputs = outputs,
        name = 'Bert_Classification'
    )

    # Sparse loss: labels are integer class ids, not one-hot vectors.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        loss = 'sparse_categorical_crossentropy',
        optimizer = optimizer,
        metrics = ['accuracy']
    )

    if print_summary:
        model.summary(line_length=150)

    return model

In [13]:
# Warm-up model: the BERT layer is frozen so only the new layers train.
warmup_learning_rate = 9e-5
model = set_model(learning_rate=warmup_learning_rate)

# layers[0] and layers[1] are the two Input layers, so layers[2] is the BERT layer.
model.layers[2].trainable = False

# set_model() already compiled the model. Keras guidance is to call compile()
# again after changing `trainable` so the change is taken into account, which
# is also what the later unfreeze cell does.
model.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = keras.optimizers.Adam(learning_rate=warmup_learning_rate),
    metrics = ['accuracy']
)
model.summary()

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7ffabef40110> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7ffabef40110> is not a module, class, method, function, traceback, frame, or code object

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "Bert_Classification"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 50)]         0          

In [14]:
# NOTE(review): this cell repeats the first three statements of set_model() at
# module level. The names it binds (input_ids, input_masks, bert_outputs) are
# not read by any later cell visible in this notebook -- it looks like leftover
# scratch work; confirm and remove if so.
input_ids = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='input_ids')
input_masks = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='attention_masks')
bert_outputs = bert_model([input_ids, input_masks])['logits']



In [15]:
# Short warm-up pass: a single epoch over the first 1000 training rows.
warmup_rows = slice(None, 1000)
history = model.fit(
    x=[
        X_train_encoded['input_ids'][warmup_rows],
        X_train_encoded['attention_mask'][warmup_rows],
    ],
    y=y_train[warmup_rows],
    batch_size=64,
    epochs=1,
)



In [16]:
# Fine-tuning setup: use a fresh Adam optimizer at a lower learning rate,
# make the BERT layer (layers[2]) trainable again, then recompile.
optimizer = keras.optimizers.Adam(learning_rate=3e-5)

model.layers[2].trainable = True
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Stop training once val_loss has gone 3 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=3)

# Save weights only, and only when val_accuracy improves on the best so far.
model_check_point = ModelCheckpoint(
    filepath='../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
def scheduler(epoch, lr):
    """Return the learning rate scaled by 0.95.

    Args:
        epoch: Epoch index; accepted for the callback signature but not used.
        lr: Current learning rate.

    Returns:
        ``lr * 0.95``.
    """
    decayed_lr = lr * 0.95
    return decayed_lr
# Apply `scheduler` through Keras' learning-rate callback.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Full fine-tuning run: up to 15 epochs on the whole training set, with 20%
# held out for validation through `validation_split`.
training_callbacks = [early_stop, model_check_point, lr_scheduler]
history = model.fit(
    x=[X_train_encoded['input_ids'], X_train_encoded['attention_mask']],
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    callbacks=training_callbacks,
)

Epoch 1/15

Epoch 00001: val_accuracy improved from -inf to 0.26076, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 2/15

Epoch 00002: val_accuracy improved from 0.26076 to 0.46304, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 3/15

Epoch 00003: val_accuracy improved from 0.46304 to 0.55043, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 4/15

Epoch 00004: val_accuracy improved from 0.55043 to 0.57245, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 5/15

Epoch 00005: val_accuracy improved from 0.57245 to 0.67605, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 6/15

Epoch 00006: val_accuracy did not improve from 0.67605
Epoch 7/15

Epoch 00007: val_accuracy improved from 0.67605 to 0.69992, saving model to ../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5
Epoch 8/15

Epoch 

In [18]:
# Rebuild the architecture with default settings and restore the weights that
# the checkpoint callback saved.
best_weights_path = '../model/bert_kor_base_for_classification_fine_tuning_mecab_0.h5'
best_model = set_model()
best_model.load_weights(best_weights_path)



In [19]:
# Predict class probabilities for the training set and take the most likely
# class per row, then report accuracy against the true labels.
train_probabilities = best_model.predict(
    [X_train_encoded['input_ids'], X_train_encoded['attention_mask']]
)
y_train_pred = train_probabilities.argmax(axis=1)

print(f'train accuracy: {accuracy_score(y_train, y_train_pred):.04f}')

train accuracy: 0.9005


In [21]:
# Predict class probabilities for the test set and keep the top class per row.
test_probabilities = best_model.predict(
    [X_test_encoded['input_ids'], X_test_encoded['attention_mask']]
)
y_test_pred = test_probabilities.argmax(axis=1)

In [22]:
# Write the predicted classes into the submission template and save it
# without the index column.
submission_path = '../submit/bert_kor_base_for_classification_mecab_0.csv'
sample_submission['topic_idx'] = y_test_pred
sample_submission.to_csv(submission_path, index=False)