In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import transformers

from sklearn.metrics import accuracy_score

In [2]:
data_path = '../data/'

train_origin = pd.read_csv(data_path+'train_data.csv')
test_origin = pd.read_csv(data_path+'test_data.csv')
topic_dict_origin = pd.read_csv(data_path+'topic_dict.csv')
sample_submission_origin = pd.read_csv(data_path+'sample_submission.csv')

In [3]:
train = train_origin.copy()
test = test_origin.copy()
topic_dict = topic_dict_origin.copy()
sample_submission = sample_submission_origin.copy()

In [4]:
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased', 
    cache_dir = '../tokenizer/bert-base-multilingual-cased',
    do_lower_case = False
)

In [5]:
data_config = {}
data_config['max_length'] = 44

In [6]:
def cleaning_text(text):
    text = re.sub('[^a-z가-힣]', ' ', text.lower())
    text = re.sub('[\s]+', ' ', text)
    return text

In [7]:
X_train = train['title'].apply(cleaning_text)
X_train = X_train.values.tolist()

X_train_encoded = tokenizer(
    X_train,
    padding = 'max_length',
    truncation = True,
    max_length = data_config['max_length'],
    return_tensors = 'tf'
)

In [8]:
y_train = train['topic_idx'].values

In [9]:
bert_config = transformers.BertConfig.from_pretrained(
    'bert-base-multilingual-cased',
    cache_dir = '../model/bert-base-multilingual-cased/cache/',
    output_hidden_states=False,
    output_attentions=False,
    use_cache = False
)

bert_model = transformers.TFBertModel.from_pretrained(
    'bert-base-multilingual-cased', 
    config = bert_config
)

Some layers from the model checkpoint at bert-base-multilingual-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [10]:
def set_model(print_summary=False):

    input_ids = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='input_ids')
    input_masks = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='attention_masks')
    bert_outputs = bert_model([input_ids, input_masks])[1]
    dropout_0 = Dropout(0.1)(bert_outputs)
    dense_0 = Dense(64, activation='relu', name='dense_0')(dropout_0)
    dropout_1 = Dropout(0.1)(dense_0)
    dense_1 = Dense(32, activation='relu', name='dense_1')(dropout_1)
    dropout_2 = Dropout(0.1)(dense_1)
    outputs = Dense(7, activation='softmax', name='outputs')(dropout_2)
    
#     input_ids = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='input_ids')
#     input_masks = Input(batch_shape=(None, data_config['max_length']), dtype=tf.int32, name='attention_masks')
#     bert_outputs = bert_model([input_ids, input_masks])[1]
#     dropout_0 = Dropout(0.1)(bert_outputs)
#     outputs = Dense(7, activation='softmax', name='outputs')(dropout_0)

    model = Model(
        inputs = [input_ids, input_masks],
        outputs = outputs,
        name = 'Bert_Classification'
    )

    # bert_model.trainable = False  # bert는 train 해제. pre-trained 그대로 사용

    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(
        loss = 'sparse_categorical_crossentropy',
        optimizer = optimizer, 
        metrics = ['accuracy']
    )

    if print_summary:
        model.summary(line_length=150)
    
    return model

In [11]:
model = set_model(print_summary=True)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f8bb0074110> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f8bb0074110> is not a module, class, method, function, traceback, frame, or code object

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "Bert_Classification"
______________________________________________________________________________________________________________________________________________________
Layer (type)                                     Output Shape                     Param #           Connected t

In [12]:
early_stop = EarlyStopping(
    monitor = 'val_loss',
    patience = 3
)
model_check_point = ModelCheckpoint(
    filepath = '../model/bert_fine_tuning_0.h5',
    monitor = 'val_accuracy',
    verbose = 1,
    save_best_only = True,
    save_weights_only = True
)
def scheduler(epoch, lr):
    return lr * 0.95
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

history = model.fit(
    x = [X_train_encoded['input_ids'], X_train_encoded['attention_mask']], 
    y = y_train,
    epochs = 10,
    batch_size = 32,
    validation_split = 0.2,
    callbacks = [early_stop, model_check_point, lr_scheduler]
)

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.70858, saving model to ../model/bert_fine_tuning_0.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.70858 to 0.75271, saving model to ../model/bert_fine_tuning_0.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.75271 to 0.78568, saving model to ../model/bert_fine_tuning_0.h5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.78568
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.78568
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.78568


In [13]:
best_model = set_model()
best_model.load_weights('../model/bert_fine_tuning_0.h5')



In [14]:
y_train_pred = best_model.predict([X_train_encoded['input_ids'], X_train_encoded['attention_mask']])
y_train_pred = np.argmax(y_train_pred, axis=1)

print(f'train accuracy: {accuracy_score(y_train, y_train_pred):.04f}')

train accuracy: 0.8888


In [15]:
X_test = test['title']
X_test = X_test.values.tolist()

X_test_encoded = tokenizer(
    X_test,
    padding = 'max_length',
    truncation = True,
    max_length = data_config['max_length'],
    return_tensors = 'tf'
)

In [16]:
y_test_pred = best_model.predict([X_test_encoded['input_ids'], X_test_encoded['attention_mask']])
y_test_pred = np.argmax(y_test_pred, axis=1)

sample_submission['topic_idx'] = y_test_pred

In [17]:
# sample_submission.to_csv('../submit/bert_0.csv', index=False)