In [1]:
import os
import re

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm


tf.random.set_seed(9999)

In [2]:
def get_chk_dir_path()->str:
    return os.path.abspath('../chk')    

def get_dat_dir_path()->str:
    return os.path.abspath('../dat')

def get_train_csv_path()->str:
    dat_dir = get_dat_dir_path()
    # print(dat_dir)
    return os.path.join(dat_dir, "train.csv")

def get_test_csv_path()->str:
    dat_dir = get_dat_dir_path()
    return os.path.join(dat_dir, "test_x.csv")

def get_sample_sub_path()->str:
    dat_dir = get_dat_dir_path()
    return os.path.join(dat_dir, "sample_submission.csv")


In [3]:
BATCH_SIZE = 8
NUM_EPOCHS = 5
VALID_SPLIT = 0.2
MAX_LEN = 200
BERT_PRETRAIND_MODEL = "bert-base-cased"

In [4]:
#### 텐서플로2 자연어처리 7장

tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIND_MODEL)


def bert_tokenizer(sent: str, max_length: int):

    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True
    )

    input_id = encoded_dict["input_ids"]
    attention_mask = encoded_dict["attention_mask"]
    token_type_id = encoded_dict["token_type_ids"]

    return input_id, attention_mask, token_type_id

In [5]:
encoded = tokenizer.encode("Hello darkness my old friend.")
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[101, 8667, 4542, 1139, 1385, 1910, 119, 102]
[CLS] Hello darkness my old friend. [SEP]


In [6]:
r = bert_tokenizer("What should I do for a mended heart, dear", 40)
tokenizer.decode(r[0])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'[CLS] What should I do for a mended heart, dear [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [7]:
train = pd.read_csv(get_train_csv_path())
test = pd.read_csv(get_test_csv_path())
sample_submission = pd.read_csv(get_sample_sub_path())

In [8]:
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [9]:
test.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [10]:
def transform_texts_to_bert_inputs(texts:pd.Series, MAX_LEN:int):
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for train_sent in tqdm(texts):
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
            
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)

        except Exception as e:
            print(e)
            print(train_sent)
            pass

    train_movie_input_ids = np.array(input_ids, dtype=int)
    train_movie_attention_masks = np.array(attention_masks, dtype=int)
    train_movie_type_ids = np.array(token_type_ids, dtype=int)
    train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)

    return train_movie_inputs
    

In [11]:

train_bert_inputs = transform_texts_to_bert_inputs(train["text"], MAX_LEN)
test_bert_inputs = transform_texts_to_bert_inputs(test["text"], MAX_LEN)

100%|██████████| 54879/54879 [00:34<00:00, 1571.37it/s]
100%|██████████| 19617/19617 [00:23<00:00, 834.16it/s]


In [12]:
############################################################################
## this model part is from 텐서플로2와 머신러닝으로 시작하는 자연어처리 07. 텍스트분류
############################################################################



In [13]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                activation="softmax",
                                                name="classifier")
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

        

In [14]:
model = TFBertClassifier(model_name=BERT_PRETRAIND_MODEL, dir_path="bert_ckpt", num_class=5)


Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
# 학습 준비하기
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=[metric])

In [16]:

#model.build(train_padded.shape)
#print(model.summary())            

In [17]:
model_name = "bert_classifier_en"

# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\

checkpoint_path = os.path.join( get_chk_dir_path(), model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

# 학습과 eval 시작
history = model.fit(train_bert_inputs, train["author"], epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                    validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

#steps_for_epoch

print(history.history)

d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\bert_classifier_en -- Folder already exists 

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.85031, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\bert_classifier_en\weights.h5
Epoch 2/5
Epoch 00002: val_accuracy improved from 0.85031 to 0.87035, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\bert_classifier_en\weights.h5
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.87035
Epoch 4/5
Epoch 00004: val_accuracy did not improve from 0.87035
{'loss': [0.6199377775192261, 0.3051068186759949, 0.17255185544490814, 0.11439233273267746], 'accuracy': [0.7681935429573059, 0.8931735754013062, 0.9405279755592346, 0.9611871838569641], 'val_loss': [0.4247288703918457, 0.3788391053676605, 0.42822933197021484, 0.47887152433395386], 'val_accuracy': [0.8503097891807556, 0.8703535199165344, 0.8620626926422119, 0.8638848662376404]}


In [18]:

"""
#num_epochs = 20
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", 
                                                    min_delta=0.0001, 
                                                    patience=2)

checkpoint_path = os.path.join( get_chk_dir_path(), model_name, "weights.h5" )
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                monitor="val_accuracy", 
                                                verbose=1, 
                                                save_best_only=True, 
                                                save_weights_only=True)

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)

history = model.fit(train_padded, y_train, 
                    batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS, 
                    verbose=2, 
                    validation_split=VALID_SPLIT, 
                    callbacks=[earlystop_callback, cp_callback])
"""                    

'\n#num_epochs = 20\nearlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", \n                                                    min_delta=0.0001, \n                                                    patience=2)\n\ncheckpoint_path = os.path.join( get_chk_dir_path(), model_name, "weights.h5" )\ncheckpoint_dir = os.path.dirname(checkpoint_path)\n\ncp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, \n                                                monitor="val_accuracy", \n                                                verbose=1, \n                                                save_best_only=True, \n                                                save_weights_only=True)\n\nif not os.path.exists(checkpoint_dir):\n    os.makedirs(checkpoint_dir, exist_ok=True)\n\nhistory = model.fit(train_padded, y_train, \n                    batch_size=BATCH_SIZE, \n                    epochs=NUM_EPOCHS, \n                    verbose=2, \n                    vali

In [19]:
#model.load_weights(checkpoint_path)

pred = model.predict(test_bert_inputs)

In [20]:
pred

array([[1.5885371e-03, 3.3860454e-01, 6.5654796e-01, 2.7213146e-03,
        5.3759245e-04],
       [2.1667460e-04, 9.9912483e-01, 1.8147123e-04, 3.3816733e-04,
        1.3892641e-04],
       [9.9993896e-01, 3.5287536e-05, 2.3630694e-06, 1.4706765e-05,
        8.7266753e-06],
       ...,
       [6.0088641e-05, 9.9985850e-01, 3.3370197e-05, 3.2709628e-05,
        1.5358351e-05],
       [3.5584158e-05, 9.9962091e-01, 1.4980376e-04, 8.2900791e-05,
        1.1066598e-04],
       [9.9980778e-01, 1.9327481e-05, 6.0158836e-06, 1.0730881e-04,
        5.9550835e-05]], dtype=float32)

In [21]:
pred.shape

(19617, 5)

In [22]:
#test_padded.shape

In [23]:
sample_submission[[str(i) for i in range(5)]] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.001589,0.338605,0.656548,0.002721,0.000538
1,1,0.000217,0.999125,0.000181,0.000338,0.000139
2,2,0.999939,0.000035,0.000002,0.000015,0.000009
3,3,0.000699,0.024244,0.972810,0.002079,0.000168
4,4,0.988079,0.000150,0.001636,0.008328,0.001807
...,...,...,...,...,...,...
19612,19612,0.000036,0.999900,0.000025,0.000020,0.000020
19613,19613,0.000194,0.000013,0.000581,0.000251,0.998960
19614,19614,0.000060,0.999858,0.000033,0.000033,0.000015
19615,19615,0.000036,0.999621,0.000150,0.000083,0.000111


In [24]:
sample_submission.to_csv("submission_bert.csv", index=False, encoding="utf-8")