In [1]:
import os
import re

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from transformers import BertTokenizer, TFBertModel
from tqdm import tqdm


tf.random.set_seed(9999)

In [2]:
def get_chk_dir_path()->str:
    return os.path.abspath('../chk')    

def get_dat_dir_path()->str:
    return os.path.abspath('../dat')

def get_train_csv_path()->str:
    dat_dir = get_dat_dir_path()
    # print(dat_dir)
    return os.path.join(dat_dir, "train.csv")

def get_test_csv_path()->str:
    dat_dir = get_dat_dir_path()
    return os.path.join(dat_dir, "test_x.csv")

def get_sample_sub_path()->str:
    dat_dir = get_dat_dir_path()
    return os.path.join(dat_dir, "sample_submission.csv")


In [33]:
BATCH_SIZE = 32
NUM_EPOCHS = 10
VALID_SPLIT = 0.2
MAX_LEN = 500
BERT_PRETRAIND_MODEL = "bert-large-cased"

In [3]:
#### 텐서플로2 자연어처리 7장

tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAIND_MODEL)


def bert_tokenizer(sent: str, max_length: int):

    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True
    )

    input_id = encoded_dict["input_ids"]
    attention_mask = encoded_dict["attention_mask"]
    token_type_id = encoded_dict["token_type_ids"]

    return input_id, attention_mask, token_type_id

In [4]:
encoded = tokenizer.encode("Hello darkness my old friend.")
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[101, 8667, 4542, 1139, 1385, 1910, 119, 102]
[CLS] Hello darkness my old friend. [SEP]


In [5]:
r = bert_tokenizer("What should I do for a mended heart, dear", 40)
tokenizer.decode(r[0])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'[CLS] What should I do for a mended heart, dear [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [6]:
train = pd.read_csv(get_train_csv_path())
test = pd.read_csv(get_test_csv_path())
sample_submission = pd.read_csv(get_sample_sub_path())

In [7]:
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [8]:
test.head()

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...


In [9]:
def transform_texts_to_bert_inputs(texts:pd.Series, MAX_LEN:int):
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for train_sent in tqdm(texts):
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)
            
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            token_type_ids.append(token_type_id)

        except Exception as e:
            print(e)
            print(train_sent)
            pass

    train_movie_input_ids = np.array(input_ids, dtype=int)
    train_movie_attention_masks = np.array(attention_masks, dtype=int)
    train_movie_type_ids = np.array(token_type_ids, dtype=int)
    train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)

    return train_movie_inputs
    

In [10]:

train_bert_inputs = transform_texts_to_bert_inputs(train["text"], 500)
test_bert_inputs = transform_texts_to_bert_inputs(test["text"], 500)

100%|██████████| 54879/54879 [00:42<00:00, 1302.47it/s]
100%|██████████| 19617/19617 [00:29<00:00, 665.23it/s]


In [11]:
# preprocesscingᆼ

In [12]:
#def alpha_num(txt:str)->str:
#    return re.sub(r"[^A-Za-z0-9 ]", "", txt)

#train["text"] = train["text"].apply(alpha_num)

In [13]:
#train

In [14]:
#train["text"] = train["text"].apply(alpha_num)
#test["text"] = test["text"].apply(alpha_num)

In [15]:
#train

In [16]:
# x_train = train["text"].values

In [17]:
# x_train

In [18]:
#x_train = np.array([x for x in train["text"]])
#x_test = np.array([x for x in test["text"]])
#y_train = np.array([x for x in train["author"]])

In [19]:
# x_train

In [20]:
# Modeling

In [21]:
#vocab_size = 20000
#embedding_dim = 128
##max_length = 500
#padding_type = "post"

In [22]:
#tokenizer = Tokenizer(num_words=vocab_size)
#tokenizer.fit_on_texts(x_train)
#word_index = tokenizer.word_index

In [23]:
#list(word_index)[:10]

In [24]:
#train_sequence = tokenizer.texts_to_sequences(x_train)
#train_padded = pad_sequences(train_sequence, padding=padding_type, maxlen=max_length)

#test_sequence = tokenizer.texts_to_sequences(x_test)
#test_padded = pad_sequences(test_sequence, padding=padding_type, maxlen=max_length)

In [25]:
#train_padded

In [31]:
############################################################################
## this model part is from 텐서플로2와 머신러닝으로 시작하는 자연어처리 07. 텍스트분류
############################################################################



In [41]:
class TFBertClassifier(tf.keras.Model):
    def __init__(self, model_name, dir_path, num_class):
        super(TFBertClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name="classifier")
    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

        

In [42]:
model = TFBertClassifier(model_name=BERT_PRETRAIND_MODEL, dir_path="bert_ckpt", num_class=5)


Some layers from the model checkpoint at bert-large-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [43]:
# 학습 준비하기
optimizer = tf.keras.optimizers.Adam(3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [46]:

#model.build(train_padded.shape)
#print(model.summary())            

In [49]:
model_name = "bert_classifier_en"

# overfitting을 막기 위한 ealrystop 추가
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)
# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)
# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\

checkpoint_path = os.path.join( get_chk_dir_path(), model_name, 'weights.h5')
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create path if exists
if os.path.exists(checkpoint_dir):
    print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
    
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

# 학습과 eval 시작
history = model.fit(train_bert_inputs, train["author"], epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                    validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

#steps_for_epoch

print(history.history)

/Users/daewonyoon/github/daewonyoon/misc/2020/10_dacon_predict_novelist/chk/tf2_bert_naver_movie -- Folder already exists 



NameError: name 'cls_model' is not defined

In [24]:


#num_epochs = 20
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", 
                                                    min_delta=0.0001, 
                                                    patience=2)

checkpoint_path = os.path.join( get_chk_dir_path(), model_name, "weights.h5" )
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                monitor="val_accuracy", 
                                                verbose=1, 
                                                save_best_only=True, 
                                                save_weights_only=True)

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)

history = model.fit(train_padded, y_train, 
                    batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS, 
                    verbose=2, 
                    validation_split=VALID_SPLIT, 
                    callbacks=[earlystop_callback, cp_callback])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.59457, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 41s - loss: 1.3976 - accuracy: 0.4045 - val_loss: 1.0473 - val_accuracy: 0.5946
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.59457 to 0.73069, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 40s - loss: 0.8422 - accuracy: 0.6870 - val_loss: 0.7382 - val_accuracy: 0.7307
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.73069 to 0.74289, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 40s - loss: 0.5750 - accuracy: 0.7934 - val_loss: 0.6951 - val_accuracy: 0.7429
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.74289
97/97 - 40s - loss: 0.4470 - accuracy: 0.8397 - val_loss: 0.7061 - val_accuracy: 0.7403
Epoch 5/10

Epoch 00005: val_accuracy did not impr

In [25]:
model.load_weights(checkpoint_path)

pred = model.predict(test_padded)

In [26]:
pred

array([[1.7089713e-02, 2.5950196e-01, 5.0526047e-01, 2.0786697e-01,
        1.0280924e-02],
       [1.1359072e-01, 7.1428502e-01, 2.9176155e-02, 5.2005421e-02,
        9.0942726e-02],
       [9.1241038e-01, 8.2863055e-02, 1.2584317e-03, 5.5661576e-04,
        2.9115642e-03],
       ...,
       [1.0660500e-03, 9.9865985e-01, 9.4617180e-06, 2.6123933e-04,
        3.3049264e-06],
       [3.9650548e-02, 9.2917782e-01, 3.3951139e-03, 2.6424885e-02,
        1.3515840e-03],
       [9.5895571e-01, 9.5104389e-03, 5.2798851e-03, 7.5844228e-03,
        1.8669603e-02]], dtype=float32)

In [27]:
pred.shape

(19617, 5)

In [28]:
test_padded.shape

(19617, 500)

In [29]:
sample_submission[[str(i) for i in range(5)]] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.017090,0.259502,0.505260,0.207867,0.010281
1,1,0.113591,0.714285,0.029176,0.052005,0.090943
2,2,0.912410,0.082863,0.001258,0.000557,0.002912
3,3,0.068767,0.001953,0.730606,0.006248,0.192426
4,4,0.487861,0.066780,0.018471,0.332050,0.094838
...,...,...,...,...,...,...
19612,19612,0.004014,0.994086,0.000073,0.001731,0.000096
19613,19613,0.113119,0.008009,0.029976,0.001777,0.847118
19614,19614,0.001066,0.998660,0.000009,0.000261,0.000003
19615,19615,0.039651,0.929178,0.003395,0.026425,0.001352


In [30]:
sample_submission.to_csv("submission_cnn.csv", index=False, encoding="utf-8")