In [6]:
import os
import re

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from transformers import BertTokenizer, TFBertModel


# Set TensorFlow's global random seed; only TensorFlow is seeded in this cell.
tf.random.set_seed(9999)

In [7]:
def get_chk_dir_path() -> str:
    """Return the absolute form of ``../chk``, resolved against the current working directory."""
    chk_dir = os.path.abspath('../chk')
    return chk_dir

def get_dat_dir_path() -> str:
    """Return the absolute form of ``../dat``, resolved against the current working directory."""
    dat_dir = os.path.abspath('../dat')
    return dat_dir

def get_train_csv_path() -> str:
    """Return the path of ``train.csv`` inside the directory given by ``get_dat_dir_path()``."""
    return os.path.join(get_dat_dir_path(), "train.csv")

def get_test_csv_path() -> str:
    """Return the path of ``test_x.csv`` inside the directory given by ``get_dat_dir_path()``."""
    return os.path.join(get_dat_dir_path(), "test_x.csv")

def get_sample_sub_path() -> str:
    """Return the path of ``sample_submission.csv`` inside the directory given by ``get_dat_dir_path()``."""
    return os.path.join(get_dat_dir_path(), "sample_submission.csv")


In [8]:
#### TensorFlow 2 natural language processing, chapter 7

# Module-level BERT tokenizer for the "bert-large-cased" checkpoint; bert_tokenizer() below reads it.
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")


def bert_tokenizer(sent: str, max_length: int):
    """Encode one sentence into fixed-length BERT model inputs.

    Uses the module-level ``tokenizer``.

    Args:
        sent: Raw sentence to encode.
        max_length: Length of every returned list. Shorter inputs are padded
            up to this length and longer inputs are truncated down to it.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        dictionary returned by ``tokenizer.encode_plus``.
    """
    encoded_dict = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=max_length,
        # `padding="max_length"` is the replacement for the deprecated
        # `pad_to_max_length=True` flag.
        padding="max_length",
        # Request truncation explicitly. Without it the library warns and
        # falls back to the 'longest_first' strategy, which is what True selects.
        truncation=True,
        return_attention_mask=True,
        # Ask for the segment ids explicitly because they are read below.
        return_token_type_ids=True,
    )

    input_id = encoded_dict["input_ids"]
    attention_mask = encoded_dict["attention_mask"]
    token_type_id = encoded_dict["token_type_ids"]

    return input_id, attention_mask, token_type_id

Downloading: 100%|██████████| 213k/213k [00:00<00:00, 354kB/s] 


In [9]:
# Round-trip a sample sentence through the tokenizer: text -> token ids -> text.
encoded = tokenizer.encode("Hello darkness my old friend.")
print(encoded)
# Decoding the ids makes the special tokens added by encode() visible.
decoded = tokenizer.decode(encoded)
print(decoded)

[101, 8667, 4542, 1139, 1385, 1910, 119, 102]
[CLS] Hello darkness my old friend. [SEP]


In [10]:
# Encode a sentence with max_length=40, then decode the ids again to inspect the padding.
r = bert_tokenizer("What should I do for a mended heart, dear", 40)
# r[0] is the first element of the returned tuple, i.e. the input ids.
tokenizer.decode(r[0])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'[CLS] What should I do for a mended heart, dear [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [3]:
# Load the three CSV files located via the path helpers: training data, test data and the submission template.
train = pd.read_csv(get_train_csv_path())
test = pd.read_csv(get_test_csv_path())
sample_submission = pd.read_csv(get_sample_sub_path())

In [4]:
# preprocessing

In [5]:
def alpha_num(txt: str) -> str:
    """Return ``txt`` with every character other than ASCII letters, digits and spaces removed."""
    disallowed = r"[^A-Za-z0-9 ]"
    cleaned = re.sub(disallowed, "", txt)
    return cleaned

# Overwrite the training text column with its cleaned form. Only `train` is cleaned in this cell.
# Re-applying alpha_num to already-cleaned text leaves it unchanged, so re-running this cell is harmless.
train["text"] = train["text"].apply(alpha_num)

In [6]:
# Display the training frame after alpha_num has been applied to its text column.
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


In [7]:
# English stopwords, all lower-case; remove_stopwords() compares lower-cased tokens against them.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Lookup set built once from the list above, so each membership test is a hash
# lookup instead of a linear scan of the list for every word.
_STOPWORD_SET = frozenset(stopwords)

def remove_stopwords(text:str) -> str:
    """Remove stopwords from ``text`` and collapse whitespace to single spaces.

    Args:
        text: Input string; it is split on runs of whitespace.

    Returns:
        The words whose lower-cased form is not a stopword, in their original
        order and original casing, joined by single spaces. Returns an empty
        string when no word survives.

    NOTE(review): entries containing an apostrophe (e.g. "he'd") cannot match
    text that has already been passed through ``alpha_num``, because that
    function strips apostrophes first.
    """
    # str.split() with no argument already drops surrounding whitespace from
    # every token, so no further strip() is needed.
    kept_words = [word for word in text.split() if word.lower() not in _STOPWORD_SET]
    return " ".join(kept_words)

In [8]:
# Run both cleaning steps over the text column of each frame:
# character filtering first, then stopword removal.
for _frame in (train, test):
    _frame["text"] = _frame["text"].apply(alpha_num).apply(remove_stopwords)

In [9]:
# Display the training frame after stopword removal.
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing Janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands Dont write an...,3
...,...,...,...
54874,54874,Mr Smith odin whispered hardly dared hope come,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister LUCY odin,1
54877,54877,wanted lend money,3


In [10]:
# x_train = train["text"].values

In [11]:
# x_train

In [12]:
# Turn the text columns and the author labels into NumPy arrays built from plain Python lists.
x_train = np.array(train["text"].tolist())
x_test = np.array(test["text"].tolist())
y_train = np.array(train["author"].tolist())

In [13]:
# x_train

In [14]:
# Modeling

In [15]:
# Hyper-parameters for tokenisation and padding.
vocab_size = 20000  # forwarded to the model config as "vocab_size"
embedding_dim = 128  # NOTE(review): the model config hard-codes "embedding_size": 256, so this value appears unused — confirm
max_length = 500  # passed to pad_sequences as maxlen
padding_type = "post"  # passed to pad_sequences as padding; padding goes after the tokens

In [16]:
#tokenizer = Tokenizer(num_words=vocab_size)
#tokenizer.fit_on_texts(x_train)
#word_index = tokenizer.word_index

In [17]:
#list(word_index)[:10]

['odin', 'not', 'said', 'no', 'one', 'mr', 'will', 'upon', 'now', 'man']

In [18]:
# Fit a Keras word-level tokenizer on the training text inside this cell.
# The name `tokenizer` is bound to a BertTokenizer earlier in the notebook, and
# that class has no `texts_to_sequences`, so a separately named object is used
# to keep the notebook runnable from a fresh kernel.
keras_tokenizer = Tokenizer(num_words=vocab_size)
keras_tokenizer.fit_on_texts(x_train)

# Convert each text into a list of word ids, then pad to a common width of max_length.
train_sequence = keras_tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(train_sequence, padding=padding_type, maxlen=max_length)

# The test text reuses the vocabulary fitted on the training text.
test_sequence = keras_tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequence, padding=padding_type, maxlen=max_length)

In [19]:
# Display the padded id matrix for the training text.
train_padded

array([[  141,  7259,    20, ...,     0,     0,     0],
       [  217,    58,   221, ...,     0,     0,     0],
       [  682,     5,    59, ...,     0,     0,     0],
       ...,
       [ 2407, 19738,   126, ...,     0,     0,     0],
       [  316,  3532,   164, ...,     0,     0,     0],
       [  203,     2,   811, ...,     0,     0,     0]])

In [20]:
############################################################################
## this model part is from chapter 07 ("Text classification") of the book
## "Getting started with natural language processing using TensorFlow 2 and machine learning"
############################################################################

# Training settings used by the fit cell, and the name of the checkpoint sub-directory.
model_name = "cnn_classifier_en"
BATCH_SIZE=512
NUM_EPOCHS=10
VALID_SPLIT=0.1
# Padded sequence width. NOTE(review): not referenced by the other cells shown here — confirm it is still needed.
MAX_LEN=train_padded.shape[1]

# Keyword arguments read by CNNClassifier.__init__.
kargs = {
    "model_name": model_name,
    "vocab_size": vocab_size,
    # NOTE(review): hard-coded here instead of using the `embedding_dim` constant — confirm which value is intended.
    "embedding_size": 256,
    "num_filters": 100,
    "dropout_rate": .5,
    "hidden_dimension": 250,
    # NOTE(review): presumably the number of author classes; the submission cell also writes 5 columns.
    "output_dimension": 5
}

In [None]:
class TBertClassifier(tf.keras.Model):
    """Classifier made of a pretrained BERT encoder, dropout and a dense layer.

    Args:
        model_name: Name or path passed to ``TFBertModel.from_pretrained``.
        dir_path: Cache directory passed to ``from_pretrained`` as ``cache_dir``.
        num_class: Number of output units of the classification layer.
    """

    def __init__(self, model_name, dir_path, num_class):
        super(TBertClassifier, self).__init__()

        self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        # Dropout rate and initializer spread are taken from the BERT config.
        self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)
        # The config attribute is `initializer_range`; the previous spelling
        # `initialier_range` raised AttributeError as soon as the model was built.
        self.classifier = tf.keras.layers.Dense(num_class,
                                                kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),
                                                name="classifier")

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Return the classifier output (no activation is applied) for a batch.

        Args:
            inputs: Forwarded to the BERT model as its first argument.
            attention_mask: Forwarded to the BERT model.
            token_type_ids: Forwarded to the BERT model.
            training: Passed to the dropout layer to switch dropout on or off.
        """
        outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Element 1 of the BERT outputs is used as the pooled representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
        return logits

        

In [21]:
class CNNClassifier(tf.keras.Model):
    """Text classifier built from parallel 1-D convolutions over an embedding.

    Keyword arguments read by the constructor: ``model_name``, ``vocab_size``,
    ``embedding_size``, ``num_filters``, ``dropout_rate``,
    ``hidden_dimension`` and ``output_dimension``.
    """

    def __init__(self, **kargs):
        super().__init__(name=kargs["model_name"])

        self.embedding = tf.keras.layers.Embedding(
            input_dim=kargs["vocab_size"],
            output_dim=kargs["embedding_size"],
        )

        # One convolution per kernel width; every kernel is max-norm constrained.
        conv_layers = []
        for width in (3, 4, 5, 6, 7):
            conv_layers.append(
                tf.keras.layers.Conv1D(
                    filters=kargs["num_filters"],
                    kernel_size=width,
                    padding="valid",
                    activation="relu",
                    kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
                )
            )
        self.conv_list = conv_layers

        self.pooling = tf.keras.layers.GlobalMaxPooling1D()
        self.dropout = tf.keras.layers.Dropout(kargs["dropout_rate"])
        self.fc1 = tf.keras.layers.Dense(
            units=kargs["hidden_dimension"],
            activation="relu",
            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
        )
        self.fc2 = tf.keras.layers.Dense(
            units=kargs["output_dimension"],
            activation="softmax",
            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.),
        )

    def call(self, x):
        """Map a batch of token ids to softmax outputs."""
        embedded = self.dropout(self.embedding(x))
        # Each convolution is max-pooled, then the pooled results are concatenated on the last axis.
        pooled = [self.pooling(conv(embedded)) for conv in self.conv_list]
        features = tf.concat(pooled, axis=-1)
        # The same dropout layer is applied again after the hidden dense layer.
        hidden = self.dropout(self.fc1(features))
        return self.fc2(hidden)


In [22]:
# Build the CNN classifier from the hyper-parameters collected in `kargs`.
model = CNNClassifier(**kargs)


In [23]:
# Sparse categorical cross-entropy is used with the integer labels in y_train.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
# Build with the padded input shape so the layer weights exist before summarising.
model.build(train_padded.shape)
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

Model: "cnn_classifier_en"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  5120000   
_________________________________________________________________
conv1d (Conv1D)              multiple                  76900     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  102500    
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  128100    
_________________________________________________________________
conv1d_3 (Conv1D)            multiple                  153700    
_________________________________________________________________
conv1d_4 (Conv1D)            multiple                  179300    
_________________________________________________________________
global_max_pooling1d (Global multiple            

In [24]:
# Stop training once validation accuracy has gone `patience` epochs without
# improving by at least `min_delta`.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                      min_delta=0.0001,
                                                      patience=2)

checkpoint_path = os.path.join(get_chk_dir_path(), model_name, "weights.h5")
checkpoint_dir = os.path.dirname(checkpoint_path)
# exist_ok=True already makes this a no-op when the directory exists, so no
# separate os.path.exists() check is needed.
os.makedirs(checkpoint_dir, exist_ok=True)

# Write the weights only when validation accuracy improves, so the file
# always holds the best epoch seen so far.
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor="val_accuracy",
                                                 verbose=1,
                                                 save_best_only=True,
                                                 save_weights_only=True)

history = model.fit(train_padded, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    verbose=2,
                    validation_split=VALID_SPLIT,
                    callbacks=[earlystop_callback, cp_callback])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.59457, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 41s - loss: 1.3976 - accuracy: 0.4045 - val_loss: 1.0473 - val_accuracy: 0.5946
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.59457 to 0.73069, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 40s - loss: 0.8422 - accuracy: 0.6870 - val_loss: 0.7382 - val_accuracy: 0.7307
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.73069 to 0.74289, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\cnn_classifier_en\weights.h5
97/97 - 40s - loss: 0.5750 - accuracy: 0.7934 - val_loss: 0.6951 - val_accuracy: 0.7429
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.74289
97/97 - 40s - loss: 0.4470 - accuracy: 0.8397 - val_loss: 0.7061 - val_accuracy: 0.7403
Epoch 5/10

Epoch 00005: val_accuracy did not impr

In [25]:
# Load the weights stored at checkpoint_path (the file written by the checkpoint callback) before predicting.
model.load_weights(checkpoint_path)

# Model outputs for every padded test sequence.
pred = model.predict(test_padded)

In [26]:
# Display the prediction array.
pred

array([[1.7089713e-02, 2.5950196e-01, 5.0526047e-01, 2.0786697e-01,
        1.0280924e-02],
       [1.1359072e-01, 7.1428502e-01, 2.9176155e-02, 5.2005421e-02,
        9.0942726e-02],
       [9.1241038e-01, 8.2863055e-02, 1.2584317e-03, 5.5661576e-04,
        2.9115642e-03],
       ...,
       [1.0660500e-03, 9.9865985e-01, 9.4617180e-06, 2.6123933e-04,
        3.3049264e-06],
       [3.9650548e-02, 9.2917782e-01, 3.3951139e-03, 2.6424885e-02,
        1.3515840e-03],
       [9.5895571e-01, 9.5104389e-03, 5.2798851e-03, 7.5844228e-03,
        1.8669603e-02]], dtype=float32)

In [27]:
# Shape of the prediction array, to compare against the test input shape.
pred.shape

(19617, 5)

In [28]:
# Shape of the padded test input.
test_padded.shape

(19617, 500)

In [29]:
# Column labels "0" to "4"; `pred` is written into these five columns of the template.
class_columns = list(map(str, range(5)))
sample_submission[class_columns] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.017090,0.259502,0.505260,0.207867,0.010281
1,1,0.113591,0.714285,0.029176,0.052005,0.090943
2,2,0.912410,0.082863,0.001258,0.000557,0.002912
3,3,0.068767,0.001953,0.730606,0.006248,0.192426
4,4,0.487861,0.066780,0.018471,0.332050,0.094838
...,...,...,...,...,...,...
19612,19612,0.004014,0.994086,0.000073,0.001731,0.000096
19613,19613,0.113119,0.008009,0.029976,0.001777,0.847118
19614,19614,0.001066,0.998660,0.000009,0.000261,0.000003
19615,19615,0.039651,0.929178,0.003395,0.026425,0.001352


In [30]:
# Write the filled-in template to the current working directory, without the DataFrame index.
sample_submission.to_csv("submission_cnn.csv", index=False, encoding="utf-8")