In [1]:
import os
import re

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


# Seed TensorFlow's global RNG before any model is built. Only TensorFlow is
# seeded here; NumPy and Python's `random` are not.
tf.random.set_seed(9999)

In [2]:
def get_chk_dir_path()->str:
    """Return the absolute form of ``../chk``, resolved against the current working directory."""
    chk_dir = os.path.abspath('../chk')
    return chk_dir

def get_dat_dir_path()->str:
    """Return the absolute form of ``../dat``, resolved against the current working directory."""
    dat_dir = os.path.abspath('../dat')
    return dat_dir

def get_train_csv_path()->str:
    """Return the path of ``train.csv`` inside the data directory."""
    return os.path.join(get_dat_dir_path(), "train.csv")

def get_test_csv_path()->str:
    """Return the path of ``test_x.csv`` inside the data directory."""
    return os.path.join(get_dat_dir_path(), "test_x.csv")

def get_sample_sub_path()->str:
    """Return the path of ``sample_submission.csv`` inside the data directory."""
    return os.path.join(get_dat_dir_path(), "sample_submission.csv")


In [3]:
# Load the three input CSVs (train, test, sample submission). The path helpers
# only build strings, so all three paths are resolved before any file is read.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (get_train_csv_path(), get_test_csv_path(), get_sample_sub_path())
)

In [4]:
# Preprocessing

In [5]:
def alpha_num(txt:str)->str:
    """Delete every character that is not an ASCII letter, an ASCII digit or a space.

    Characters are removed, not replaced, so text on either side of a removed
    character is joined together (``"well-wisher"`` -> ``"wellwisher"``).
    """
    disallowed = re.compile(r"[^A-Za-z0-9 ]")
    return disallowed.sub("", txt)

# Clean the training text in place (only `train` is touched in this cell).
# A later cell runs alpha_num again on this column together with the test
# frame; the second pass changes nothing because the first already removed
# every character the pattern rejects.
train["text"] = train["text"].apply(alpha_num)

In [6]:
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


In [7]:
# English stop words (all lower-case). Kept as a list so any code that indexes
# or iterates it keeps working.
# NOTE(review): the entries containing apostrophes ("he'd", "i'm", ...) can only
# match text that still has its apostrophes. In this notebook alpha_num() runs
# before remove_stopwords() and strips them, so those entries never match.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Hashed snapshot of the list for O(1) membership tests. It is rebuilt whenever
# this cell is re-run; mutating `stopwords` afterwards does not affect it.
_STOPWORD_SET = frozenset(stopwords)

def remove_stopwords(text:str) -> str:
    """Drop stop words from ``text`` and join the remaining tokens with single spaces.

    The text is split on runs of whitespace. A token is dropped when its
    lower-cased form is a stop word; surviving tokens keep their original casing.

    Args:
        text: Input string.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # str.split() with no argument already yields whitespace-free tokens, so
    # no per-token strip() is needed.
    return " ".join(token for token in text.split() if token.lower() not in _STOPWORD_SET)

In [8]:
# Run the same two-step cleanup (character filter, then stop-word removal)
# over the text column of both frames.
for frame in (train, test):
    frame["text"] = frame["text"].apply(alpha_num).apply(remove_stopwords)

In [9]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing Janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen odin flung hands Dont write an...,3
...,...,...,...
54874,54874,Mr Smith odin whispered hardly dared hope come,2
54875,54875,told plan captain us settled details accomplis...,4
54876,54876,sincere wellwisher friend sister LUCY odin,1
54877,54877,wanted lend money,3


In [10]:
# x_train = train["text"].values

In [11]:
# x_train

In [12]:
# Materialise the model inputs and labels as NumPy arrays built from plain
# Python lists of the column values.
x_train = np.array(train["text"].tolist())
x_test = np.array(test["text"].tolist())
y_train = np.array(train["author"].tolist())

In [13]:
# x_train

In [14]:
# Modeling

In [15]:
# Text-vectorisation and embedding hyperparameters.
vocab_size = 20_000     # Tokenizer `num_words` and first argument of the Embedding layer
embedding_dim = 256     # second argument of the Embedding layer
max_length = 500        # `maxlen` for pad_sequences and `input_length` of the Embedding layer
padding_type = "post"   # `padding` mode handed to pad_sequences

In [16]:
# Build the vocabulary from the training texts only; the test texts are later
# encoded with this same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)
# Vocabulary learned by the tokenizer, keyed by word (the next cell lists the first ten keys).
word_index = tokenizer.word_index

In [17]:
list(word_index)[:10]

['odin', 'not', 'said', 'no', 'one', 'mr', 'will', 'upon', 'now', 'man']

In [18]:
# Encode each training text as a sequence of integer ids, then pad every
# sequence to max_length columns using padding_type ("post": the displayed
# array shows the zeros at the end of each row).
train_sequence = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(train_sequence, padding=padding_type, maxlen=max_length)

# Same encoding for the test texts, reusing the tokenizer fitted on the training data.
test_sequence = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequence, padding=padding_type, maxlen=max_length)

In [19]:
train_padded

array([[  141,  7259,    20, ...,     0,     0,     0],
       [  217,    58,   221, ...,     0,     0,     0],
       [  682,     5,    59, ...,     0,     0,     0],
       ...,
       [ 2407, 19738,   126, ...,     0,     0,     0],
       [  316,  3532,   164, ...,     0,     0,     0],
       [  203,     2,   811, ...,     0,     0,     0]])

In [20]:
# Run configuration for training.
model_name = "rnn_bidi_classifier_en"   # sub-directory of the checkpoint dir that holds the weights file
BATCH_SIZE = 128                        # batch_size for model.fit
NUM_EPOCHS = 10                         # epochs for model.fit (early stopping may end training sooner)
VALID_SPLIT = 0.1                       # validation_split for model.fit

In [21]:
# Text classifier: embedding -> bidirectional LSTM -> dense head ending in a
# 5-way softmax (one output per column "0".."4" of the submission file).
model = tf.keras.Sequential([
    # Token ids -> embedding_dim-wide vectors; the summary reports (None, 500, 256).
    # NOTE(review): no mask_zero is passed while the inputs are post-padded with
    # zeros, so the padding ids presumably reach the LSTM like ordinary tokens — confirm intended.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # 64 units per direction; the summary reports a (None, 128) output.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # Disabled experiment: dropout directly after the recurrent layer.
    #tf.keras.layers.Dropout(rate=.2),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dropout(rate=.5),
    tf.keras.layers.Dense(5, activation="softmax")
])

In [22]:
# Integer class labels (y_train is built from the "author" column) paired with
# a softmax output, hence the sparse categorical cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
# summary() prints the table itself and returns None, so wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 256)          5120000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               164352    
_________________________________________________________________
dense (Dense)                (None, 256)               33024     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 1285      
Total params: 5,318,661
Trainable params: 5,318,661
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
# Stop training once val_accuracy has gone `patience` epochs without improving
# by at least `min_delta`.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", 
                                                      min_delta=0.0001, 
                                                      patience=2)

# Weights are written to <checkpoint dir>/<model_name>/weights.h5.
checkpoint_path = os.path.join(get_chk_dir_path(), model_name, "weights.h5")
checkpoint_dir = os.path.dirname(checkpoint_path)
# exist_ok=True already tolerates an existing directory, so no separate
# os.path.exists() check is needed. Unlike the old guard, this raises
# immediately if the path exists but is not a directory.
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep only the weights of the epoch with the best val_accuracy so far.
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                 monitor="val_accuracy", 
                                                 verbose=1, 
                                                 save_best_only=True, 
                                                 save_weights_only=True)

history = model.fit(train_padded, y_train, 
                    batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS, 
                    verbose=2, 
                    validation_split=VALID_SPLIT, 
                    callbacks=[earlystop_callback, cp_callback])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.72522, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\rnn_tf_tutorial\weights.h5
386/386 - 38s - loss: 0.9915 - accuracy: 0.6063 - val_loss: 0.7320 - val_accuracy: 0.7252
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.72522 to 0.73615, saving model to d:\github\daewonyoon\misc\2020\10_dacon_predict_novelist\chk\rnn_tf_tutorial\weights.h5
386/386 - 37s - loss: 0.5593 - accuracy: 0.7961 - val_loss: 0.6940 - val_accuracy: 0.7362
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.73615
386/386 - 37s - loss: 0.4270 - accuracy: 0.8459 - val_loss: 0.7423 - val_accuracy: 0.7356
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.73615
386/386 - 37s - loss: 0.3567 - accuracy: 0.8711 - val_loss: 0.8008 - val_accuracy: 0.7331


In [24]:
# Reload the checkpointed weights before predicting: the checkpoint callback was
# configured with save_best_only=True, while the in-memory model holds the
# weights of the last epoch that ran.
model.load_weights(checkpoint_path)

# One row of five class probabilities per test sample (shape (19617, 5) in the cells below).
pred = model.predict(test_padded)

In [25]:
pred

array([[3.6490966e-02, 3.9549160e-01, 4.8912516e-01, 5.6070533e-02,
        2.2821754e-02],
       [2.7444288e-01, 1.9465953e-01, 1.3820781e-02, 3.9311837e-02,
        4.7776496e-01],
       [9.8951030e-01, 3.7105076e-03, 2.3614161e-03, 5.1012379e-04,
        3.9076610e-03],
       ...,
       [1.7023237e-03, 9.9762338e-01, 4.9616214e-05, 5.6828529e-04,
        5.6355719e-05],
       [1.3981982e-03, 9.9822432e-01, 4.7206857e-05, 2.4869232e-04,
        8.1520251e-05],
       [9.4465178e-01, 1.6832378e-03, 6.5315343e-03, 6.0424476e-04,
        4.6529327e-02]], dtype=float32)

In [26]:
pred.shape

(19617, 5)

In [27]:
test_padded.shape

(19617, 500)

In [28]:
# Overwrite the five class-probability columns ("0".."4") of the submission
# frame with the model's predictions, then display the frame.
class_columns = list(map(str, range(5)))
sample_submission[class_columns] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.036491,0.395492,0.489125,0.056071,0.022822
1,1,0.274443,0.194660,0.013821,0.039312,0.477765
2,2,0.989510,0.003711,0.002361,0.000510,0.003908
3,3,0.003269,0.004962,0.922927,0.003131,0.065710
4,4,0.074457,0.014315,0.006470,0.901191,0.003567
...,...,...,...,...,...,...
19612,19612,0.002975,0.994405,0.000143,0.001402,0.001074
19613,19613,0.005169,0.000348,0.011586,0.000167,0.982729
19614,19614,0.001702,0.997623,0.000050,0.000568,0.000056
19615,19615,0.001398,0.998224,0.000047,0.000249,0.000082


In [29]:
# Write the filled-in submission to the current working directory, without the
# DataFrame's row index.
sample_submission.to_csv("submission_rnn_bidi.csv", index=False, encoding="utf-8")