In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.tag import pos_tag
# Fetch the NLTK resources needed by word_tokenize ('punkt') and pos_tag
# ('averaged_perceptron_tagger').
# NOTE(review): the 'stopwords' corpus imported above is not downloaded here.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Work from the competition folder so the relative CSV paths used later resolve.
# NOTE(review): hard-coded Google Drive path — presumably Colab with Drive
# already mounted; confirm before running elsewhere.
os.chdir('/content/drive/My Drive/dacon_/')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def alpha_num(text):
    """Replace every character that is not an ASCII letter, a digit or '/' with a space."""
    disallowed = re.compile(r'[^A-Za-z0-9/]')
    return disallowed.sub(' ', text)

def tokenize(text):
    """Return `text` as space-separated "token/TAG" pairs.

    The text is split with word_tokenize and each token is paired with the
    tag that pos_tag assigns to it.
    """
    tagged = pos_tag(word_tokenize(text))
    return " ".join("/".join((word, tag)) for word, tag in tagged)

# def tokenize(text):
#     return " ".join(word_tokenize(text))

def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used (assumes the
            corpus is English).

    Returns:
        The remaining tokens joined by single spaces. Matching is
        case-insensitive; kept tokens retain their original casing.
    """
    if stop_words is None:
        # `stopwords` is NLTK's corpus reader, not a collection of words, so
        # the former `word not in stopwords` test raised TypeError. Ask the
        # reader for its word list instead.
        try:
            stop_words = stopwords.words('english')
        except LookupError:
            # The corpus has not been downloaded in this environment yet.
            nltk.download('stopwords')
            stop_words = stopwords.words('english')

    # A set gives constant-time membership checks.
    stop_words = set(stop_words)
    return " ".join(tok for tok in text.split() if tok.lower() not in stop_words)


In [None]:
# Load the training set and normalise every text: lower-case it, blank out
# punctuation with alpha_num, then attach POS tags with tokenize.
df = pd.read_csv('train.csv')
# For a run without POS tags, return alpha_num(raw) alone from the lambda.
df['text'] = df['text'].str.lower().apply(lambda raw: tokenize(alpha_num(raw)))

In [None]:
# Preview the first rows to check the "token/TAG" formatting of the text column.
df.head()

Unnamed: 0,index,text,author
0,0,he/PRP was/VBD almost/RB choking/VBG there/EX ...,3
1,1,your/PRP$ sister/NN asked/VBD for/IN it/PRP i/...,2
2,2,she/PRP was/VBD engaged/VBN one/CD day/NN as/I...,1
3,3,the/DT captain/NN was/VBD in/IN the/DT porch/N...,4
4,4,have/VB mercy/VBN gentlemen/NNS odin/RB flung/...,3


In [None]:
# Vocabulary cap for the tokenizer (also the Embedding input size further down)
# and the fixed length every sequence is padded/truncated to.
vocab_size = 40000
max_len = 300

# NOTE(review): Tokenizer's default `filters` include '/', so each "word/TAG"
# pair is presumably split into two separate tokens — confirm this is intended.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

x_train = pad_sequences(sequences, maxlen=max_len)
y_train = df['author'].to_numpy()

# 모델 1 (Simple Conv1D)

In [None]:
# Recorded results for this model:
#   val_loss: 0.7529
#   val_loss: 0.8304 (presumably without the NLTK POS-tagging step)
embedding_size = 100
filter = 64

# Embedding -> one Conv1D -> global max pooling -> small dense classifier.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_size, input_shape=(max_len,)))
model.add(keras.layers.Conv1D(filter, 3, padding="same", activation="relu"))
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dropout(.3))
model.add(
    keras.layers.Dense(32, activation="relu", kernel_regularizer=keras.regularizers.l2())
)
model.add(keras.layers.Dense(5, activation="softmax"))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(0.001),
    metrics=['acc'],
)
model.summary()


Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_31 (Embedding)     (None, 300, 100)          4000000   
_________________________________________________________________
conv1d_71 (Conv1D)           (None, 300, 64)           19264     
_________________________________________________________________
global_max_pooling1d_49 (Glo (None, 64)                0         
_________________________________________________________________
dropout_40 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_38 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_39 (Dense)             (None, 5)                 165       
Total params: 4,021,509
Trainable params: 4,021,509
Non-trainable params: 0
___________________________________________

# 모델 2 (Bidirectional LSTM)

In [None]:
# Model 2: two stacked bidirectional LSTMs over a learned embedding; the
# sequence output is summarised by max- and average-pooling, then classified
# by a small dense head.
embedding_size = 32
filter = 64  # LSTM units per direction; the dense head uses filter * 2 units

x_input = keras.Input((max_len,))
embedding = keras.layers.Embedding(vocab_size, embedding_size)(x_input)
do1 = keras.layers.SpatialDropout1D(.5)(embedding)

lstm1 = keras.layers.Bidirectional(keras.layers.LSTM(filter, return_sequences=True))(do1)
lstm2 = keras.layers.Bidirectional(keras.layers.LSTM(filter, return_sequences=True))(lstm1)

hidden = keras.layers.Concatenate()([
    keras.layers.GlobalMaxPooling1D()(lstm2),
    keras.layers.GlobalAveragePooling1D()(lstm2),
])
# NOTE(review): these two Dense layers have no activation (linear) — confirm
# that is intended.
d1 = keras.layers.Dense(filter * 2)(hidden)
do3 = keras.layers.Dropout(.3)(d1)
d2 = keras.layers.Dense(filter * 2)(do3)
do4 = keras.layers.Dropout(.3)(d2)
# Softmax (not sigmoid): the five classes are mutually exclusive and
# sparse_categorical_crossentropy expects each row to be a probability
# distribution; sigmoid outputs do not sum to 1.
output = keras.layers.Dense(5, activation='softmax')(do4)

model = keras.models.Model(x_input, output)

model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['acc'])
model.summary()
# keras.utils.plot_model(model)

# 모델 3 (Multi Conv1D)

In [None]:
# Experiment notes for kernel sizes [3, 6, 12]:
# val_loss: 0.5843 (filter: 64, embedding_size: 64)
# val_loss: 0.5753 (filter: 128, embedding_size: 64)

# val_loss: 0.5721 (filter: 128, embedding_size: 32)
# val_loss: 0.6686 (filter: 128, embedding_size: 32), presumably without the NLTK step

# val_loss: 0.6149  (filter: 128, embedding_size: 16), had not converged by the last epoch
# val_loss: 0.5974  (filter: 256, embedding_size: 32)

# Kernel sizes [3, 6, 9]: no result recorded.

# Hyper-parameters read by the model-building code later in this cell.
embedding_size = 32
filter = 128

def multi_kernel(filter_size, input_layer, kernel_sizes=(3, 6, 12)):
    """Apply parallel Conv1D branches with different kernel widths to one input.

    Args:
        filter_size: Number of filters used by every Conv1D branch.
        input_layer: Keras tensor fed to each branch.
        kernel_sizes: Kernel width of each branch. Defaults to the previously
            hard-coded (3, 6, 12), so existing two-argument calls behave as
            before.

    Returns:
        A list holding one GlobalMaxPooling1D output tensor per kernel width,
        in the order given by `kernel_sizes`.
    """
    conv_blocks = []
    for ks in kernel_sizes:
        conv = keras.layers.Conv1D(filter_size, ks, padding="valid", activation="relu")(input_layer)
        conv_blocks.append(keras.layers.GlobalMaxPooling1D()(conv))
    return conv_blocks

# Model 3: embedding -> dropout -> parallel Conv1D branches (multi_kernel),
# concatenated and classified by a dense head.
x_input = keras.Input((max_len,))
embedding = keras.layers.Embedding(vocab_size, embedding_size)(x_input)
do1 = keras.layers.Dropout(.3)(embedding)
convs = multi_kernel(filter, do1)

concatenate = keras.layers.Concatenate()(convs)
do2 = keras.layers.Dropout(.3)(concatenate)
d1 = keras.layers.Dense(filter, activation='relu')(do2)
do3 = keras.layers.Dropout(.3)(d1)
# Softmax (not sigmoid): the five classes are mutually exclusive and
# sparse_categorical_crossentropy expects each row to be a probability
# distribution; sigmoid outputs do not sum to 1.
# NOTE(review): the val_loss figures noted at the top of this cell were
# presumably recorded with the earlier sigmoid head.
output = keras.layers.Dense(5, activation='softmax')(do3)

model = keras.models.Model(x_input, output)

model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['acc'])
model.summary()
# keras.utils.plot_model(model)

Model: "functional_29"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_35 (Embedding)        (None, 300, 32)      1280000     input_19[0][0]                   
__________________________________________________________________________________________________
dropout_47 (Dropout)            (None, 300, 32)      0           embedding_35[0][0]               
__________________________________________________________________________________________________
conv1d_81 (Conv1D)              (None, 298, 128)     12416       dropout_47[0][0]                 
______________________________________________________________________________________

# 모델 4 (Multi Conv1D stack)

In [None]:
# Experiment notes for kernel sizes [3, 6, 12]:
# NOTE(review): these figures are identical to the notes in the Model 3 cell —
# confirm they were actually measured for this stacked variant.
# val_loss: 0.5843 (filter: 64, embedding_size: 64)
# val_loss: 0.5753 (filter: 128, embedding_size: 64)

# val_loss: 0.5721 (filter: 128, embedding_size: 32)
# val_loss: 0.6686 (filter: 128, embedding_size: 32), presumably without the NLTK step

# val_loss: 0.6149  (filter: 128, embedding_size: 16), had not converged by the last epoch
# val_loss: 0.5974  (filter: 256, embedding_size: 32)

# Kernel sizes [3, 6, 9]: no result recorded.

# Hyper-parameters read by the model-building code later in this cell.
embedding_size = 64
filter = 128

def multi_kernel(filter_size, input_layer, kernel_sizes=(3, 6, 12)):
    """Apply parallel two-layer Conv1D stacks with different kernel widths.

    Each branch is Conv1D -> MaxPool1D -> Conv1D -> GlobalMaxPool1D, with
    "same" padding on both convolutions.

    NOTE: this shares its name with the single-convolution helper defined in
    the Model 3 cell, so whichever of the two cells ran last decides which
    version is in effect.

    Args:
        filter_size: Number of filters used by every Conv1D layer.
        input_layer: Keras tensor fed to each branch.
        kernel_sizes: Kernel width of each branch. Defaults to the previously
            hard-coded (3, 6, 12), so existing two-argument calls behave as
            before.

    Returns:
        A list holding one GlobalMaxPool1D output tensor per kernel width,
        in the order given by `kernel_sizes`.
    """
    conv_blocks = []
    for ks in kernel_sizes:
        conv1 = keras.layers.Conv1D(filter_size, ks, padding="same", activation="relu")(input_layer)
        pool1 = keras.layers.MaxPool1D()(conv1)
        conv2 = keras.layers.Conv1D(filter_size, ks, padding="same", activation="relu")(pool1)
        conv_blocks.append(keras.layers.GlobalMaxPool1D()(conv2))
    return conv_blocks

# Model 4: embedding -> parallel stacked-Conv1D branches (multi_kernel),
# concatenated, regularised with dropout and classified by a dense head.
x_input = keras.Input((max_len,))
embedding = keras.layers.Embedding(
    vocab_size,
    embedding_size,
    trainable=True,
)(x_input)
convs = multi_kernel(filter, embedding)

concatenate = keras.layers.Concatenate()(convs)
do2 = keras.layers.Dropout(.3)(concatenate)
d1 = keras.layers.Dense(64, activation='relu')(do2)
output = keras.layers.Dense(5, activation='softmax')(d1)

model = keras.models.Model(x_input, output)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=['acc'],
)
model.summary()
# keras.utils.plot_model(model)

Model: "functional_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding_37 (Embedding)        (None, 300, 64)      2560000     input_21[0][0]                   
__________________________________________________________________________________________________
conv1d_90 (Conv1D)              (None, 300, 128)     24704       embedding_37[0][0]               
__________________________________________________________________________________________________
conv1d_92 (Conv1D)              (None, 300, 128)     49280       embedding_37[0][0]               
______________________________________________________________________________________

# 모델 5 (BERT 임베딩)

In [None]:
# Install and load what the TF Hub BERT layers below need.
# NOTE(review): the install is unpinned, so the version pulled in may change
# between runs.
!pip install tensorflow_text
# Never referenced by name below — presumably imported for its side effect of
# registering the custom ops used by the BERT preprocessing model; confirm.
import tensorflow_text
import tensorflow_hub as hub

# Preprocessing layer: raw strings -> BERT encoder inputs.
bert_preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1")
# Small uncased BERT encoder (L-2, H-128, A-2 per the model handle), kept
# frozen via trainable=False.
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1",trainable=False)



In [None]:
# Reload the raw training text: the BERT preprocessor does its own
# tokenisation, so the alpha_num / POS-tagging pipeline is not applied here.
df = pd.read_csv('train.csv')
x_train = df['text'].to_numpy()
y_train = df['author'].to_numpy()

# Frozen feature extractor: string -> BERT preprocessing -> BERT encoder.
text_input = keras.layers.Input(shape=(), dtype=tf.string)
encoder_inputs = bert_preprocessor(text_input)
# Take the pooled sentence vector ([batch_size, 128] for this H-128 encoder).
# The classifier that consumes these features is built on Input((128,)), so
# the 3-D "sequence_output" ([batch_size, seq_length, 128]) did not fit it.
bert_output = bert_encoder(encoder_inputs)["pooled_output"]

bert_transformation = keras.models.Model(text_input, bert_output)
# Pre-compute the features once; x_train now holds BERT vectors, not text.
x_train = bert_transformation.predict(x_train, batch_size=256, verbose=1)



In [None]:
# Width of the pre-computed BERT feature vectors (H-128 encoder). Kept under
# its own name: reusing `max_len` here overwrote the padding length (300) that
# the token-based models and the test-set cell rely on.
bert_hidden_size = 128

# Model 5: small dense classifier on top of the frozen BERT features.
x_input = keras.Input((bert_hidden_size,))
d1 = keras.layers.Dense(bert_hidden_size // 2, activation='relu')(x_input)
do1 = keras.layers.Dropout(.3)(d1)
d2 = keras.layers.Dense(64, activation='relu')(do1)
output = keras.layers.Dense(5, activation='softmax')(d2)

model = keras.models.Model(x_input, output)
model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['acc'])
model.summary()

# BERT encoder outputs, for reference (H = 128 for the encoder loaded above;
# the 768 quoted in the TF Hub example applies to BERT-base):
# pooled_output = outputs["pooled_output"]      # [batch_size, H]
# sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, H]

Model: "functional_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 325       
Total params: 12,741
Trainable params: 12,741
Non-trainable params: 0
_________________________________________________________________


# 학습

In [None]:
# Train whichever `model` was built last. Stop once val_loss has not improved
# for 5 epochs and roll back to the best epoch's weights; without
# restore_best_weights the model would keep the weights from 5 epochs after
# the best one and use those for prediction.
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
hist = model.fit(x_train, y_train, batch_size=64, epochs=40, validation_split=0.3, shuffle=True, verbose=1, callbacks=[es])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

KeyboardInterrupt: ignored

In [None]:
# Load the test set and apply the same text normalisation as for training.
# (Named test_df: it was previously stored in a variable called train_df.)
test_df = pd.read_csv('test_x.csv')
test_df['text'] = test_df['text'].str.lower().apply(alpha_num).apply(tokenize)

# Reuse the tokenizer fitted on the training text so word indices match.
# NOTE(review): max_len must equal the sequence length the current model was
# built with; this cell applies to the token-based models only.
sequences = tokenizer.texts_to_sequences(test_df['text'])
x_test = pad_sequences(sequences, maxlen=max_len)

# Per-class scores, one row per test sample.
res = model.predict(x_test)

In [None]:
# Fill the submission template's class columns with the predictions and save.
sample_submission = pd.read_csv('sample_submission.csv', encoding='utf-8')
sample_submission[['0', '1', '2', '3', '4']] = res
sample_submission.to_csv(
    'writer_submission.csv',
    index=False,
    encoding='utf-8',
)