In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [3]:
# 경로 설정
import os
os.chdir('/content/drive/My Drive')

In [125]:
# 파일 불러오기
train = pd.read_csv('open/train.csv', encoding = 'utf-8')
test = pd.read_csv('open/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('open/sample_submission.csv', encoding = 'utf-8')

In [126]:
# train 데이터 살펴보기
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [127]:
# test 데이터 살펴보기
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [128]:
# sample_submission
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


# **전처리**

In [129]:
# 부호를 제거해주는 함수
def alpha_num(text) :
  return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text'] = train['text'].apply(alpha_num)

In [130]:
# 부호가 사라진 것을 확인할 수 있습니다.
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


In [131]:
# 이상치 제거
train = train[train.text != "     "]
train = train.reset_index(drop=True)
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54831,54874,Is that you Mr Smith odin whispered I hardly d...,2
54832,54875,I told my plan to the captain and between us w...,4
54833,54876,Your sincere wellwisher friend and sister LUC...,1
54834,54877,Then you wanted me to lend you money,3


In [132]:
import nltk
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
stopwords.append('odin')
stopwords.extend(['could', 'ought', 'would'])
# stopwords.extend(['mr', 'mrs'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [133]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
# stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
#              "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
#              "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
#              "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
#              "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
#              "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
#              "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
#              "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
#              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
#              "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
#              "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [134]:
#from nltk.stem.porter import PorterStemmer
#stemmer = PorterStemmer()

# from nltk.stem.snowball import SnowballStemmer
# stemmer = SnowballStemmer('english')

In [135]:
# # 어간추출
# def stemming(text):
#     final_text = []
#     for i in text.split():
#       final_text.append(stemmer.stem(i))
#     return " ".join(final_text)

In [136]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [137]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen flung hands dont write anyway ...,3
...,...,...,...
54831,54874,mr smith whispered hardly dared hope come,2
54832,54875,told plan captain us settled details accomplis...,4
54833,54876,sincere wellwisher friend sister lucy,1
54834,54877,wanted lend money,3


In [138]:
# train test 분리
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])

In [139]:
from sklearn.model_selection import train_test_split

In [140]:
# 자체 테스트를 위한 data split
xtrain, xtest, ytrain, ytest = train_test_split(X_train, y_train, test_size = 0.3, random_state = 2020)

In [141]:
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane written spirits instead surprised mr saw looking meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy', 'wanted lend money',
       'certainly occurred said yes like'], dtype='<U1345')

# **모델링**

In [142]:
# 파라미터 설정
vocab_size = 20000
embedding_dim = 16
max_length = 500
padding_type = 'post'
# oov_tok = "<OOV>"

In [143]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [144]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [145]:
train_token = [word_tokenize(sentence) for sentence in train['text']]
test_token = [word_tokenize(sentence) for sentence in test['text']]
all_token = train_token + test_token

In [146]:
from gensim.models import Word2Vec

In [147]:
model_all = Word2Vec(sentences = all_token, size = 10, window = 3, min_count = 1, sg = 1, workers = 4, iter = 20)

In [148]:
for i in range(len(all_token)) :
  for j in range(len(all_token[i])) :
    all_token[i][j] = model_all.wv.get_vector(all_token[i][j])

In [149]:
train_sequences = all_token[:len(train_token)]
train_padded = pad_sequences(train_sequences, padding = padding_type, maxlen = max_length)
test_sequences = all_token[len(train_token):]
test_padded = pad_sequences(test_sequences, padding = padding_type, maxlen = max_length)

In [150]:
train_padded.shape

(54836, 500, 10)

In [166]:
# 가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape = (500, 10)),
    # tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(5, activation = 'softmax')
])

In [167]:
# compile model
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# model summary
print(model.summary())

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
global_average_pooling1d_12  (None, 10)                0         
_________________________________________________________________
dense_39 (Dense)             (None, 24)                264       
_________________________________________________________________
dropout_4 (Dropout)          (None, 24)                0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 24)                96        
_________________________________________________________________
dense_40 (Dense)             (None, 24)                600       
_________________________________________________________________
dropout_5 (Dropout)          (None, 24)                0         
_________________________________________________________________
batch_normalization_5 (Batch (None, 24)              

In [168]:
# fit model
num_epochs = 30
history = model.fit(train_padded, y_train,
                    epochs = num_epochs, verbose = 2,
                    validation_split = 0.2)

# 문장 스코어?

Epoch 1/30
1371/1371 - 4s - loss: 1.4918 - accuracy: 0.3485 - val_loss: 1.3645 - val_accuracy: 0.4414
Epoch 2/30
1371/1371 - 3s - loss: 1.3862 - accuracy: 0.4058 - val_loss: 1.3669 - val_accuracy: 0.4055
Epoch 3/30
1371/1371 - 3s - loss: 1.3721 - accuracy: 0.4128 - val_loss: 1.3721 - val_accuracy: 0.4275
Epoch 4/30
1371/1371 - 3s - loss: 1.3621 - accuracy: 0.4206 - val_loss: 1.3069 - val_accuracy: 0.4672
Epoch 5/30
1371/1371 - 3s - loss: 1.3557 - accuracy: 0.4250 - val_loss: 1.3382 - val_accuracy: 0.4430
Epoch 6/30
1371/1371 - 3s - loss: 1.3519 - accuracy: 0.4283 - val_loss: 1.3352 - val_accuracy: 0.4350
Epoch 7/30
1371/1371 - 3s - loss: 1.3476 - accuracy: 0.4292 - val_loss: 1.3256 - val_accuracy: 0.4405
Epoch 8/30
1371/1371 - 3s - loss: 1.3435 - accuracy: 0.4344 - val_loss: 1.3426 - val_accuracy: 0.4219
Epoch 9/30
1371/1371 - 3s - loss: 1.3441 - accuracy: 0.4348 - val_loss: 1.4326 - val_accuracy: 0.3734
Epoch 10/30
1371/1371 - 3s - loss: 1.3421 - accuracy: 0.4351 - val_loss: 1.3146 - 

In [169]:
# predict values
pred = model.predict_proba(test_padded)

In [170]:
# submission
sample_submission[['0', '1', '2', '3', '4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.266026,0.168320,0.151554,0.270682,0.143418
1,1,0.287524,0.217553,0.176764,0.150452,0.167708
2,2,0.271201,0.086914,0.182127,0.309151,0.150608
3,3,0.159232,0.030776,0.413015,0.019635,0.377343
4,4,0.319952,0.079516,0.294226,0.038546,0.267759
...,...,...,...,...,...,...
19612,19612,0.085666,0.878618,0.019192,0.004126,0.012399
19613,19613,0.298575,0.170154,0.143202,0.223720,0.164350
19614,19614,0.286311,0.458836,0.106789,0.034113,0.113950
19615,19615,0.146767,0.784642,0.027237,0.015757,0.025597


In [171]:
# sample_submission.to_csv('submission_w2v.csv', index = False, encoding = 'utf-8')

# **split을 통해 얻은 test set으로 model 테스트 해보기**

In [172]:
xtrain_token = [word_tokenize(sentence) for sentence in xtrain]
xtest_token = [word_tokenize(sentence) for sentence in xtest]

In [173]:
for i in range(len(xtrain_token)) :
  for j in range(len(xtrain_token[i])) :
    xtrain_token[i][j] = model_all.wv.get_vector(xtrain_token[i][j])

for i in range(len(xtest_token)) :
  for j in range(len(xtest_token[i])) :
    xtest_token[i][j] = model_all.wv.get_vector(xtest_token[i][j])

In [174]:
xtrain_sequences = xtrain_token
xtrain_padded = pad_sequences(xtrain_sequences, padding = padding_type, maxlen = max_length)
xtest_sequences = xtest_token
xtest_padded = pad_sequences(xtest_sequences, padding = padding_type, maxlen = max_length)

In [181]:
# 가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape = (500, 10)),
    # tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(5, activation = 'softmax')
])

In [182]:
# compile model
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# model summary
print(model.summary())

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
global_average_pooling1d_13  (None, 10)                0         
_________________________________________________________________
dense_42 (Dense)             (None, 24)                264       
_________________________________________________________________
dropout_6 (Dropout)          (None, 24)                0         
_________________________________________________________________
batch_normalization_6 (Batch (None, 24)                96        
_________________________________________________________________
dense_43 (Dense)             (None, 24)                600       
_________________________________________________________________
dropout_7 (Dropout)          (None, 24)                0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 24)              

In [183]:
# fit model
num_epochs = 30
history = model.fit(xtrain_padded, ytrain,
                    epochs = num_epochs, verbose = 2,
                    validation_split = 0.2)

Epoch 1/30
960/960 - 2s - loss: 1.5186 - accuracy: 0.3403 - val_loss: 1.4247 - val_accuracy: 0.4119
Epoch 2/30
960/960 - 2s - loss: 1.4057 - accuracy: 0.4053 - val_loss: 1.4104 - val_accuracy: 0.3903
Epoch 3/30
960/960 - 2s - loss: 1.3899 - accuracy: 0.4153 - val_loss: 1.3499 - val_accuracy: 0.4327
Epoch 4/30
960/960 - 2s - loss: 1.3808 - accuracy: 0.4195 - val_loss: 1.3642 - val_accuracy: 0.4196
Epoch 5/30
960/960 - 2s - loss: 1.3773 - accuracy: 0.4213 - val_loss: 1.3462 - val_accuracy: 0.4289
Epoch 6/30
960/960 - 2s - loss: 1.3697 - accuracy: 0.4212 - val_loss: 1.3287 - val_accuracy: 0.4370
Epoch 7/30
960/960 - 2s - loss: 1.3651 - accuracy: 0.4279 - val_loss: 1.3429 - val_accuracy: 0.4395
Epoch 8/30
960/960 - 2s - loss: 1.3660 - accuracy: 0.4267 - val_loss: 1.3441 - val_accuracy: 0.4489
Epoch 9/30
960/960 - 2s - loss: 1.3629 - accuracy: 0.4261 - val_loss: 1.3369 - val_accuracy: 0.4657
Epoch 10/30
960/960 - 2s - loss: 1.3607 - accuracy: 0.4309 - val_loss: 1.3685 - val_accuracy: 0.4331

In [184]:
pred_test = model.predict_proba(xtest_padded)

In [185]:
sample_submission_test = pd.read_csv('open/sample_submission.csv', encoding = 'utf-8')

In [186]:
sample_submission_test = sample_submission_test[:len(xtest)]

In [187]:
sample_submission_test[['0', '1', '2', '3', '4']] = pred_test
sample_submission_test

Unnamed: 0,index,0,1,2,3,4
0,0,0.269036,0.090928,0.240201,0.233277,0.166558
1,1,0.313305,0.043237,0.248182,0.093843,0.301433
2,2,0.221893,0.076630,0.270640,0.303202,0.127635
3,3,0.232975,0.028961,0.153958,0.488502,0.095604
4,4,0.336409,0.048874,0.213856,0.177565,0.223295
...,...,...,...,...,...,...
16446,16446,0.182664,0.294499,0.095062,0.391405,0.036370
16447,16447,0.283584,0.074862,0.245571,0.185974,0.210009
16448,16448,0.306565,0.063852,0.256476,0.072327,0.300780
16449,16449,0.320957,0.086771,0.199028,0.208884,0.184360


In [188]:
# Logloss 함수
def multiclass_logloss(actual, predicted, eps=1e-15):

    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [191]:
multiclass_logloss(ytest, sample_submission_test, eps=1e-15).mean()

0.2593186411436265