In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [3]:
# 경로 설정
import os
os.chdir('/content/drive/My Drive')

In [4]:
# 파일 불러오기
train = pd.read_csv('open/train.csv', encoding = 'utf-8')
test = pd.read_csv('open/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('open/sample_submission.csv', encoding = 'utf-8')

In [5]:
# train 데이터 살펴보기
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [6]:
# test 데이터 살펴보기
test

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


In [7]:
# sample_submission
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


# **전처리**

In [8]:
# 부호를 제거해주는 함수
def alpha_num(text) :
  return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text'] = train['text'].apply(alpha_num)

In [9]:
# 부호가 사라진 것을 확인할 수 있습니다.
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


In [10]:
# 이상치 제거
train = train[train.text != "     "]
train = train.reset_index(drop=True)
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54831,54874,Is that you Mr Smith odin whispered I hardly d...,2
54832,54875,I told my plan to the captain and between us w...,4
54833,54876,Your sincere wellwisher friend and sister LUC...,1
54834,54877,Then you wanted me to lend you money,3


In [11]:
import nltk
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words('english')
stopwords.append('odin')
stopwords.extend(['could', 'ought', 'would'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
# stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
#              "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
#              "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
#              "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
#              "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
#              "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
#              "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
#              "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
#              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
#              "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
#              "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [13]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [14]:
train

Unnamed: 0,index,text,author
0,0,almost choking much much wanted say strange ex...,3
1,1,sister asked suppose,2
2,2,engaged one day walked perusing janes last let...,1
3,3,captain porch keeping carefully way treacherou...,4
4,4,mercy gentlemen flung hands dont write anyway ...,3
...,...,...,...
54831,54874,mr smith whispered hardly dared hope come,2
54832,54875,told plan captain us settled details accomplis...,4
54833,54876,sincere wellwisher friend sister lucy,1
54834,54877,wanted lend money,3


In [15]:
# train test 분리
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])

In [16]:
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane written spirits instead surprised mr saw looking meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy', 'wanted lend money',
       'certainly occurred said yes like'], dtype='<U1345')

# **모델링**

In [17]:
# 파라미터 설정
vocab_size = 20000
embedding_dim = 16
max_length = 500
padding_type = 'post'
# oov_tok = "<OOV>"

In [18]:
# tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size) # , oov_token = oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [19]:
# 데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding = padding_type, maxlen = max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding = padding_type, maxlen = max_length)

In [20]:
# 가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dense(5, activation = 'softmax')
])

In [21]:
# compile model
model.compile(loss = 'sparse_categorical_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

# model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 320,533
Trainable params: 320,533
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
# fit model
num_epochs = 20
history = model.fit(train_padded, y_train,
                    epochs = num_epochs, verbose = 2,
                    validation_split = 0.2)

Epoch 1/20
1371/1371 - 7s - loss: 1.5702 - accuracy: 0.2747 - val_loss: 1.5661 - val_accuracy: 0.2683
Epoch 2/20
1371/1371 - 7s - loss: 1.5347 - accuracy: 0.3019 - val_loss: 1.4552 - val_accuracy: 0.3553
Epoch 3/20
1371/1371 - 7s - loss: 1.3115 - accuracy: 0.4680 - val_loss: 1.2312 - val_accuracy: 0.5001
Epoch 4/20
1371/1371 - 7s - loss: 1.1591 - accuracy: 0.5227 - val_loss: 1.1632 - val_accuracy: 0.5218
Epoch 5/20
1371/1371 - 8s - loss: 1.0872 - accuracy: 0.5515 - val_loss: 1.1017 - val_accuracy: 0.5465
Epoch 6/20
1371/1371 - 8s - loss: 1.0297 - accuracy: 0.5790 - val_loss: 1.0721 - val_accuracy: 0.5641
Epoch 7/20
1371/1371 - 8s - loss: 0.9728 - accuracy: 0.6070 - val_loss: 1.0201 - val_accuracy: 0.5805
Epoch 8/20
1371/1371 - 7s - loss: 0.9053 - accuracy: 0.6486 - val_loss: 0.9921 - val_accuracy: 0.6016
Epoch 9/20
1371/1371 - 7s - loss: 0.8428 - accuracy: 0.6787 - val_loss: 0.9182 - val_accuracy: 0.6524
Epoch 10/20
1371/1371 - 7s - loss: 0.7876 - accuracy: 0.7032 - val_loss: 0.8964 - 

In [23]:
# predict values
pred = model.predict_proba(test_padded)

Instructions for updating:
Please use `model.predict()` instead.


In [24]:
pred

array([[2.2220856e-05, 6.0771018e-01, 9.7821623e-02, 2.9295623e-01,
        1.4896442e-03],
       [2.1921597e-01, 4.3326786e-01, 6.7787521e-02, 3.2421380e-02,
        2.4730729e-01],
       [9.9450386e-01, 2.5473126e-05, 1.5764406e-06, 3.1771048e-08,
        5.4690801e-03],
       ...,
       [2.0791531e-04, 9.9768472e-01, 5.9560575e-06, 5.7958068e-06,
        2.0955598e-03],
       [1.7468612e-04, 9.9726897e-01, 1.2631829e-05, 8.4507219e-06,
        2.5353108e-03],
       [9.9406999e-01, 5.0172061e-06, 1.3873166e-05, 1.9133158e-07,
        5.9108799e-03]], dtype=float32)

In [25]:
# submission
sample_submission[['0', '1', '2', '3', '4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,2.222086e-05,6.077102e-01,9.782162e-02,2.929562e-01,0.001490
1,1,2.192160e-01,4.332679e-01,6.778752e-02,3.242138e-02,0.247307
2,2,9.945039e-01,2.547313e-05,1.576441e-06,3.177105e-08,0.005469
3,3,4.613730e-06,5.665078e-10,9.915523e-01,6.881439e-11,0.008443
4,4,9.541207e-01,1.803538e-03,1.333417e-03,3.713495e-02,0.005607
...,...,...,...,...,...,...
19612,19612,4.612654e-08,9.999912e-01,4.384508e-12,3.247740e-13,0.000009
19613,19613,7.037356e-03,4.380584e-05,6.931027e-05,3.614834e-13,0.992849
19614,19614,2.079153e-04,9.976847e-01,5.956058e-06,5.795807e-06,0.002096
19615,19615,1.746861e-04,9.972690e-01,1.263183e-05,8.450722e-06,0.002535


In [26]:
sample_submission.to_csv('submission_stopwords_stemmer_update.csv', index = False, encoding = 'utf-8')