In [5]:
# 01. 라이브러리 불러오기
import os, json, glob, sys, numpy as np
import pandas as pd
import re
import nltk 
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Flatten, Dropout, Input, Conv1D, MaxPooling1D, Bidirectional, GlobalMaxPool1D
from keras.utils import np_utils
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.client import device_lib
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf

config = tf.compat.v1.ConfigProto() # 버전 낮추기
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [16]:
def load_data():
  train = pd.read_csv("data/train_final.csv")
  eval = pd.read_csv("data/eval_final_open.csv")
  train_X = train['Sentence'] # 본문 내용을 기준으로 분류하기
  train_Y = train['Category']
  eval_X = eval['Sentence']
  return train_X, train_Y, eval_X

In [7]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/docando/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/docando/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocessing(documents):
  ## 01. 영어, 숫자만 남기기
  documents = documents.str.replace("[^a-zA-Z0-9 ]","") 

  ## 02. 소문자화, 불용어 제거, 토큰화
  clean_documents = []
  for sentence in documents:
    # sentence = sentence.lower() # 소문자화
    tokens = word_tokenize(sentence) # 토큰화
    tokens = [word for word in tokens if word not in stopwords.words('english')] # 불용어 제거
    clean_texts = " ".join(tokens)
    clean_texts = re.sub('[?.,;:|\)*~`’!^\-_+<>@\#$%&-=#}※]', '', clean_texts) # 특수문자, 임티 제거
    clean_texts = re.sub(' +', ' ', clean_texts) # 다중 공백 제거
    clean_documents.append(clean_texts)
  return clean_documents 

In [9]:
def labeling(train_y):
  datasets = np.array(train_y).tolist()
  
  # 데이터 라벨링
  le = LabelEncoder()
  le.fit(datasets)
  print(le.classes_) # 확인해보기

  nb_classes = len(le.classes_)
  print('class 개수 : ', nb_classes)
  transform_data = le.transform(datasets)

  # 라벨링한 y 데이터를 원 핫 벡터로 바꿈
  transform_data = np_utils.to_categorical(transform_data, nb_classes)
  print(transform_data)

  # 다시 학습, 테스트 데이터로 분할
  y_train = transform_data

  return y_train, nb_classes

In [10]:
def Tokenization(max_word, max_len, train_x, test_x):
  div = len(train_x) # 분할 지점 저장
  dataset = train_x + test_x # 학습, 테스트 데이터 합치기
  print('총 데이터셋 크기 : ', len(dataset))

  tok = Tokenizer(num_words = max_word) # 단어 개수를 max_word 로 설정
  tok.fit_on_texts(dataset)

  sequences = tok.texts_to_sequences(dataset) # 문자를 시퀀스로 바꿈
  print(len(sequences[0]))
  print(sequences[0])

  sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len) # 길이를 max_len 으로 맞춤
  print(sequences_matrix)
  print(sequences_matrix[0])
  print(len(sequences_matrix[0]))
  print('총 단어 수 : ', len(tok.word_index))

  train_result = sequences_matrix[:div]
  test_result = sequences_matrix[div:]

  return train_result, test_result

In [17]:
# LSTM 모델
def LSTM_model_making(max_word, max_len, nb_classes):
  
  # with K.tf.device('/device:GPU:0'):
  with tf.device('/device:GPU:0'):
      model = Sequential()
      
      model.add(Embedding(max_word, 64, input_length=max_len))
      model.add(LSTM(60, return_sequences=True))
      model.add(GlobalMaxPool1D())
      model.add(Dropout(0.2))
      model.add(Dense(50, activation='relu'))
      model.add(Dropout(0.5))
      model.add(Dense(nb_classes, activation='softmax'))
      model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
      model_dir = './model'
      if not os.path.exists(model_dir):
          os.mkdir(model_dir)
      model_path = model_dir + "/predict_category_LSTM.model"
      checkpoint = ModelCheckpoint(filepath=model_path, monitor="val_loss", verbose=1, save_best_only=True)

      early_stopping = EarlyStopping(monitor='val_loss', patience=7)

  model.summary()
  return model, checkpoint, early_stopping

In [12]:
def CNN(max_word):
  embedding_dim = 256
  batch_size = 256

  model = Sequential()
  model.add(Embedding(max_word, 256))
  model.add(Dropout(0.3))
  model.add(Conv1D(256, 3, padding='valid', activation='relu'))
  model.add(GlobalMaxPooling1D())
  model.add(Dense(128, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  early_stopping = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
  checkpoint = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)

  model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['acc'])
  history = model.fit(X_train, y_train, epochs = 20, validation_data = (X_test, y_test), callbacks=[es, mc])

  return model


In [13]:
def model_training(X_train, y_train, model, checkpoint, early_stopping, batch_size, epoch):
  hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=epoch, validation_split=0.2, callbacks=[checkpoint, early_stopping])

In [14]:
# from sklearn import metrics

# def evaluate(test_x, test_y, model, target_names):
#   predictions = model.predict(test_x)
#   y_predict = predictions.argmax(axis=-1)
#   print(metrics.classification_report(test_y, y_predict, target_names = target_names))

## 위에서 정의한 함수들 실행하기

In [19]:
# 데이터 로드
train_x, train_y, test_x = load_data()

# 데이터 전처리
train_x, test_x = preprocessing(train_x), preprocessing(test_x)

In [20]:
# 임시 저장

pd.DataFrame(train_x).to_csv('data/train_x.csv')
pd.DataFrame(test_x).to_csv('data/test_x.csv')

In [21]:
# 임시 불러오기
train_x, test_x = pd.read_csv('data/train_x.csv'), pd.read_csv('data/test_x.csv')

In [22]:
train_y

0        3
1        2
2        4
3        1
4        2
        ..
11539    3
11540    1
11541    4
11542    4
11543    4
Name: Category, Length: 11544, dtype: int64

In [23]:
train_x = train_x['0']
test_x = test_x['0']
print(train_x)
print(test_x)

0        LRB The film RRB tackles topic relationships s...
1                        Lavishly exhilaratingly tasteless
2                                It also beautifully acted
3                         But like Silence movie gets skin
4        It made innocent yet fervid conviction Hollywo...
                               ...                        
11539    Although Frailty fits classic genre script exe...
11540                          Mediocre fable Burkina Faso
11541    Like great films life never knew existed offer...
11542    Those nt put film austerity find capable rewar...
11543    An ambitious movie like Shiner organizing big ...
Name: 0, Length: 11544, dtype: object
0       Altogether successful film time touching recon...
1       Not cheap slasher flick subject matter would s...
2       plays like somebody spliced random moments Chr...
3       While Resident Evil games may set new standard...
4            Too lazy take advantage semihumorous premise
                       

In [24]:
train_x = train_x.to_list()
test_x = test_x.to_list()

In [25]:
print(train_x)
print(test_x)



In [26]:
# 카테고리 라벨링
y_train, nb_classes = labeling(train_y)

[0 1 2 3 4]
class 개수 :  5
[[0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [27]:
# 토큰화

max_word = 6000
max_len = 400
x_train, x_test = Tokenization(max_word, max_len, train_x, test_x)

총 데이터셋 크기 :  15855
19
[12, 1, 2, 11, 2920, 2251, 669, 2080, 563, 446, 942, 116, 370, 335, 2, 1020, 371, 1021, 2081]
[[   0    0    0 ...  371 1021 2081]
 [   0    0    0 ...    0    0 5581]
 [   0    0    0 ...   69  285  481]
 ...
 [   0    0    0 ...  142  276   24]
 [   0    0    0 ...   20 2078 1737]
 [   0    0    0 ... 4131 4104 1438]]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  

In [28]:
# lstm 모델 만들고 학습시키기
batch_size = 128
epoch = 20

model, checkpoint, early_stopping = LSTM_model_making(max_word, max_len, nb_classes)
model_training(x_train, y_train, model, checkpoint, early_stopping, batch_size, epoch)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 64)           384000    
_________________________________________________________________
lstm (LSTM)                  (None, 400, 60)           30000     
_________________________________________________________________
global_max_pooling1d (Global (None, 60)                0         
_________________________________________________________________
dropout (Dropout)            (None, 60)                0         
_________________________________________________________________
dense (Dense)                (None, 50)                3050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2



INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


Epoch 2/20

Epoch 00002: val_loss improved from 1.57121 to 1.47737, saving model to ./model/predict_category_LSTM.model




INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


Epoch 3/20

Epoch 00003: val_loss improved from 1.47737 to 1.30098, saving model to ./model/predict_category_LSTM.model




INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


Epoch 4/20

Epoch 00004: val_loss improved from 1.30098 to 1.22678, saving model to ./model/predict_category_LSTM.model




INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


INFO:tensorflow:Assets written to: ./model/predict_category_LSTM.model/assets


Epoch 5/20

Epoch 00005: val_loss did not improve from 1.22678
Epoch 6/20

Epoch 00006: val_loss did not improve from 1.22678
Epoch 7/20

Epoch 00007: val_loss did not improve from 1.22678
Epoch 8/20

Epoch 00008: val_loss did not improve from 1.22678
Epoch 9/20

Epoch 00009: val_loss did not improve from 1.22678
Epoch 10/20

Epoch 00010: val_loss did not improve from 1.22678
Epoch 11/20

Epoch 00011: val_loss did not improve from 1.22678


In [29]:
predictions = model.predict(x_test)
y_predict = predictions.argmax(axis=-1)
print(y_predict)

[3 1 1 ... 2 2 2]


In [30]:
y_predict

array([3, 1, 1, ..., 2, 2, 2])

In [31]:
# 예측 결과 저장

pd.DataFrame(y_predict).to_csv('data/predict_y.csv')