목표 : 청와대 국민 청원 글 카테고리 분류

참고 : https://dacon.io/competitions/open/235597/codeshare/1803?page=1&dtype=recent


## 1. 필수 라이브러리

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
tf.random.set_seed(777)

import os
import re
from tqdm.auto import tqdm
tqdm.pandas()

## 2. Data 불러오기

In [187]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [188]:
!ls gdrive/MyDrive

 노래   3학년   colab  'Colab Notebooks'   dataset  'Untitled Diagram.drawio'


In [83]:
# Dataset directory on the mounted Google Drive (relative path, resolved against the notebook's working directory).
path = "gdrive/My Drive/dataset/bluehouse"

In [84]:
# Keep only the category and data columns (the first CSV column is dropped),
# then discard every row that contains a missing value.
train_data = (
    pd.read_csv(os.path.join(path, "train.csv"))
    .iloc[:, 1:]
    .dropna(how='any')
)

In [85]:
# Dropping rows may have left gaps in the index, so renumber it from 0.
train_data = train_data.reset_index(drop=True)

In [23]:
# Load the test set, dropping the first CSV column. Unlike train_data, rows with missing values are NOT removed here.
test_data = pd.read_csv(os.path.join(path, "test.csv")).iloc[:, 1:]

In [25]:
train_data.shape, test_data.shape

((39992, 2), (5000, 1))

## 3. 전처리 

In [107]:
# Patterns are compiled once at definition time instead of on every call.
_PUNCT_RE = re.compile(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]')
_DIGITS_RE = re.compile(r'\d+')
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_SPECIAL_RE = re.compile('[-=+,#:;//●<>▲\\?:^$.☆!★()Ⅰ@*\"※~>`\'…》]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize one raw petition text for tokenization.

    Steps, in order: replace literal ``\\n`` sequences with a space, delete
    punctuation and digits, lower-case, delete HTML tags, replace the
    remaining special symbols with a space, and finally collapse whitespace
    runs and strip both ends.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN (a missing pandas cell) yield ``''``.

    Returns:
        The cleaned text, with single spaces between words and no leading or
        trailing whitespace.
    """
    # Missing cells arrive as None / NaN; NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).replace('\\n', ' ')  # literal backslash-n pairs in the data
    text = _PUNCT_RE.sub('', text)        # remove punctuation
    text = _DIGITS_RE.sub('', text)       # remove numbers
    text = text.lower()
    text = _HTML_TAG_RE.sub('', text)     # remove HTML tags
    text = _SPECIAL_RE.sub(' ', text)     # remaining special symbols -> space

    # Whitespace is normalized last: the substitution above inserts spaces, so
    # doing it earlier would leave space runs and leading/trailing spaces.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [108]:
# Add a 'clear_text' column: clean_text() applied to every row of 'data', with a tqdm progress bar.
train_data['clear_text'] = train_data['data'].progress_map(clean_text)

  0%|          | 0/39992 [00:00<?, ?it/s]

In [110]:
# Apply the same cleaning to the test set.
test_data['clear_text'] = test_data['data'].progress_map(clean_text)

  0%|          | 0/5000 [00:00<?, ?it/s]

## 4. tokenizer 및 불용어 제거
- Using Mecab for tokenizing

In [None]:
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [117]:
from konlpy.tag import Mecab

mecab = Mecab()

In [129]:
# Load the Korean stopword list: a tab-separated file with no header row.
# The three columns are named 형태 / 품사 / 비율 (form, POS tag, ratio, judging by the preview below).
stop_df = pd.read_csv(os.path.join(path, '한국어불용어100.txt'), sep = '\t', header = None, names = ['형태','품사','비율'])
stop_df.head()

Unnamed: 0,형태,품사,비율
0,이,VCP,0.01828
1,있,VA,0.011699
2,하,VV,0.009774
3,것,NNB,0.009733
4,들,XSN,0.006898


In [None]:
# Stopword surface forms as a plain Python list (used by tokenizer_and_stopword).
stop_words = stop_df['형태'].tolist()
stop_words

In [139]:
def tokenizer_and_stopword(text):
    """Return the nouns of *text* joined into one space-separated string.

    Nouns are extracted with the module-level ``mecab`` tagger. A noun is
    dropped when it appears in the module-level ``stop_words`` or when it is
    shorter than two characters.
    """
    kept = []
    for noun in mecab.nouns(text):
        if noun in stop_words:
            continue  # stopword
        if len(noun) <= 1:
            continue  # single-character nouns are discarded
        kept.append(noun)

    return ' '.join(kept)

In [134]:
# Sanity check: Mecab noun extraction on the first cleaned training sample.
print(mecab.nouns(train_data.loc[0, 'clear_text']))

['신혼', '부부', '주택', '정책', '보육', '시설', '국민', '세금', '일부', '정책', '보편', '국민', '수긍', '수', '복지', '정책', '저', '신혼', '부부', '당첨', '사람', '로또', '주택', '정책', '반대', '국민', '세금', '일부', '사람', '식', '세금', '우리', '수', '보육', '시설', '전국', '설치', '기업', '솔선수범', '사업장', '의무', '설치', '수', '집', '애', '데', '경력', '단절', '게', '집', '개인', '능력', '게', '능력', '수', '육아', '전담', '힘', '게', '우리', '아이', '부모', '거', '이제', '국가', '시대', '게', '부동산', '가격', '게', '정부', '정책', '부동산', '역효과']


In [140]:
# Add a 'clear_text2' column: nouns only, stopwords removed, joined by spaces.
train_data['clear_text2'] = train_data['clear_text'].progress_map(tokenizer_and_stopword)

  0%|          | 0/39992 [00:00<?, ?it/s]

In [153]:
# Same noun extraction / stopword removal for the test set.
test_data['clear_text2'] = test_data['clear_text'].progress_map(tokenizer_and_stopword)

  0%|          | 0/5000 [00:00<?, ?it/s]

## 5. 단어 집합 만들기 및 인코딩

In [156]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

In [146]:
# Convert the pandas Series to a list, then build the word index from it.
tokenizer.fit_on_texts(train_data['clear_text2'].tolist())

In [None]:
tokenizer.word_index

In [166]:
total_cnt = len(tokenizer.word_index) # number of distinct words in the fitted word index
total_cnt

45441

In [174]:
vocab_size = 30000 # fix the vocabulary size at 30000; very rare words are probably best dropped
# Re-create and re-fit the tokenizer with the size cap.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['clear_text2'].tolist())

# word_index still holds every word (the length printed below is unchanged at 45441).
# NOTE(review): presumably num_words only takes effect in texts_to_sequences — confirm against the Keras docs.
len(tokenizer.word_index)

45441

In [175]:
# Encode each document as a sequence of integer word indices using the tokenizer fitted on the training set.
X_train = tokenizer.texts_to_sequences(train_data['clear_text2'].tolist())
X_test = tokenizer.texts_to_sequences(test_data['clear_text2'].tolist())

In [176]:
max_len = 300 # fixed sequence length; sequences that do not reach 300 are padded

# Bring every sequence to exactly max_len using pad_sequences' default settings.
pad_X_train = pad_sequences(X_train, maxlen = max_len)
pad_X_test = pad_sequences(X_test, maxlen = max_len)

In [152]:
# One-hot encode the integer category labels (three columns in the output shown below).
y_train = to_categorical(np.array(train_data['category']))

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

## 6. 간단한 모델로 학습 및 테스트

In [162]:
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,  ReduceLROnPlateau
from tensorflow.keras.regularizers import *
from tensorflow.keras.optimizers import Adam, RMSprop

In [183]:
embedding_dim = 64

# Small 1D-CNN text classifier:
# embedding -> one convolution -> global max pooling -> 3-way softmax.
# NOTE(review): per the Keras docs the first positional parameter of
# TruncatedNormal is `mean`, so 0.02 here would set the mean rather than
# the stddev — confirm this is intended.
model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(
        filters=64,
        kernel_size=5,
        activation='relu',
        padding='same',
        kernel_regularizer=l2(0.01),
        kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
    ),
    GlobalMaxPooling1D(),
    Dense(
        3,
        activation='softmax',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),
    ),
])

model1.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 300, 64)           1920000   
                                                                 
 conv1d_6 (Conv1D)           (None, 300, 64)           20544     
                                                                 
 global_max_pooling1d_6 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,940,739
Trainable params: 1,940,739
Non-trainable params: 0
_________________________________________________________________


In [189]:
# `learning_rate` replaces the deprecated `lr` keyword (the old spelling
# triggered the optimizer warning printed under this cell).
model1.compile(optimizer=RMSprop(learning_rate=.0005), loss='categorical_crossentropy', metrics=['acc'])

# Multiply the learning rate by 0.2 after 5 epochs without improvement.
# NOTE(review): early stopping below has patience 2, so this may rarely get
# the chance to fire — confirm the intended patience values.
reLR = ReduceLROnPlateau(patience = 5, verbose=1, factor = .2)
# Stop once val_acc has not improved for 2 epochs. `restore_best_weights` is
# an EarlyStopping option, so it is set here rather than on ModelCheckpoint.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience = 2, restore_best_weights=True)
# Save the model to Drive whenever val_acc reaches a new maximum.
mc = ModelCheckpoint(filepath = os.path.join(path, '1028_1.h5'), monitor='val_acc', mode='max', verbose=1, save_best_only=True)

  super(RMSprop, self).__init__(name, **kwargs)


In [190]:
# Fraction of the training data held out for validation.
val_rate = 0.2

# Train for up to 30 epochs; the callbacks handle early stopping, checkpointing and learning-rate reduction.
history1 = model1.fit(pad_X_train, y_train, epochs=30, batch_size= 64, shuffle = True, validation_split=val_rate, verbose = 1, callbacks=[es, mc, reLR])

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.86161, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 2/30
Epoch 2: val_acc improved from 0.86161 to 0.86823, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 3/30
Epoch 3: val_acc improved from 0.86823 to 0.86998, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 4/30
Epoch 4: val_acc improved from 0.86998 to 0.87086, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 5/30
Epoch 5: val_acc improved from 0.87086 to 0.87248, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 6/30
Epoch 6: val_acc improved from 0.87248 to 0.87473, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 7/30
Epoch 7: val_acc improved from 0.87473 to 0.87536, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 8/30
Epoch 8: val_acc improved from 0.87536 to 0.87598, saving model to gdrive/My Drive/dataset/bluehouse/1028_1.h5
Epoch 9/30
Epoch 9: val_acc

In [194]:
# Reload the checkpoint written by ModelCheckpoint during training.
# NOTE(review): the original note recorded val_acc = 0.8692, but the training log above shows the checkpoint reaching at least 0.87598 — confirm which run this refers to.
model1 = load_model( os.path.join(path, '1028_1.h5'))

# Class probabilities for the test set, then the index of the most probable class per row.
y_prob = model1.predict(pad_X_test, verbose=0) 
predicted = y_prob.argmax(axis=-1)

In [195]:
predicted

array([0, 2, 1, ..., 1, 0, 2])

## 7. 실제 예시로 테스트 해보기

In [217]:
# Run one real petition title through the same pipeline as the training data:
# clean -> extract nouns / drop stopwords -> integer-encode -> pad to max_len.
test_example = '유소년 아이스하키 선수들의 꿈을 짓밟은 ‘인천 **국제빙상장위탁업체를’를 고발 합니다.'
test_example = clean_text(test_example)
test_example = tokenizer_and_stopword(test_example)
test_arr = tokenizer.texts_to_sequences([test_example])
test_arr = pad_sequences(test_arr, maxlen = max_len)

In [218]:
test_example

'유소년 아이스하키 선수 인천 국제 상장 위탁 업체 고발'

In [219]:
# Predicted class index for the single padded example.
model1.predict([test_arr], verbose = 0).argmax(axis=-1)

# Label legend:
# 	0 : human rights / gender equality
# 	1 : culture / arts / sports / media
# 	2 : childcare / education

array([1])