<a href="https://colab.research.google.com/github/changyong93/Natural-language-processing-with-chat-bot/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC_%EC%9E%85%EB%AC%B8(12_5_NLP%EB%A5%BC_%EC%9C%84%ED%95%9C_%EC%8B%A0%EA%B2%BD%EB%A7%9D(CNN)_Multi_Kernel_1D_CNN%EC%9C%BC%EB%A1%9C_%EB%84%A4%EC%9D%B4%EB%B2%84_%EC%98%81%ED%99%94_%EB%A6%AC%EB%B7%B0_%EB%B6%84%EB%A5%98%ED%95%98%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multi-Kernel 1D CNN으로 네이버 영화 리뷰 분류하기

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request

from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 데이터 로드

In [None]:
# Download the train and test rating files from the e9t/nsmc repository
# into the current working directory, keeping the remote file names.
nsmc_base_url = "https://raw.githubusercontent.com/e9t/nsmc/master/"
for nsmc_file in ("ratings_train.txt", "ratings_test.txt"):
    urllib.request.urlretrieve(nsmc_base_url + nsmc_file, filename=nsmc_file)

In [None]:
# Parse both tab-separated rating files into DataFrames.
train_data, test_data = (
    pd.read_csv(path, sep="\t")
    for path in ("ratings_train.txt", "ratings_test.txt")
)

In [None]:
# Report how many rows each split contains.
for split_label, frame in (
    ("훈련용 데이터 개수: ", train_data),
    ("테스트용 데이터 개수: ", test_data),
):
    print(split_label, len(frame))

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Preview the first rows of the test split.
test_data.head()

## 데이터 정제

In [None]:
# Count distinct values per column; a count below the row count means the
# column contains duplicates.
train_data.nunique()

In [None]:
# Keep only the first occurrence of each review text, then show the new shape.
is_repeated_document = train_data.duplicated(subset=["document"])
train_data = train_data[~is_repeated_document]
train_data.shape

In [None]:
# Inspect the class balance of the training labels.
# The count table is printed explicitly: a bare expression that is not the
# last statement of a notebook cell is evaluated and then discarded, so the
# table was never shown before.
label_counts = train_data.label.value_counts()
print(label_counts.reset_index(name="count"))
label_counts.plot.bar()

In [None]:
# Per column: does it contain at least one missing value?
train_data.isnull().any()

In [None]:
# Per column: how many values are missing?
train_data.isnull().sum()

In [None]:
# Show the rows whose review text is missing.
train_data[train_data.document.isnull()]

In [None]:
# Discard every row that has a missing value in any column, then show the shape.
row_is_complete = train_data.notnull().all(axis=1)
train_data = train_data[row_is_complete]
train_data.shape

In [None]:
# Remove every character that is not Hangul (jamo or complete syllables) or a
# space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would turn this character-class
# filter into a silent no-op.
train_data["document"] = train_data["document"].str.replace(
    r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)
train_data.head()

In [None]:
# Strip leading spaces so that reviews consisting only of spaces become empty
# strings.
# regex=True is required: with the pandas >= 2.0 default (regex=False) the
# anchor "^ +" would be searched for literally and nothing would be removed.
train_data["document"] = train_data["document"].str.replace(
    r"^ +", "", regex=True
)
train_data.head()

In [None]:
# Turn reviews that are now empty strings into NaN so they can be dropped
# together with other missing values; then count missing values per column.
documents = train_data["document"]
train_data["document"] = documents.where(documents != "", np.nan)
train_data.isnull().sum()

In [None]:
# Preview some of the rows whose review text is now NaN.
train_data[train_data["document"].isnull()].head()

In [None]:
# Remove the rows that contain a missing value in any column and show the
# resulting shape.
row_has_gap = train_data.isnull().any(axis=1)
train_data = train_data.loc[~row_has_gap]
train_data.shape

In [None]:
# Apply the same cleaning steps to the test split, printing the shape before
# and after.
print(test_data.shape)
test_data = test_data.drop_duplicates(subset=["document"])
# Both replacements are regular expressions, so regex=True must be passed:
# since pandas 2.0 `Series.str.replace` defaults to literal matching, which
# would leave the text unchanged.
test_data["document"] = (
    test_data["document"]
    .str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)  # keep Hangul and spaces only
    .str.replace(r"^ +", "", regex=True)  # strip leading spaces
    .replace("", np.nan)  # mark emptied reviews as missing
)
test_data = test_data.dropna(how="any")
print(test_data.shape)

## 토큰화

In [None]:
# Tokenize every training review with Okt's `morphs`, called with stem=True.
okt = Okt()
train_data["tokenized"] = [
    okt.morphs(review, stem=True) for review in train_data["document"]
]

In [None]:
# Tokens listed here are removed from every tokenized review.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
train_data["tokenized"] = [
    [token for token in tokens if token not in stopwords]
    for tokens in train_data["tokenized"]
]

In [None]:
# Show the token lists of the first five training reviews.
train_data["tokenized"][:5].values

In [None]:
# Tokenize the test reviews the same way as the training reviews and drop
# stopwords in the same pass.
test_data["tokenized"] = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in test_data["document"]
]

In [None]:
# Show the token lists of the first five test reviews.
test_data["tokenized"][:5].values

## 정수인코딩

In [None]:
# First pass: fit a tokenizer without a vocabulary limit on the training
# tokens; its word counts are used below to choose the vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data["tokenized"])

In [None]:
# Dump the full word -> integer index mapping learned by the tokenizer.
print(tokenizer.word_index)

In [None]:
# Measure how much of the vocabulary, and of all token occurrences, is made up
# of "rare" words, i.e. words seen fewer than `threshold` times.
threshold = 3
total_cnt = len(tokenizer.word_index)  # vocabulary size

word_freqs = list(tokenizer.word_counts.values())
total_freq = sum(word_freqs)  # total number of token occurrences

# Compare against `threshold` (the original hard-coded 3 here, so changing
# `threshold` silently had no effect on the statistics).
rare_freqs = [freq for freq in word_freqs if freq < threshold]
rare_cnt = len(rare_freqs)  # number of rare words
rare_freq = sum(rare_freqs)  # occurrences contributed by rare words

print("단어 집합 크기: ",total_cnt)
# Message typo fixed: "등반" -> "등장" (frequency of appearance).
print(f"등장 빈도가 {threshold} 미만인 희귀 단어 수: {rare_cnt}")
print(f"전체 단어 중 희귀 단어 비율: {rare_cnt / total_cnt * 100:.3f}")
print(f"전체 등장 빈도 중 희귀 단어 비율: {rare_freq / total_freq * 100:.3f}")

In [None]:
# Vocabulary size after discarding rare words, plus one extra slot.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras
# tokenizer does not assign to any word and padding uses - confirm.
vocab_size = total_cnt - rare_cnt + 1

In [None]:
# Display the chosen vocabulary size.
vocab_size

In [None]:
# Second pass: re-fit a tokenizer limited to `vocab_size` on the training
# tokens, then convert both splits to integer sequences with it.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data["tokenized"])

X_train, X_test = (
    tokenizer.texts_to_sequences(frame["tokenized"])
    for frame in (train_data, test_data)
)

In [None]:
# Extract the label column of each split as a NumPy array.
y_train, y_test = (
    np.array(frame["label"]) for frame in (train_data, test_data)
)

## 빈 샘플 제거

In [None]:
# Indices of training samples that have no tokens left after encoding.
drop_train = [idx for idx, sentence in enumerate(X_train) if len(sentence) < 1]

# Remove the empty samples.
# X_train is a ragged list of lists, so it is filtered in pure Python:
# np.delete would first convert it to an array, and NumPy >= 1.24 raises
# ValueError when asked to build an array from sequences of unequal length.
# X_train therefore stays a list; y_train is a regular 1-D array and can
# still go through np.delete.
empty_positions = set(drop_train)
X_train = [
    sentence for idx, sentence in enumerate(X_train)
    if idx not in empty_positions
]
y_train = np.delete(y_train, drop_train, axis=0)

print(len(X_train), len(y_train))

## 패딩

In [None]:
# For every candidate length from 1 up to the longest sample, print the
# percentage of training samples that are no longer than that length.
len_list = [len(s) for s in X_train]
max_len = max(len_list)
sample_lengths = np.array(len_list)
for length in range(1, max_len + 1):
    cnt = int((sample_lengths <= length).sum())
    print(f"샘플 중 길이가 {length} 이하인 샘플의 비율: {cnt / len(len_list)*100:.3f}")

In [None]:
# Bring every sequence in both splits to a fixed length of `max_len`.
max_len = 30
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

## Multi-Kernel 1D CNN으로 네이버 영화 리뷰 분류하기

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Dropout, Conv1D, Input, Flatten, Concatenate


In [None]:
# Hyperparameter definitions
embedding_dim = 128
# Pair of dropout rates. The original literal `(0,5, 0.8)` used a comma where
# a decimal point was intended, producing the 3-tuple (0, 5, 0.8): index 0
# disabled dropout and index 1 was not a valid rate. The first rate of 0.5
# matches the "50% dropout after the embedding layer" note in the next cell.
# NOTE(review): the misspelled name `dropout_porb` is kept because cells
# outside this view may reference it.
dropout_porb = (0.5, 0.8)
num_filters = 128

In [None]:
# Define the input layer and the embedding layer.
# A 50% dropout is meant to follow the embedding layer
# (NOTE(review): that layer is not visible in this part of the file).

# The input has shape (max_len,); the embedding is built with
# input_dim=vocab_size and output_dim=embedding_dim.
model_input = Input(shape = (max_len,))
z = Embedding(input_dim = vocab_size, output_dim = embedding_dim,input_length = max_len, name = "embedding")(model_input)