In [1]:
import os
os.getcwd()
os.chdir("/content/drive/My Drive/Colab Notebooks/mecab")

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
ratings_data = pd.read_table('../ratings.txt')
ratings_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [None]:
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        200000 non-null  int64 
 1   document  199992 non-null  object
 2   label     200000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


# 1. LSTM 복습. naver 리뷰(split을 이용한 토크나이즈)

In [None]:
# 전처리---------------------------------------------
ratings_data['document'] = ratings_data['document'].str.replace(r'[^\w]', ' ')
ratings_data['document'] = ratings_data['document'].replace('', np.nan)
ratings_data = ratings_data.dropna(how='any', axis=0)
print("# preprocessing done")


# 토크나이즈-----------------------------------------
docu_train, docu_test, y_train, y_test = train_test_split(ratings_data['document'], ratings_data['label'], shuffle=False)
print("# split done")

X_train = []
for stc in docu_train:
    X_train.append(stc.split())
  
X_test = []
for stc in docu_test:
    X_test.append(stc.split())

print("# tokenization done")

# 정수 인코딩----------------------------------------
# 토크나이저는 빈도수가 높은 것부터 인덱스를 부여함
tokenizer = Tokenizer()  # 단어 설정을 안하면 전체 단어가 들어감
                         # 쓸데없는 단어까지 다 들어가게 되는데,
                         # 그렇게 되면 loss가 늘어나게 된다.
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
print("# int_encoding done")

# preprocessing done
# split done
# tokenization done
# int_encoding done


In [None]:
print(len(tokenizer.word_index)) # 전체 단어 수

288992


In [None]:
# 빈도수가 2이하인 단어의 개수
low_count = 0
for word, word_count in tokenizer.word_counts.items():
  if word_count <= 2:
    low_count += 1
print(low_count)

246329


In [None]:
len(tokenizer.word_index) - low_count

42663

In [None]:
# max_length 구하기
max_length = 0
for data in X_train:
      if max_length < len(data):
          max_length = len(data)
print(max_length)

47


In [None]:
# padding---------------------------------------------
max_len = max([len(x) for x in X_train]) # 문장의 최대 길이 or 평균 길이
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
print("# padding done")

# padding done


In [None]:
# 모델 생성-------------------------------------------

model = Sequential()
# 단어 임베딩 -> 5000개의 단어를 120차원으로 내보내겠다(정수 -> 120 dimension vector)

# LSTM에서는 120~128 dimension일 때 성능이 제일 좋다는 경험적 성과가 있음
# dimension은 hyperparameter
# 긴 문장은 높게, 짧은 문장은 작게 잡는 편임
# 보통 2의 거듭제곱 형태로 지정한다.

model.add(Embedding(20000, 120))
# LSTM
model.add(LSTM(120))
# 이진 분류(sigmoid)
model.add(Dense(1, activation='sigmoid'))
#------------------------------------------------------

In [None]:
# # 모델 성능 향상---------------------------------------

# # validation loss를 계속 보다가 5회 이상 loss가 증가하면, 과적합될 수 있으므로 학습을 조기 종료하겠다.
# early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# # epoch를 반복하면서, 가장 검증데이터 정확도가 높았던 순간을 체크포인트(the_best.h5)로 저장
# # 정확도가 낮아지면 모델 버려라
# model_check = ModelCheckpoint('the_best_korean_split.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
# %%time
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[early_stop, model_check])

In [None]:
# # 정확도 측정
# print(model.evaluate(X_test, y_test)) # [loss, acc]

# 2. [캐글 스팸데이터](https://www.kaggle.com/uciml/sms-spam-collection-dataset)를 활용한 스팸 메일 분류

In [5]:
spam = pd.read_csv('../spam.csv', encoding='latin1')
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
spam = spam.dropna(axis=1)

In [7]:
spam.isna().sum()

v1    0
v2    0
dtype: int64

In [8]:
spam.v1.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
# v1 전처리(y)
# -- 숫자로 변환
spam['v1'] = spam['v1'].replace('ham', 0)
spam['v1'] = spam['v1'].replace('spam', 1)

# v2 전처리(X)
# -- 단어만 남기기
spam['v2'] = spam['v2'].str.replace(r"[^\w]", ' ')
spam['v2'] = spam['v2'].replace('', np.nan)
spam['v2']
# print(spam.shape)

# -- null인 행 제거
spam = spam.dropna(how='any', axis=0)
# print(spam.shape)

# -- 소문자로 변경
spam['v2'] = spam['v2'].apply(lambda x: x.lower())

print("# preprocessing done")

#-----------------------------------------------------------
mail_train, mail_test, y_train, y_test = train_test_split(spam['v2'], spam['v1'], shuffle=False)

print('# split done')

#-----------------------------------------------------------
stopwords = ['the', 'a', 'an', 'i', 'my', 'me', 'mine', 'you', 'your', 'yours', 'she', 'her', 'hers', 'he', 'his', 'him']

X_train = []
for stc in mail_train:
    words = stc.split()
    for word in words:
        if word in stopwords:
            words.remove(word)
    X_train.append(words)

X_test = []
for stc in mail_test:
    words = stc.split()
    for word in words:
        if word in stopwords:
            words.remove(word)
    X_test.append(words)


print("# tokenization done")

# preprocessing done
# split done
# tokenization done


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
print("전체 token 개수:", len(tokenizer.word_index))
print("빈도수가 3이하인 token 개수:", len([word for word, word_count in tokenizer.word_counts.items() if word_count <= 3]))

전체 token 개수: 7512
빈도수가 3이하인 token 개수: 5658


In [11]:
#----------------------------------------------------------------------------
tokenizer = Tokenizer(1800)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
print("# int-encoding done")

# int-encoding done


In [13]:
# padding--------------------------------------------------------------------
max_len = max([len(x) for x in X_train]) # 137
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
print("# padding done")

# padding done


In [None]:
# 모델 생성----------------------------------------------------------------
model = Sequential()
model.add(Embedding(1800, 128, input_length=max_len))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# callback-----------------------------------------------------------------
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
model_check = ModelCheckpoint('LSTM_spam_best.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# 모델 학습----------------------------------------------------------------
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[early_stop, model_check])

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.98277, saving model to LSTM_spam_best.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.98277 to 0.98995, saving model to LSTM_spam_best.h5
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.98995
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.98995
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.98995
Epoch 6/10
Epoch 00006: val_accuracy did not improve from 0.98995
Epoch 7/10
Epoch 00007: val_accuracy did not improve from 0.98995
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f1afd0b8898>

In [None]:
# 모델 평가------------------------------------------------------------
model.evaluate(X_test, y_test)



[0.050080396234989166, 0.9856424927711487]

In [33]:
# 새로운 데이터 평가
model = load_model('LSTM_spam_best.h5')
sentence = input()

# 토큰화
token_stc = sentence.split()
token_stc = [word.lower() for word in token_stc]

# 정수 인코딩
encode_stc = tokenizer.texts_to_sequences([token_stc])

# 패딩
pad_stc = pad_sequences(encode_stc, maxlen=max_len)

score = model.predict(pad_stc)
print(score)

URGENT! Your Mobile No 07808726822 was awarded a L2,000 Bonus Caller Prize on 02/09/03! This is our 2nd attempt to contact YOU! Call 0871-872-9758 BOX95QU
[[0.99090266]]


에러 무엇?