# 머신러닝을 이용한 텍스트 분류기

In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import reuters
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산

print(tf.__version__)
print(sns.__version__)
print(np.__version__)
print(pd.__version__)
print(sklearn.__version__)

2.8.0
0.11.2
1.21.5
1.3.5
1.0.2


## 모델 준비(머신 러닝)

In [3]:
nb = MultinomialNB()
cb = ComplementNB()
lr = LogisticRegression(C=10000, penalty='l2')
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
forest = RandomForestClassifier(n_estimators=5, random_state=0)
grbt = GradientBoostingClassifier(random_state=0) # verbose=3
voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)

## 모든 단어 활용

In [2]:
(x_train1, y_train1), (x_test1, y_test1) = reuters.load_data(num_words=None, test_split=0.2)

In [3]:
print('훈련용 뉴스의 최대 길이 :{}'.format(max(len(l) for l in x_train1)))
print('훈련용 뉴스의 평균 길이 :{}'.format(sum(map(len, x_train1))/len(x_train1)))

훈련용 뉴스의 최대 길이 :2376
훈련용 뉴스의 평균 길이 :145.5398574927633


In [4]:
num_classes = max(y_train1) + 1
print('클래스의 수 : {}'.format(num_classes))

클래스의 수 : 46


In [5]:
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = { index+3 : word for word, index in word_index.items() }
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
  index_to_word[index]=token

In [6]:
decoded = []
for i in range(len(x_train1)):
    t = ' '.join([index_to_word[index] for index in x_train1[i]])
    decoded.append(t)

x_train1 = decoded
print(len(x_train1))

8982


In [7]:
decoded = []
for i in range(len(x_test1)):
    t = ' '.join([index_to_word[index] for index in x_test1[i]])
    decoded.append(t)

x_test1 = decoded
print(len(x_test1))

2246


In [9]:
x_train1[:5]

['<sos> mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3',
 "<sos> generale de banque sa lt genb br and lt heller overseas corp of chicago have each taken 50 pct stakes in factoring company sa belgo factors generale de banque said in a statement it gave no financial details of the transaction sa belgo factors' turnover in 1986 was 17 5 billion belgian francs reuter 3",
 '<sos> shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlrs vs 22 cts net 46 0 mln vs 3 328 000 avg shrs 14 0 mln vs 15 2 mln year shr 5 41 dlrs vs 1 56 dlrs shr diluted 4 94 dlrs vs 1 50 dlrs net 78 2 mln vs 25 9 mln avg shrs 14 5 mln vs 15 1 mln note earnings per share reflect th

In [10]:
x_test1[:5]

['<sos> the great atlantic and pacific tea co said its three year 345 mln dlr capital program will be be substantially increased to accommodate growth and expansion plans for waldbaum inc and shopwell inc over the next two years a and p said the acquisition of shopwell in august 1986 and waldbaum in december helped us achieve better than expected results in the fourth quarter ended february 28 its net income from continuing operations jumped 52 6 pct to 20 7 mln dlrs or 55 cts a share in the latest quarter as sales increased 48 3 pct to 1 58 billion dlrs a and p gave no details on the expanded capital program but it did say it completed the first year of the program during 1986 a and p is 52 4 pct owned by lt tengelmann warenhandelsgesellschaft of west germany reuter 3',
 "<sos> philippine sugar production in the 1987 88 crop year ending august has been set at 1 6 mln tonnes up from a provisional 1 3 mln tonnes this year sugar regulatory administration sra chairman arsenio yulo said yu

In [8]:
dtmvector = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [9]:
x_train1_dtm = dtmvector.fit_transform(x_train1)
tfidfv1 = tfidf_transformer.fit_transform(x_train1_dtm)

In [10]:
x_test1_dtm = dtmvector.transform(x_test1)
tfidfv_test1 = tfidf_transformer.transform(x_test1_dtm)

In [14]:
nb.fit(tfidfv1, y_train1)
predicted = nb.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.5997328584149599


In [15]:
cb.fit(tfidfv1, y_train1)
predicted = cb.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.7649154051647373


In [16]:
lr.fit(tfidfv1, y_train1)
predicted = lr.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.813446126447017


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [17]:
lsvc.fit(tfidfv1, y_train1)
predicted = lsvc.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.7858414959928762




In [18]:
tree.fit(tfidfv1, y_train1)
predicted = tree.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.6211041852181657


In [19]:
forest.fit(tfidfv1, y_train1)
predicted = forest.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.6544968833481746


In [20]:
grbt.fit(tfidfv1, y_train1)
predicted = grbt.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.7702582368655387


In [21]:
voting_classifier.fit(tfidfv1, y_train1)
predicted = voting_classifier.predict(tfidfv_test1)
print("정확도:", accuracy_score(y_test1, predicted))

정확도: 0.8187889581478184


## 5,000개 활용

In [12]:
(x_train2, y_train2), (x_test2, y_test2) = reuters.load_data(num_words=5000, test_split=0.2)

In [13]:
decoded = []
for i in range(len(x_train2)):
    t = ' '.join([index_to_word[index] for index in x_train2[i]])
    decoded.append(t)

x_train2 = decoded
print(len(x_train2))

8982


In [14]:
decoded = []
for i in range(len(x_test2)):
    t = ' '.join([index_to_word[index] for index in x_test2[i]])
    decoded.append(t)

x_test2 = decoded
print(len(x_test2))

2246


In [15]:
x_train2_dtm = dtmvector.fit_transform(x_train2)
tfidfv2 = tfidf_transformer.fit_transform(x_train2_dtm)

In [16]:
x_test2_dtm = dtmvector.transform(x_test2)
tfidfv_test2 = tfidf_transformer.transform(x_test2_dtm)

In [27]:
nb.fit(tfidfv2, y_train2)
predicted = nb.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.6731967943009796


In [28]:
cb.fit(tfidfv2, y_train2)
predicted = cb.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.7707034728406055


In [29]:
lr.fit(tfidfv2, y_train2)
predicted = lr.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.8058771148708815


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
lsvc.fit(tfidfv2, y_train2)
predicted = lsvc.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.7671415850400712




In [31]:
tree.fit(tfidfv2, y_train2)
predicted = tree.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.6179875333926982


In [32]:
forest.fit(tfidfv2, y_train2)
predicted = forest.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.701246660730187


In [33]:
grbt.fit(tfidfv2, y_train2)
predicted = grbt.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.767586821015138


In [34]:
voting_classifier.fit(tfidfv2, y_train2)
predicted = voting_classifier.predict(tfidfv_test2)
print("정확도:", accuracy_score(y_test2, predicted))

정확도: 0.8161175422974176


## 20,000개 활용

In [17]:
(x_train3, y_train3), (x_test3, y_test3) = reuters.load_data(num_words=20000, test_split=0.2)

In [18]:
decoded = []
for i in range(len(x_train3)):
    t = ' '.join([index_to_word[index] for index in x_train3[i]])
    decoded.append(t)

x_train3 = decoded
print(len(x_train3))

8982


In [19]:
decoded = []
for i in range(len(x_test3)):
    t = ' '.join([index_to_word[index] for index in x_test3[i]])
    decoded.append(t)

x_test3 = decoded
print(len(x_test3))

2246


In [20]:
x_train3_dtm = dtmvector.fit_transform(x_train3)
tfidfv3 = tfidf_transformer.fit_transform(x_train3_dtm)

In [21]:
x_test3_dtm = dtmvector.transform(x_test3)
tfidfv_test3 = tfidf_transformer.transform(x_test3_dtm)

In [40]:
nb.fit(tfidfv3, y_train3)
predicted = nb.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.6193232413178985


In [41]:
cb.fit(tfidfv3, y_train3)
predicted = cb.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.7671415850400712


In [42]:
lr.fit(tfidfv3, y_train3)
predicted = lr.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.8098842386464826


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [43]:
lsvc.fit(tfidfv3, y_train3)
predicted = lsvc.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.7782724844167409




In [44]:
tree.fit(tfidfv3, y_train3)
predicted = tree.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.6211041852181657


In [45]:
forest.fit(tfidfv3, y_train3)
predicted = forest.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.6714158504007124


In [46]:
grbt.fit(tfidfv3, y_train3)
predicted = grbt.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.7702582368655387


In [47]:
voting_classifier.fit(tfidfv3, y_train3)
predicted = voting_classifier.predict(tfidfv_test3)
print("정확도:", accuracy_score(y_test3, predicted))

정확도: 0.8178984861976848


## RNN 학습

In [11]:
rnn_x_train1 = tfidfv1.toarray()
rnn_x_test1 = tfidfv_test1.toarray()

In [12]:
vocab_size = 28842
word_vector_dim = 200

In [13]:
num_classes

46

In [14]:
from tensorflow import keras

model = keras.Sequential()

model.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.LSTM(32, activation='tanh'))
model.add(keras.layers.Dense(num_classes, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 200)         5768400   
                                                                 
 lstm (LSTM)                 (None, 32)                29824     
                                                                 
 dense (Dense)               (None, 46)                1518      
                                                                 
Total params: 5,799,742
Trainable params: 5,799,742
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [16]:
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
epochs=10  # 몇 epoch를 훈련하면 좋을지 결과를 보면서 바꾸어 봅시다. 

history = model.fit(rnn_x_train1,
                    y_train1,
                    epochs=epochs,
                    batch_size=128,
                    callbacks=[es],
                    validation_split=0.2,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: ignored

In [None]:
from scipy import sparse 

sparse.issparse(rnn_x_test1)

results = model.evaluate(rnn_x_test1, y_test1, verbose=2)
print(results)

In [77]:
rnn_x_train2 = tfidfv2.toarray()
rnn_x_test2 = tfidfv_test2.toarray()

vocab_size = 5000

model2 = keras.Sequential()

model2.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model2.add(keras.layers.LSTM(32, activation='tanh'))
model2.add(keras.layers.Dense(num_classes, activation='softmax'))

model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history2 = model2.fit(rnn_x_train2,
                    y_train2,
                    epochs=epochs,
                    batch_size=60,
                    callbacks=[es],
                    validation_split=0.2,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping


In [78]:
sparse.issparse(rnn_x_test2)

results2 = model2.evaluate(rnn_x_test2, y_test1, verbose=2)
print(results2)

71/71 - 5s - loss: 2.4203 - accuracy: 0.3620 - 5s/epoch - 72ms/step
[2.420342445373535, 0.36197686195373535]


In [79]:
rnn_x_train3 = tfidfv3.toarray()
rnn_x_test3 = tfidfv_test3.toarray()

vocab_size = 20000

model3 = keras.Sequential()

model3.add(keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model3.add(keras.layers.LSTM(32, activation='tanh'))
model3.add(keras.layers.Dense(num_classes, activation='softmax'))

model3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history2 = model3.fit(rnn_x_train3,
                    y_train3,
                    epochs=epochs,
                    batch_size=60,
                    callbacks=[es],
                    validation_split=0.2,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: ignored

In [None]:
sparse.issparse(rnn_x_test3)

result3 = model3.evaluate(rnn_x_test3, y_test3, verbose=2)
print(result3)

#회고

## F-1 score

confusion matrix에 대한 개념은 사실 인공지능에서만 나오는 개념도 아닐 뿐더러, 내가 공부했던 정보보안에서도 꽤나 중요하게 다루는 내용이다 보니 어느정도 사전지식을 가지고 있고, 가지고 있던 개념을 토대로 인공지능에서의 관점을 적용해서 이해하고 있다고 생각했고, F1-score도 완벽하지는 않지만 대강은(recall도 precision도 온전히 신뢰할 수 없기 때문에 둘의 조화평균을 계산한다. 정도? 사실 조화평균이라는 표현도 뭔지 모르겠긴 하다.)이해를 하고 있었는데, 각 모델에 적용해보려니 머리가 새하얘졌다. 조금 더 시간을 투자할 수 있었더라면 그래도 적용해볼 수 있었겠지 싶은데 또 이렇게 매번 밀려서 시간투자가 아쉬워진다. 결국 코드가 약하면 결정적으로 결과를 낼 수가 없다. 어쨋든 저쨋든 결국 컴퓨터공학인 이상, 컴퓨터공학적으로 활용하고자 하는 이상 코딩이 정수다.

## 결과에 대하여

가장 의외이고 이해할 수 없었던 점은 가장 적은 단어를 활용한 사례가 오히려 높은 성능을 발휘했다는 것이다. 내가 잘못한 건지 이게 합리적인 결과인지를 모르겠어서 걱정스럽다. 심지어 모든 단어를 활용한 경우가 성능이 몹시 낮다. 하긴 글의 종류를 여러 개 주어주고 맞추는 문제라면 사실 인간의 입장에서는 단어가 5,000개까지도 필요가 없지 싶다. 항상 그래왔듯이 하고자 하는 태스크에 맞게 데이터의 갯수던 모델의 깊이던 설정해야 좋은 성능을 보여주는 것 같다. 문제는 아직 태스크에 따른 적절한 수준을 판단할 수 없다는 점이지만...

## fit_transform & transform

슥 보고 같은 함수라고 생각하고 긁어서 쓰다가 에러를 만나 얼떨결에 공부하게 되었다.
완벽하게 이해한 것은 아니지만 결론만 얘기하자면 fit_transform()은 train data에만 사용한다. mean과 varience를 학습하기 때문인데, test data에 fit_transform을 사용할 경우 새로운 mean과 varience를 학습하게 되어 학습의 성능을 점검하게 되는 것이 아닌 새로운 학습을 하게 된다는 것이다.

## 딥러닝 모델

딥러닝 모델과의 비교까지는 해보려 했는데 역시 순탄치 않았다. 몇 푼 안되지만 돈주고 등록한 GPU는 LSTM의 유닛을 늘리거나 다층 LSTM으로 쌓는 순간 터져서 멈추기 일쑤였고, 그럴 때마다 세션을 통째로 날려버려서 처음부터 다시 돌려야 했다. 뒤에 fully connected dense 레이어를 쌓아보기도 하고, scaler 기법? 이라는 알지 못했던 코드를 넣어보기도 했다. 아무튼 성능이 비교 가능한 수준으로 올라가지가 않는다. 이 태스크를 LSTM으로 보여줄만한 성능을 낼 수가 있는게 맞긴한걸까? 학습을 성공시킨 코드를 정말 보고싶다.

# REF
* fit_transform & transform<br>
https://deepinsight.tistory.com/165
<br>
<br>
* LSTM<br>
https://ebbnflow.tistory.com/135<br>
https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM<br>
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense<br>
https://blog.naver.com/PostView.nhn?blogId=htk1019&logNo=221255254613<br>
https://wdprogrammer.tistory.com/23