In [4]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = { index+3 : word for word, index in word_index.items() }
for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
  index_to_word[index]=token

In [3]:
dtmvector = CountVectorizer()
tfidf_transformer = TfidfTransformer()

***

# 1. 모든 단어 사용

In [4]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [5]:
decoded = []
for i in range(len(x_train)):
    t = ' '.join([index_to_word[index] for index in x_train[i]])
    decoded.append(t)

x_train = decoded

In [6]:
decoded = []
for i in range(len(x_test)):
    t = ' '.join([index_to_word[index] for index in x_test[i]])
    decoded.append(t)

x_test = decoded

In [7]:
x_train_dtm = dtmvector.fit_transform(x_train)
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

In [8]:
x_test_dtm = dtmvector.transform(x_test)
tfidfv_test = tfidf_transformer.transform(x_test_dtm)

## 1_1. 나이브 베이즈 분류기

In [9]:
model = MultinomialNB()
model.fit(tfidfv, y_train)

MultinomialNB()

In [10]:
predicted = model.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.5997328584149599


In [11]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.79      0.21      0.33       105
           2       0.00      0.00      0.00        20
           3       0.72      0.92      0.81       813
           4       0.45      0.96      0.61       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.00      0.00      0.00        25
          10       0.00      0.00      0.00        30
          11       0.80      0.29      0.42        83
          12       0.00      0.00      0.00        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.75      0.18      0.29        99
          17       0.00    

## 1_2. 컴플리먼트 나이브 베이즈 분류기

In [12]:
cb = ComplementNB()
cb.fit(tfidfv, y_train)

ComplementNB()

In [13]:
predicted = cb.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7649154051647373


In [14]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.86      0.50      0.63        12
           1       0.63      0.88      0.73       105
           2       0.91      0.50      0.65        20
           3       0.87      0.91      0.89       813
           4       0.75      0.93      0.83       474
           5       0.00      0.00      0.00         5
           6       0.92      0.86      0.89        14
           7       1.00      0.67      0.80         3
           8       0.43      0.08      0.13        38
           9       0.81      0.88      0.85        25
          10       0.96      0.73      0.83        30
          11       0.55      0.67      0.61        83
          12       0.00      0.00      0.00        13
          13       0.62      0.54      0.58        37
          14       0.00      0.00      0.00         2
          15       0.50      0.11      0.18         9
          16       0.67      0.77      0.71        99
          17       0.00    

## 1_3. 로지스틱 회귀

In [15]:
lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=10000)

In [16]:
predicted = lr.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.813446126447017


In [17]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.75      0.80      0.77       105
           2       0.70      0.70      0.70        20
           3       0.93      0.93      0.93       813
           4       0.81      0.87      0.84       474
           5       1.00      0.20      0.33         5
           6       0.93      1.00      0.97        14
           7       1.00      0.67      0.80         3
           8       0.68      0.71      0.69        38
           9       0.81      0.88      0.85        25
          10       0.93      0.87      0.90        30
          11       0.66      0.73      0.70        83
          12       0.57      0.31      0.40        13
          13       0.61      0.62      0.61        37
          14       0.67      1.00      0.80         2
          15       0.71      0.56      0.63         9
          16       0.71      0.77      0.74        99
          17       0.67    

## 1_4. 선형 서포트 백터 머신

In [18]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)



LinearSVC(C=1000, dual=False, max_iter=500, penalty='l1')

In [19]:
predicted = lsvc.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7871772039180766


In [20]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.71      0.71      0.71       105
           2       0.73      0.80      0.76        20
           3       0.91      0.92      0.91       813
           4       0.81      0.86      0.84       474
           5       0.00      0.00      0.00         5
           6       0.72      0.93      0.81        14
           7       0.50      0.33      0.40         3
           8       0.62      0.63      0.62        38
           9       0.95      0.84      0.89        25
          10       0.96      0.87      0.91        30
          11       0.64      0.73      0.68        83
          12       0.36      0.38      0.37        13
          13       0.51      0.54      0.53        37
          14       0.50      0.50      0.50         2
          15       0.83      0.56      0.67         9
          16       0.63      0.73      0.68        99
          17       0.50    

## 1_5. 의사 결정 나무

In [21]:
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
tree.fit(tfidfv, y_train)

DecisionTreeClassifier(max_depth=10, random_state=0)

In [22]:
predicted = tree.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6211041852181657


In [23]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.69      0.43      0.53       105
           2       0.75      0.45      0.56        20
           3       0.94      0.85      0.89       813
           4       0.40      0.89      0.55       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.16      0.28        25
          10       0.89      0.80      0.84        30
          11       0.58      0.60      0.59        83
          12       0.00      0.00      0.00        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.61      0.83      0.70        99
          17       0.00    

## 1_6. 랜덤포레스트

In [24]:
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(tfidfv, y_train)

RandomForestClassifier(n_estimators=5, random_state=0)

In [25]:
predicted = forest.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6544968833481746


In [26]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.25      0.58      0.35        12
           1       0.35      0.60      0.44       105
           2       0.32      0.40      0.36        20
           3       0.82      0.89      0.85       813
           4       0.62      0.84      0.71       474
           5       0.00      0.00      0.00         5
           6       0.67      0.43      0.52        14
           7       0.50      0.33      0.40         3
           8       0.51      0.47      0.49        38
           9       1.00      0.28      0.44        25
          10       0.46      0.20      0.28        30
          11       0.56      0.64      0.60        83
          12       0.40      0.15      0.22        13
          13       0.33      0.16      0.22        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.59      0.46      0.52        99
          17       0.00    

## 1_7. 그래디언트부스팅

In [27]:
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

GradientBoostingClassifier(random_state=0)

In [28]:
predicted = grbt.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7702582368655387


In [29]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.60      0.50      0.55        12
           1       0.81      0.71      0.76       105
           2       0.58      0.70      0.64        20
           3       0.87      0.91      0.89       813
           4       0.78      0.86      0.82       474
           5       1.00      0.20      0.33         5
           6       0.77      0.71      0.74        14
           7       1.00      0.33      0.50         3
           8       0.60      0.63      0.62        38
           9       0.91      0.80      0.85        25
          10       0.79      0.77      0.78        30
          11       0.61      0.65      0.63        83
          12       0.50      0.46      0.48        13
          13       0.48      0.32      0.39        37
          14       0.00      0.00      0.00         2
          15       0.25      0.11      0.15         9
          16       0.72      0.71      0.71        99
          17       0.83    

## 1_8. 보팅

In [30]:
voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=10000)),
                             ('cb', ComplementNB()),
                             ('grbt',
                              GradientBoostingClassifier(random_state=0))],
                 n_jobs=-1, voting='soft')

In [31]:
predicted = voting_classifier.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.8187889581478184


In [32]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        12
           1       0.80      0.77      0.79       105
           2       0.67      0.80      0.73        20
           3       0.93      0.94      0.93       813
           4       0.82      0.88      0.85       474
           5       1.00      0.20      0.33         5
           6       0.87      0.93      0.90        14
           7       1.00      0.33      0.50         3
           8       0.69      0.71      0.70        38
           9       0.80      0.80      0.80        25
          10       0.90      0.90      0.90        30
          11       0.67      0.71      0.69        83
          12       0.60      0.46      0.52        13
          13       0.69      0.65      0.67        37
          14       0.29      1.00      0.44         2
          15       0.40      0.22      0.29         9
          16       0.73      0.76      0.74        99
          17       0.75    

***

# 2. 5000 단어 사용

In [33]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [34]:
decoded = []
for i in range(len(x_train)):
    t = ' '.join([index_to_word[index] for index in x_train[i]])
    decoded.append(t)

x_train = decoded

In [35]:
decoded = []
for i in range(len(x_test)):
    t = ' '.join([index_to_word[index] for index in x_test[i]])
    decoded.append(t)

x_test = decoded

In [36]:
x_train_dtm = dtmvector.fit_transform(x_train)
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

In [37]:
x_test_dtm = dtmvector.transform(x_test)
tfidfv_test = tfidf_transformer.transform(x_test_dtm)

## 2_1. 나이브 베이즈 분류기

In [38]:
model = MultinomialNB()
model.fit(tfidfv, y_train)

MultinomialNB()

In [39]:
predicted = model.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6731967943009796


In [40]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.50      0.80      0.62       105
           2       0.00      0.00      0.00        20
           3       0.86      0.89      0.87       813
           4       0.59      0.95      0.73       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.28      0.44        25
          10       0.00      0.00      0.00        30
          11       0.48      0.73      0.58        83
          12       0.00      0.00      0.00        13
          13       1.00      0.14      0.24        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.60      0.66      0.62        99
          17       0.00    

## 2_2. 컴플리먼트 나이브 베이즈 분류기

In [41]:
cb = ComplementNB()
cb.fit(tfidfv, y_train)

ComplementNB()

In [42]:
predicted = cb.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7707034728406055


In [43]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.58      0.70        12
           1       0.63      0.86      0.73       105
           2       0.91      0.50      0.65        20
           3       0.91      0.89      0.90       813
           4       0.74      0.92      0.82       474
           5       0.00      0.00      0.00         5
           6       0.86      0.86      0.86        14
           7       1.00      0.67      0.80         3
           8       0.57      0.21      0.31        38
           9       0.82      0.92      0.87        25
          10       0.96      0.80      0.87        30
          11       0.54      0.76      0.63        83
          12       0.00      0.00      0.00        13
          13       0.69      0.59      0.64        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.67      0.79      0.72        99
          17       0.00    

## 2_3. 로지스틱 회귀

In [44]:
lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=10000)

In [45]:
predicted = lr.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.8058771148708815


In [46]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.77      0.80      0.79       105
           2       0.74      0.85      0.79        20
           3       0.91      0.93      0.92       813
           4       0.81      0.87      0.84       474
           5       0.00      0.00      0.00         5
           6       0.92      0.86      0.89        14
           7       1.00      0.67      0.80         3
           8       0.64      0.74      0.68        38
           9       0.81      0.88      0.85        25
          10       0.93      0.87      0.90        30
          11       0.64      0.73      0.68        83
          12       0.57      0.31      0.40        13
          13       0.64      0.62      0.63        37
          14       0.50      0.50      0.50         2
          15       0.83      0.56      0.67         9
          16       0.67      0.73      0.70        99
          17       0.82    

## 2_4. 선형 서포트 백터 머신

In [47]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)



LinearSVC(C=1000, dual=False, max_iter=500, penalty='l1')

In [48]:
predicted = lsvc.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7653606411398041


In [49]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.73      0.69      0.71       105
           2       0.76      0.80      0.78        20
           3       0.89      0.90      0.90       813
           4       0.80      0.84      0.82       474
           5       0.00      0.00      0.00         5
           6       0.87      0.93      0.90        14
           7       0.50      0.33      0.40         3
           8       0.51      0.66      0.57        38
           9       0.84      0.84      0.84        25
          10       0.80      0.80      0.80        30
          11       0.62      0.76      0.68        83
          12       0.44      0.31      0.36        13
          13       0.50      0.59      0.54        37
          14       0.50      0.50      0.50         2
          15       0.50      0.11      0.18         9
          16       0.65      0.63      0.64        99
          17       0.71    

## 2_5. 의사 결정 나무

In [50]:
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
tree.fit(tfidfv, y_train)

DecisionTreeClassifier(max_depth=10, random_state=0)

In [51]:
predicted = tree.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6179875333926982


In [52]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.72      0.40      0.52       105
           2       0.60      0.45      0.51        20
           3       0.94      0.84      0.89       813
           4       0.39      0.91      0.55       474
           5       0.00      0.00      0.00         5
           6       1.00      0.57      0.73        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.88      0.88      0.88        25
          10       0.87      0.87      0.87        30
          11       0.62      0.48      0.54        83
          12       0.17      0.08      0.11        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.60      0.82      0.69        99
          17       0.00    

## 2_6. 랜덤포레스트

In [53]:
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(tfidfv, y_train)

RandomForestClassifier(n_estimators=5, random_state=0)

In [54]:
predicted = forest.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.701246660730187


In [55]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.28      0.42      0.33        12
           1       0.42      0.78      0.55       105
           2       0.44      0.35      0.39        20
           3       0.84      0.90      0.87       813
           4       0.68      0.84      0.75       474
           5       0.00      0.00      0.00         5
           6       0.86      0.43      0.57        14
           7       1.00      0.33      0.50         3
           8       0.59      0.53      0.56        38
           9       0.71      0.40      0.51        25
          10       0.89      0.53      0.67        30
          11       0.57      0.69      0.62        83
          12       0.33      0.15      0.21        13
          13       0.46      0.32      0.38        37
          14       0.00      0.00      0.00         2
          15       1.00      0.11      0.20         9
          16       0.70      0.67      0.68        99
          17       0.00    

## 2_7. 그래디언트부스팅

In [56]:
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

GradientBoostingClassifier(random_state=0)

In [57]:
predicted = grbt.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.767586821015138


In [58]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.80      0.68      0.73       105
           2       0.70      0.70      0.70        20
           3       0.90      0.90      0.90       813
           4       0.76      0.83      0.79       474
           5       0.14      0.20      0.17         5
           6       0.93      0.93      0.93        14
           7       0.50      0.33      0.40         3
           8       0.64      0.66      0.65        38
           9       0.91      0.84      0.87        25
          10       0.87      0.87      0.87        30
          11       0.62      0.66      0.64        83
          12       0.46      0.46      0.46        13
          13       0.55      0.43      0.48        37
          14       0.08      0.50      0.14         2
          15       0.33      0.22      0.27         9
          16       0.72      0.77      0.75        99
          17       0.33    

## 2_8. 보팅

In [59]:
voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=10000)),
                             ('cb', ComplementNB()),
                             ('grbt',
                              GradientBoostingClassifier(random_state=0))],
                 n_jobs=-1, voting='soft')

In [60]:
predicted = voting_classifier.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.8161175422974176


In [61]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.80      0.77      0.79       105
           2       0.71      0.85      0.77        20
           3       0.92      0.94      0.93       813
           4       0.82      0.88      0.85       474
           5       0.33      0.20      0.25         5
           6       0.93      0.93      0.93        14
           7       0.67      0.67      0.67         3
           8       0.72      0.68      0.70        38
           9       0.81      0.84      0.82        25
          10       0.93      0.90      0.92        30
          11       0.67      0.70      0.68        83
          12       0.60      0.46      0.52        13
          13       0.68      0.62      0.65        37
          14       0.12      0.50      0.20         2
          15       0.67      0.44      0.53         9
          16       0.74      0.74      0.74        99
          17       0.57    

***

# 3. 10000 단어 사용

In [62]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [63]:
decoded = []
for i in range(len(x_train)):
    t = ' '.join([index_to_word[index] for index in x_train[i]])
    decoded.append(t)

x_train = decoded

In [64]:
decoded = []
for i in range(len(x_test)):
    t = ' '.join([index_to_word[index] for index in x_test[i]])
    decoded.append(t)

x_test = decoded

In [65]:
x_train_dtm = dtmvector.fit_transform(x_train)
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)

In [66]:
x_test_dtm = dtmvector.transform(x_test)
tfidfv_test = tfidf_transformer.transform(x_test_dtm)

## 3_1. 나이브 베이즈 분류기

In [67]:
model = MultinomialNB()
model.fit(tfidfv, y_train)

MultinomialNB()

In [68]:
predicted = model.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6567230632235085


In [69]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.62      0.69      0.65       105
           2       0.00      0.00      0.00        20
           3       0.81      0.90      0.85       813
           4       0.51      0.96      0.67       474
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       1.00      0.08      0.15        25
          10       0.00      0.00      0.00        30
          11       0.66      0.63      0.64        83
          12       0.00      0.00      0.00        13
          13       1.00      0.03      0.05        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.69      0.56      0.61        99
          17       0.00    

## 3_2. 컴플리먼트 나이브 베이즈 분류기

In [70]:
cb = ComplementNB()
cb.fit(tfidfv, y_train)

ComplementNB()

In [71]:
predicted = cb.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7707034728406055


In [72]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.64      0.88      0.74       105
           2       0.91      0.50      0.65        20
           3       0.91      0.89      0.90       813
           4       0.75      0.92      0.83       474
           5       0.00      0.00      0.00         5
           6       0.93      0.93      0.93        14
           7       1.00      0.67      0.80         3
           8       0.50      0.13      0.21        38
           9       0.82      0.92      0.87        25
          10       0.96      0.80      0.87        30
          11       0.55      0.73      0.63        83
          12       0.00      0.00      0.00        13
          13       0.58      0.59      0.59        37
          14       0.00      0.00      0.00         2
          15       0.50      0.11      0.18         9
          16       0.67      0.79      0.73        99
          17       0.00    

## 3_3. 로지스틱 회귀

In [73]:
lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(tfidfv, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=10000)

In [74]:
predicted = lr.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.8076580587711487


In [75]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      0.67      0.76        12
           1       0.75      0.78      0.76       105
           2       0.74      0.85      0.79        20
           3       0.92      0.93      0.93       813
           4       0.81      0.87      0.84       474
           5       0.00      0.00      0.00         5
           6       0.92      0.86      0.89        14
           7       1.00      0.67      0.80         3
           8       0.68      0.71      0.69        38
           9       0.81      0.84      0.82        25
          10       0.93      0.87      0.90        30
          11       0.64      0.73      0.68        83
          12       0.57      0.31      0.40        13
          13       0.59      0.59      0.59        37
          14       0.50      0.50      0.50         2
          15       0.67      0.44      0.53         9
          16       0.68      0.75      0.71        99
          17       0.75    

## 3_4. 선형 서포트 백터 머신

In [76]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(tfidfv, y_train)



LinearSVC(C=1000, dual=False, max_iter=500, penalty='l1')

In [77]:
predicted = lsvc.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7796081923419412


In [78]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.75      0.75      0.75        12
           1       0.68      0.70      0.69       105
           2       0.72      0.65      0.68        20
           3       0.91      0.93      0.92       813
           4       0.80      0.85      0.82       474
           5       0.00      0.00      0.00         5
           6       0.86      0.86      0.86        14
           7       0.33      0.33      0.33         3
           8       0.61      0.66      0.63        38
           9       0.88      0.84      0.86        25
          10       0.93      0.83      0.88        30
          11       0.63      0.77      0.69        83
          12       0.60      0.46      0.52        13
          13       0.49      0.46      0.47        37
          14       0.50      0.50      0.50         2
          15       0.71      0.56      0.63         9
          16       0.63      0.70      0.66        99
          17       0.60    

## 3_5. 의사 결정 나무

In [79]:
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
tree.fit(tfidfv, y_train)

DecisionTreeClassifier(max_depth=10, random_state=0)

In [80]:
predicted = tree.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.6202137132680321


In [81]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.72      0.42      0.53       105
           2       0.62      0.50      0.56        20
           3       0.93      0.83      0.88       813
           4       0.40      0.90      0.56       474
           5       0.00      0.00      0.00         5
           6       0.90      0.64      0.75        14
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00        38
           9       0.88      0.88      0.88        25
          10       0.85      0.77      0.81        30
          11       0.64      0.51      0.56        83
          12       0.14      0.08      0.10        13
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.59      0.84      0.69        99
          17       0.00    

## 3_6. 랜덤포레스트

In [82]:
forest = RandomForestClassifier(n_estimators=5, random_state=0)
forest.fit(tfidfv, y_train)

RandomForestClassifier(n_estimators=5, random_state=0)

In [83]:
predicted = forest.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.674087266251113


In [84]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.22      0.33      0.27        12
           1       0.45      0.77      0.57       105
           2       0.30      0.30      0.30        20
           3       0.82      0.90      0.86       813
           4       0.61      0.83      0.70       474
           5       0.00      0.00      0.00         5
           6       0.67      0.43      0.52        14
           7       0.50      0.33      0.40         3
           8       0.67      0.53      0.59        38
           9       0.70      0.28      0.40        25
          10       0.75      0.30      0.43        30
          11       0.55      0.59      0.57        83
          12       0.40      0.15      0.22        13
          13       0.37      0.19      0.25        37
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         9
          16       0.59      0.59      0.59        99
          17       0.00    

## 3_7. 그래디언트부스팅

In [85]:
grbt = GradientBoostingClassifier(random_state=0)
grbt.fit(tfidfv, y_train)

GradientBoostingClassifier(random_state=0)

In [86]:
predicted = grbt.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.7666963490650045


In [87]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78        12
           1       0.77      0.68      0.72       105
           2       0.78      0.70      0.74        20
           3       0.88      0.91      0.89       813
           4       0.76      0.83      0.79       474
           5       0.50      0.20      0.29         5
           6       0.80      0.86      0.83        14
           7       1.00      0.33      0.50         3
           8       0.64      0.66      0.65        38
           9       0.74      0.80      0.77        25
          10       0.90      0.87      0.88        30
          11       0.63      0.64      0.63        83
          12       0.33      0.46      0.39        13
          13       0.62      0.49      0.55        37
          14       0.14      0.50      0.22         2
          15       0.38      0.33      0.35         9
          16       0.73      0.73      0.73        99
          17       0.27    

## 3_8. 보팅

In [88]:
voting_classifier = VotingClassifier(estimators=[
         ('lr', LogisticRegression(C=10000, penalty='l2')),
        ('cb', ComplementNB()),
        ('grbt', GradientBoostingClassifier(random_state=0))
], voting='soft', n_jobs=-1)
voting_classifier.fit(tfidfv, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=10000)),
                             ('cb', ComplementNB()),
                             ('grbt',
                              GradientBoostingClassifier(random_state=0))],
                 n_jobs=-1, voting='soft')

In [89]:
predicted = voting_classifier.predict(tfidfv_test)
print("정확도:", accuracy_score(y_test, predicted))

정확도: 0.8116651825467498


In [90]:
print(classification_report(y_test, predicted, zero_division=0))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.77      0.74      0.76       105
           2       0.73      0.80      0.76        20
           3       0.92      0.94      0.93       813
           4       0.83      0.88      0.85       474
           5       1.00      0.20      0.33         5
           6       0.86      0.86      0.86        14
           7       1.00      0.67      0.80         3
           8       0.70      0.68      0.69        38
           9       0.81      0.84      0.82        25
          10       0.93      0.90      0.92        30
          11       0.65      0.69      0.67        83
          12       0.46      0.46      0.46        13
          13       0.68      0.62      0.65        37
          14       0.14      0.50      0.22         2
          15       0.57      0.44      0.50         9
          16       0.72      0.75      0.73        99
          17       0.53    

***

# 종합 결과

# 1. 모든 단어 사용

|머신러닝 기법|accuracy|macro avg F1|weighted avg F1|
|:----|----|----|----|
|나이브 베이즈 분류기|0.60|0.07|0.50|
|컴플리먼트 나이브 베이즈 분류기|0.76|0.46|0.73|
|로지스틱 회귀|0.81|0.67|0.81|
|선형 서포트 벡터 머신|0.79|0.61|0.78|
|의사 결정 나무|0.62|0.15|0.58|
|랜덤포레스트|0.65|0.28|0.62|
|그래디언트부스팅|0.77|0.57|0.76|
|보팅|0.82|0.66|0.81|

# 2. 5000 단어 사용

|머신러닝 기법|accuracy|macro avg F1|weighted avg F1|
|:----|----|----|----|
|나이브 베이즈 분류기|0.67|0.11|0.60|
|컴플리먼트 나이브 베이즈 분류기|0.77|0.48|0.75|
|로지스틱 회귀|0.81|0.64|0.80|
|선형 서포트 벡터 머신|0.77|0.58|0.76|
|의사 결정 나무|0.62|0.18|0.57|
|랜덤포레스트|0.70|0.36|0.68|
|그래디언트부스팅|0.77|0.58|0.77|
|보팅|0.82|0.66|0.81|

# 3. 10000 단어 사용

|머신러닝 기법|accuracy|macro avg F1|weighted avg F1|
|:----|----|----|----|
|나이브 베이즈 분류기|0.66|0.10|0.58|
|컴플리먼트 나이브 베이즈 분류기|0.77|0.48|0.75|
|로지스틱 회귀|0.81|0.64|0.80|
|선형 서포트 벡터 머신|0.78|0.59|0.77|
|의사 결정 나무|0.62|0.18|0.58|
|랜덤포레스트|0.67|0.31|0.64|
|그래디언트부스팅|0.77|0.58|0.76|
|보팅|0.81|0.66|0.81|

로지스틱 회귀와 보팅이 세 가지 경우 모두 가장 높게 나왔습니다.  
미묘하게 보팅이 성능이 더 좋게 나왔지만, 연산에 시간이 많이 걸리는 만큼 일말의 정확도 차이가 중요한 문제가 아니라면 가능하면 로지스틱 회귀가 더 현실적인 기법이라 생각됩니다.  
전반적으로 단어 개수 차이는 크게 반영되지 않는 것으로 보입니다.  
다만 나이브 베이즈 분류기와 랜덤포레스트에서 단어가 적을 수록 성능이 상승하였습니다.  
두 기법 외의 경우에는 큰 성능차이를 보여주지 않지만 연산 속도를 위하여 5000 단어를 사용하는 것이 바람직해보입니다.  

## 최종 선정 : 5000단어 + 로지스틱 회귀
### accuracy : 0.81
### macro avg F1 : 0.64
### weighted avg F1 : 0.80

***

# LSTM 적용

In [5]:
import tensorflow as tf
from tensorflow import keras

In [6]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=5000, test_split=0.2)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [7]:
total_data_text = list(x_train) + list(x_test)
num_tokens = [len(tokens) for tokens in total_data_text]
num_tokens = np.array(num_tokens)
print('기사의 최대 길이 :', np.max(num_tokens))
print('기사의 평균 길이 :', np.mean(num_tokens))
print('기사의 표준편차 : ', np.std(num_tokens))

기사의 최대 길이 : 2376
기사의 평균 길이 : 145.96419665122906
기사의 표준편차 :  145.8784764459447


In [8]:
max_tokens = np.mean(num_tokens) + 3 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)
print('전체 기사의 {}%가 maxlen 설정값 이내에 포함됩니다. '.format(np.sum(num_tokens < max_tokens) / len(num_tokens)))

pad_sequences maxlen :  583
전체 기사의 0.9730138938368365%가 maxlen 설정값 이내에 포함됩니다. 


In [9]:
x_train = keras.preprocessing.sequence.pad_sequences(x_train, padding='pre', maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, padding='pre', maxlen=maxlen)

In [15]:
word_vector_dim = 16

model = keras.Sequential()
model.add(keras.layers.Embedding(5000, word_vector_dim, input_shape=(None,)))
model.add(keras.layers.LSTM(512))
model.add(keras.layers.Dense(512, activation='relu'))
model.add(keras.layers.Dense(46, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          80000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               1083392   
_________________________________________________________________
dense_4 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_5 (Dense)              (None, 46)                23598     
Total params: 1,449,646
Trainable params: 1,449,646
Non-trainable params: 0
_________________________________________________________________


In [16]:
from sklearn.model_selection import train_test_split

x_train_split, x_train_val, y_train_split, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 12)
print(x_train_split.shape)
print(y_train_split.shape)

(7185, 583)
(7185,)


In [18]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])
history = model.fit(x_train_split, y_train_split, epochs=20, batch_size=64, validation_data=(x_train_val, y_val), verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [19]:
results = model.evaluate(x_test,  y_test, verbose=2)

print(results)

71/71 - 6s - loss: 1.5179 - accuracy: 0.6282
[1.5179413557052612, 0.628227949142456]


# 로직스틱 회귀 : 0.81
# LSTM : 0.63
LSTM이 더 낮은 accuracy를 보여줍니다.    
로이터 기사 길이가 너무 길어서 LSTM이 힘을 못 쓴 것으로 보입니다. (평균 길이 : 145, max_len : 583)  
차라리 트랜스포머를 사용한다면 더 좋은 성능이 나올 것이라 생각됩니다.