In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Load the Reuters newswire dataset restricted to the 10,000 most frequent
# words, holding out 20% of the samples as the test split.
# NOTE(review): later cells record "Vocabulary Size": 20000 in `results`,
# which does not match num_words=10000 here -- confirm which one is intended.
train_data, test_data = reuters.load_data(num_words=10000, test_split=0.2)
x_train, y_train = train_data
x_test, y_test = test_data

In [2]:
# Report how many samples landed in the train and test splits.
print(f'훈련 샘플의 수: {len(x_train)}')
print(f'테스트 샘플의 수: {len(x_test)}')

훈련 샘플의 수: 8982
테스트 샘플의 수: 2246


### 데이터 복원

In [3]:
# Build the inverse vocabulary (integer id -> word). Ids are shifted by 3
# relative to the downloaded word index; ids 0-2 are assigned to the special
# tokens below.
word_index = reuters.get_word_index(path="reuters_word_index.json")

index_to_word = {}
for word, rank in word_index.items():
    index_to_word[rank + 3] = word

special_tokens = ("<pad>", "<sos>", "<unk>")
index_to_word.update(enumerate(special_tokens))

# Sanity check: decode the first training sample back into text.
first_sample_text = ' '.join(index_to_word[token_id] for token_id in x_train[0])
print(first_sample_text)

<sos> <unk> <unk> said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3


In [4]:
# Train data: convert each sequence of integer ids into a space-joined string.
#
# `x_train` is overwritten with the decoded text, so the conversion is guarded:
# if the cell is executed again, the samples are already strings and iterating
# them would yield single characters that are not keys of `index_to_word`
# (KeyError). In that case the decoding step is skipped.
already_decoded = len(x_train) > 0 and isinstance(x_train[0], str)
if not already_decoded:
    x_train = [
        ' '.join(index_to_word[index] for index in sequence)
        for sequence in x_train
    ]

print(len(x_train))


8982


In [5]:
# Test data: convert each sequence of integer ids into a space-joined string.
#
# `x_test` is overwritten with the decoded text, so the conversion is guarded:
# if the cell is executed again, the samples are already strings and iterating
# them would yield single characters that are not keys of `index_to_word`
# (KeyError). In that case the decoding step is skipped.
already_decoded = len(x_test) > 0 and isinstance(x_test[0], str)
if not already_decoded:
    x_test = [
        ' '.join(index_to_word[index] for index in sequence)
        for sequence in x_test
    ]

print(len(x_test))

2246


### Vectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Create both feature extractors up front, then fit them on the training text
# only: raw term counts first, TF-IDF re-weighting of those counts second.
dtmvector = CountVectorizer()
tfidf_transformer = TfidfTransformer()

x_train_dtm = dtmvector.fit_transform(x_train)          # document-term count matrix
tfidfv = tfidf_transformer.fit_transform(x_train_dtm)   # TF-IDF weighted matrix

In [7]:
# Test data: reuse the vectorizer and TF-IDF transformer that were fitted on
# the training text (transform only, no refitting) to produce the test
# document-term matrix and its TF-IDF weighted version.
x_test_dtm = dtmvector.transform(x_test)
tfidfv_test = tfidf_transformer.transform(x_test_dtm)

### model test

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts a seed uses random_state=0 for repeatability.
models = {
    "LogisticRegression": LogisticRegression(
        penalty='l2',
        max_iter=1000,
        random_state=0
    ),

    # probability=True makes predict_proba available, which the soft-voting
    # ensembles built from this entry require.
    "SVM": SVC(
        kernel='linear',
        probability=True,
        random_state=0
    ),

    "LinearSVC": LinearSVC(
        random_state=0
    ),

    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=0,
        n_jobs=-1
    ),

    "ComplementNB": ComplementNB(),

    # The target is multi-class, so the multi-class log loss ('mlogloss') is
    # used as the evaluation metric instead of the binary 'logloss'.
    # The deprecated `use_label_encoder` argument is no longer passed: recent
    # XGBoost releases ignore it and emit a warning.
    "XGBoost": XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=0,
        eval_metric='mlogloss'
    ),

    # NOTE(review): the recorded run in this notebook reports roughly 0.40
    # accuracy / 0.02 macro-F1 for this configuration, far below every other
    # model -- the settings need investigation before the score is trusted.
    "LightGBM": LGBMClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=0
    )
}



In [17]:
# Add a soft-voting ensemble built from four of the classifiers defined above.
# Each pair is (short alias used inside the ensemble, key in `models`).
voting_members = (
    ("lr", "LogisticRegression"),
    ("svm", "SVM"),
    ("cnb", "ComplementNB"),
    ("lgbm", "LightGBM"),
)

models["Voting"] = VotingClassifier(
    estimators=[(alias, models[key]) for alias, key in voting_members],
    voting="soft"
)

In [14]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

# Fit every classifier in `models` on the TF-IDF training matrix, score it on
# the TF-IDF test matrix, and collect one metrics row per model.
# NOTE(review): "Vocabulary Size" is hard-coded to 20000 although the data
# cell loads the dataset with num_words=10000 -- confirm which is correct.
results = []

for name, model in models.items():
    print(f"\n Training {name}")

    model.fit(tfidfv, y_train)
    y_pred = model.predict(tfidfv_test)

    # Accuracy plus macro-averaged F1 (every class weighted equally).
    acc, f1 = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average="macro"),
    )

    row = {"Vocabulary Size": 20000, "Model": name, "Accuracy": acc, "F1-Score": f1}
    results.append(row)

    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")


 Training LogisticRegression
Accuracy: 0.7956, F1: 0.4721

 Training SVM
Accuracy: 0.8219, F1: 0.6355

 Training LinearSVC
Accuracy: 0.8299, F1: 0.6808

 Training RandomForest
Accuracy: 0.7600, F1: 0.4445

 Training ComplementNB
Accuracy: 0.7707, F1: 0.4784

 Training XGBoost
Accuracy: 0.8090, F1: 0.6574

 Training LightGBM




Accuracy: 0.3998, F1: 0.0244

 Training Voting




Accuracy: 0.7155, F1: 0.3180




In [18]:
# Improved voting ensemble: LightGBM is left out and the remaining members
# are weighted. Each triple is (alias, key in `models`, voting weight).
voting_v2_members = (
    ("lr", "LogisticRegression", 2),
    ("svm", "SVM", 2),
    ("cnb", "ComplementNB", 1),
)

models["Voting_v2"] = VotingClassifier(
    estimators=[(alias, models[key]) for alias, key, _ in voting_v2_members],
    voting="soft",
    weights=[weight for _, _, weight in voting_v2_members]   # heavier weight on lr / svm
)

In [19]:

# Train the re-weighted voting ensemble and score it on the test set.
print("\n Training Voting_v2")

model = models["Voting_v2"]
y_pred = model.fit(tfidfv, y_train).predict(tfidfv_test)

acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="macro"),
)

print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")


 Training Voting_v2
Accuracy: 0.8286, F1: 0.6557


In [20]:
# Add a row for Voting_v2 using the current `acc` / `f1` values.
# Note: running this cell again appends another identical row.
voting_v2_row = {
    "Vocabulary Size": 20000,
    "Model": "Voting_v2",
    "Accuracy": acc,
    "F1-Score": f1,
}
results.append(voting_v2_row)

### dense test

In [21]:
# Prepare inputs for the dense network: convert both sparse TF-IDF matrices
# into dense arrays.
X_train_dense, X_test_dense = (
    sparse_matrix.toarray() for sparse_matrix in (tfidfv, tfidfv_test)
)

In [22]:
import tensorflow as tf
# NOTE(review): this import rebinds the name `models`, which earlier cells use
# for the dict of scikit-learn classifiers. Once this cell has run, lookups
# such as models["Voting_v2"] fail until the cell that builds the dict is
# executed again.
from tensorflow.keras import layers, models

# Input width of the network: number of columns of the dense TF-IDF matrix.
n_features = X_train_dense.shape[1]
# Size of the softmax output layer.
# presumably the number of distinct topic labels in y_train -- TODO confirm
n_classes = 46

In [23]:
# Feed-forward classifier over the dense TF-IDF features: two ReLU hidden
# layers (256 and 128 units), each followed by 50% dropout, and a softmax
# output with one unit per class.
#
# The model class is reached through `tf.keras` rather than the bare name
# `models`, because this notebook also uses `models` for the dict of
# scikit-learn classifiers; the cell therefore works regardless of which of
# the two objects `models` currently refers to.
dense_model = tf.keras.Sequential([
    layers.Input(shape=(n_features,)),

    layers.Dense(256, activation="relu"),
    layers.Dropout(0.5),

    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),

    layers.Dense(n_classes, activation="softmax")
])

In [24]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# loss, and accuracy as the tracked metric.
compile_options = {
    "optimizer": "adam",
    "loss": "sparse_categorical_crossentropy",
    "metrics": ["accuracy"],
}
dense_model.compile(**compile_options)

In [26]:
# Train the dense model, holding out 20% of the training data for validation.
# NOTE(review): the output saved under this cell shows "Epoch 1/10", which
# does not match epochs=20 -- the stored log appears to come from a different
# run of this cell; re-execute from a fresh kernel before citing the numbers.
fit_options = {
    "validation_split": 0.2,
    "epochs": 20,
    "batch_size": 256,
    "verbose": 1,
}
history = dense_model.fit(X_train_dense, y_train, **fit_options)

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8778 - loss: 0.5107 - val_accuracy: 0.7991 - val_loss: 0.8943
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8884 - loss: 0.4535 - val_accuracy: 0.8036 - val_loss: 0.8927
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8980 - loss: 0.4112 - val_accuracy: 0.8080 - val_loss: 0.8920
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9094 - loss: 0.3666 - val_accuracy: 0.8136 - val_loss: 0.8987
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9182 - loss: 0.3351 - val_accuracy: 0.8130 - val_loss: 0.9079
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9253 - loss: 0.3024 - val_accuracy: 0.8147 - val_loss: 0.9113
Epoch 7/10
[1m29/29[0m [32m━━━━

In [28]:
import numpy as np
# Predict class probabilities for the test set and pick the highest-scoring
# class for each sample.
y_pred_prob = dense_model.predict(X_test_dense)
y_pred = y_pred_prob.argmax(axis=1)

# Same two metrics as for the other models: accuracy and macro-averaged F1.
acc, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="macro"),
)

print(*(np.round(score, 4) for score in (acc, f1)))

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step
0.7992 0.5824


In [29]:
# Add a row for the dense network using the current `acc` / `f1` values.
# Note: running this cell again appends another identical row.
dense_row = {
    "Vocabulary Size": 20000,
    "Model": "Dense",
    "Accuracy": acc,
    "F1-Score": f1,
}
results.append(dense_row)

In [30]:
# Display the accumulated metric rows (one dict per evaluated model).
results

[{'Vocabulary Size': 20000,
  'Model': 'LogisticRegression',
  'Accuracy': 0.7956366874443455,
  'F1-Score': 0.4721138497182076},
 {'Vocabulary Size': 20000,
  'Model': 'SVM',
  'Accuracy': 0.8219056099732859,
  'F1-Score': 0.6355105649296497},
 {'Vocabulary Size': 20000,
  'Model': 'LinearSVC',
  'Accuracy': 0.8299198575244879,
  'F1-Score': 0.6808429075185041},
 {'Vocabulary Size': 20000,
  'Model': 'RandomForest',
  'Accuracy': 0.7600178094390027,
  'F1-Score': 0.44449233566244767},
 {'Vocabulary Size': 20000,
  'Model': 'ComplementNB',
  'Accuracy': 0.7707034728406055,
  'F1-Score': 0.47835757543524193},
 {'Vocabulary Size': 20000,
  'Model': 'XGBoost',
  'Accuracy': 0.808993766696349,
  'F1-Score': 0.6574499256270908},
 {'Vocabulary Size': 20000,
  'Model': 'LightGBM',
  'Accuracy': 0.39982190560997327,
  'F1-Score': 0.02439743751125569},
 {'Vocabulary Size': 20000,
  'Model': 'Voting',
  'Accuracy': 0.7154942119323241,
  'F1-Score': 0.3180114666330721},
 {'Vocabulary Size': 20000