In [8]:
# 필요한 라이브러리 임포트
import numpy as np
from tensorflow.keras.datasets import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# 데이터 준비 함수 정의
def load_and_prepare_data(num_words):
    # 로이터 데이터 로드
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)
    
    # 단어 인덱스 로드 및 디코딩
    word_index = reuters.get_word_index()
    reverse_word_index = {value: key for key, value in word_index.items()}
    
    # 텍스트 변환
    x_train_text = [" ".join([reverse_word_index.get(i - 3, "?") for i in x]) for x in x_train]
    x_test_text = [" ".join([reverse_word_index.get(i - 3, "?") for i in x]) for x in x_test]
    
    return x_train_text, x_test_text, y_train, y_test

# 단어 수 리스트
num_words_list = [None, 5000, 1000, 7000, 3000, 500]  # 모든 단어, 상위 5000개, 상위 1000개

# 모델 정의
models = {
    "Naive Bayes": MultinomialNB(),
    "Complement NB": ComplementNB(),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "SVM": SVC(kernel='linear'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=0),
}

# 성능 저장 변수
results = {}

# 데이터 로드 및 모델 학습
for num_words in num_words_list:
    # 데이터 준비
    x_train_text, x_test_text, y_train, y_test = load_and_prepare_data(num_words)
    
    # TF-IDF 변환
    vectorizer = TfidfVectorizer(max_features=num_words)
    x_train_tfidf = vectorizer.fit_transform(x_train_text)
    x_test_tfidf = vectorizer.transform(x_test_text)
    
    # 각 모델 학습 및 평가
    results[num_words] = {}
    for model_name, model in models.items():
        model.fit(x_train_tfidf, y_train)
        y_pred = model.predict(x_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        results[num_words][model_name] = accuracy

# 결과 정리
import pandas as pd
results_df = pd.DataFrame(results)

results_df


Unnamed: 0,NaN,5000.0,1000.0,7000.0,3000.0,500.0
Naive Bayes,0.599733,0.678094,0.707925,0.667409,0.691006,0.707035
Complement NB,0.764915,0.77382,0.738646,0.769813,0.764915,0.720392
Logistic Regression,0.79163,0.799199,0.794746,0.797418,0.800089,0.788513
SVM,0.822351,0.826358,0.815672,0.824132,0.825022,0.794746
Decision Tree,0.700356,0.700801,0.689225,0.709261,0.691897,0.685218
Random Forest,0.739537,0.769368,0.77382,0.758237,0.774265,0.772484
Gradient Boosting,0.749777,0.740873,0.715494,0.736866,0.742654,0.696349


In [9]:
# VotingClassifier 정의
logistic_reg = LogisticRegression(penalty='l2', random_state=0)
complement_nb = ComplementNB()
gradient_boosting = GradientBoostingClassifier(n_estimators=10, max_depth=3, random_state=0)

# Voting 앙상블 구성
voting_classifier = VotingClassifier(
    estimators=[
        ('logistic_reg', logistic_reg),
        ('complement_nb', complement_nb),
        ('gradient_boosting', gradient_boosting)
    ],
    voting='soft'
)

In [10]:
voting_classifier.fit(x_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
predicted = voting_classifier.predict(x_test_tfidf) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted)) #예측값과 실제값 비교

정확도: 0.7524487978628673
