# 나이브 베이즈 분류기를 이용한 문서 분류2
## - CountVectorizer 대신 TfidfVectorizer로 특성 추출

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups: atheism, religion, graphics and space.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Load the train and test subsets with identical settings; headers, footers
# and quoted replies are stripped so only the post body text remains.
news_train, news_test = (
    fetch_20newsgroups(
        subset=subset,
        remove=('headers', 'footers', 'quotes'),
        categories=categories,
    )
    for subset in ('train', 'test')
)

# Quick summary of what was loaded.
print("Train size: ", len(news_train.data))
print("Test size: ", len(news_test.data))
print("Category: ", news_train.target_names)
print("Category value: ", set(news_train.target))

Train size:  2034
Test size:  1353
Category:  ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
Category value:  {0, 1, 2, 3}


In [2]:
# Bind the raw documents (X) and their integer class labels (y) for the
# predefined train and test subsets.
X_train, y_train = news_train.data, news_train.target
X_test, y_test = news_test.data, news_test.target

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Build TF-IDF features: keep at most 2000 terms, dropping terms that occur in
# fewer than 5 documents or in more than half of the documents.
tfidf = TfidfVectorizer(
    max_features=2000,
    min_df=5,
    max_df=0.5,
)

# Learn the vocabulary / IDF weights on the training set only, then reuse the
# fitted vectorizer to transform the test set.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a multinomial naive Bayes classifier on the TF-IDF features.
NB = MultinomialNB().fit(X_train_tfidf, y_train)

# {:.3f} formats the accuracy with three digits after the decimal point.
print("Train Accuracy: {:.3f}".format(NB.score(X_train_tfidf, y_train)))
print("Test Accuracy: {:.3f}".format(NB.score(X_test_tfidf, y_test)))

Train Accuracy: 0.862
Test Accuracy: 0.741


In [7]:
# 카테고리별로 영향이 큰 특성(단어) 10개 추출

import numpy as np

def top10_features(classifier, vectorizer, categories, top_n=10):
    """Print the highest-weighted vocabulary terms for each category.

    Args:
        classifier: Fitted model exposing a per-class weight matrix, either
            ``feature_log_prob_`` (naive Bayes models) or ``coef_`` (linear
            models). Row ``i`` is used for ``categories[i]``.
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.
        categories: Category names, in the same order as the weight rows.
        top_n: Number of terms to print per category (default 10).

    Raises:
        AttributeError: If the classifier exposes neither
            ``feature_log_prob_`` nor ``coef_``.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    # MultinomialNB no longer exposes ``coef_`` in recent scikit-learn
    # releases, so prefer ``feature_log_prob_`` and fall back to ``coef_``
    # for classifiers that only provide the latter.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_

    for i, category in enumerate(categories):
        # Negate the weights so argsort yields indices in descending order.
        top_idx = np.argsort(-np.asarray(weights[i]))[:top_n]
        top_terms = ", ".join(feature_names[top_idx])
        print(f"{category}: {top_terms}")
        
# Show the most influential terms per newsgroup for the fitted model.
top10_features(
    classifier=NB,
    vectorizer=tfidf,
    categories=news_train.target_names,
)

alt.atheism: you, not, are, be, this, have, as, what, they, if
comp.graphics: you, on, graphics, this, have, any, can, or, with, thanks
sci.space: space, on, you, be, was, this, as, they, have, are
talk.religion.misc: you, not, he, are, as, this, be, god, was, they


