In [3]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

print(len(newsgroups_train.data))
print(len(newsgroups_test.data))
print()
print(set(newsgroups_train.target_names))
print(set(newsgroups_train.target))

11314
7532

{'comp.sys.mac.hardware', 'sci.space', 'alt.atheism', 'comp.graphics', 'talk.politics.mideast', 'sci.med', 'rec.motorcycles', 'sci.crypt', 'comp.os.ms-windows.misc', 'rec.sport.hockey', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'soc.religion.christian', 'rec.autos', 'rec.sport.baseball', 'talk.politics.misc', 'talk.religion.misc', 'talk.politics.guns', 'misc.forsale', 'sci.electronics'}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


In [8]:
X_train = newsgroups_train.data
Y_train = newsgroups_train.target

X_test = newsgroups_test.data
Y_test = newsgroups_test.target

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000, min_df = 5, max_df=0.5)

X_train_cv = cv.fit_transform(X_train) # train set을 변환
print(X_train_cv.shape)
print()
X_test_cv = cv.transform(X_test) # test set을 변환
print(X_test_cv.shape)

(11314, 2000)

(7532, 2000)


In [9]:
for word, count in zip(
  cv.get_feature_names_out()[:100], X_train_cv.toarray()[0, :100]
):
  print(word, count)

00 0
000 0
01 0
02 0
03 0
04 0
05 0
06 0
0d 0
0t 0
10 0
100 0
11 0
12 0
128 0
13 0
14 0
145 0
15 0
150 0
16 0
17 0
18 0
19 0
1988 0
1989 0
1990 0
1991 0
1992 0
1993 0
1d9 0
1st 0
1t 0
20 0
200 0
21 0
22 0
23 0
24 0
25 0
250 0
256 0
26 0
27 0
28 0
29 0
2di 0
2nd 0
2tm 0
30 0
300 0
31 0
32 0
33 0
34 0
34u 0
35 0
36 0
37 0
38 0
386 0
39 0
3d 0
3l 0
3rd 0
3t 0
40 0
400 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
486 0
49 0
4t 0
50 0
500 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
5u 0
60 0
600 0
61 0
63 0
64 0
65 0
66 0
68 0
6ei 0


In [10]:
from sklearn.naive_bayes import MultinomialNB

#분류기 선언
NB_clf = MultinomialNB()

#train set을 이용해 분류기 (classifier) 학습
NB_clf.fit(X_train_cv, Y_train)

#train set 에 대한 예측 정확도 확인

print(NB_clf.score(X_train_cv, Y_train))
print(NB_clf.score(X_test_cv, Y_test))


0.6248011313417006
0.5031864046733935


In [11]:
print(X_test[0],Y_test[0])
print(X_test[1],Y_test[1])

print()

pred = NB_clf.predict(X_test_cv[:2])

print(pred)
print(
  newsgroups_train.target_names[pred[0]],
  newsgroups_train.target_names[pred[1]]
)

I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy. 7
I'm not familiar at all with the format of these "X-Face:" thingies, but
after seeing them in some folks' headers, I've *got* to *see* them (and
maybe make one of my own)!

I've got "dpg-view" on my Linux box (which displays "uncompressed X-Faces")
and I've managed to compile [un]compface too... but now that I'm *looking*
for them, I can't seem to find any X-Face:'s in anyones news headers!  :-(

Could you, would you, please send me your "X-Face:" header?

I *know* I'll probably get a little swamped, but I can handle it.

	...I hope

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

#CounterVectorizer와 동일한 인수 사용
tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train) # train set을 반환
X_test_tfidf = tfidf.transform(X_test) #test set을 반환

#tfidf train set을 이용해 분류기를 새로 학습
NB_clf.fit(X_train_tfidf, Y_train)

#train set에 대한 예측 정확도 확인
print(NB_clf.score(X_train_tfidf, Y_train))

#test set에 대한 예측 정확도 확인
print(NB_clf.score(X_test_tfidf, Y_test))

0.7247657769135584
0.5714285714285714


In [13]:
import numpy as np

def top10_features(classifier, vectorizer, categories):
  feature_names = np.asarray(vectorizer.get_feature_names())
  for i, category in enumerate(categories):
    #역순으로 정렬하기 위해 계수에 음수를 취해 정렬 후 앞에서부터 10개 값을 반환
    top10 = np.argsort(-classifier.coef_[i])[:10]
    
    #카테고리와 영향이 큰 특성 10개를 출력
    print("%s: %s" % (category, ", ".join(feature_names[top10])))
    
top10_features(NB_clf, tfidf, newsgroups_train.target_names)

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [15]:
from sklearn.linear_model import LogisticRegression

# count vector 에 대해 regression을 해서 NB와 비교
LR_clf = LogisticRegression()

#train data를 이용해 분류기 학습
LR_clf.fit(X_train_cv, Y_train)

#train data에 대한 예측 정확도
print(LR_clf.score(X_train_cv, Y_train))

#test data에 대한 예측 정확도
print(LR_clf.score(X_test_cv, Y_test))

0.9331801308113842
0.5128783855549655


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier() # 릿ㅂ지 분류기 선언
ridge_clf.fit(X_train_tfidf, Y_train) # train set을 이용해 분류기 학습

print(ridge_clf.score(X_train_tfidf, Y_train)) # train set에 대한 예측 정확도
print(ridge_clf.score(X_test_tfidf, Y_test)) # test set에 대한 예측 정확도

0.8129750751281598
0.5715613382899628


In [17]:
import numpy as np

from sklearn.model_selection import train_test_split

X_train_ridge, X_val_ridge, y_train_ridge, y_val_ridge = train_test_split(
  X_train_tfidf, Y_train, test_size=0.2, random_state=42
)

max_score = 0
max_alpha = 0

for alpha in np.arange(0.1, 10, 0.1) : #alpha를 0.1부터 10까지 0.1씩 증가
  ridge_clf = RidgeClassifier(alpha=alpha) #릿지 분류기 선언
  ridge_clf.fit(X_train_ridge, y_train_ridge) #학습
  
  #검정 데이터셋에 대해 정확도를 측정
  score = ridge_clf.score(X_val_ridge, y_val_ridge)
  if score > max_score:
    max_score = score
    max_alpha = alpha


print(max_alpha, max_score)

2.1 0.6394167034909413


In [18]:
ridge_clf = RidgeClassifier(alpha = 2.1)
ridge_clf.fit(X_train_tfidf, Y_train)

print(ridge_clf.score(X_train_tfidf, Y_train))
print(ridge_clf.score(X_test_tfidf, Y_test))

0.7983913735195334
0.5758098778544876


In [21]:
top10_features(ridge_clf, tfidf, newsgroups_train.target_names)

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [24]:
# Lasso는 동일한 Logistic Regression을 사용하면서 매개변수로 지정
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1)

lasso_clf.fit(X_train_tfidf, Y_train) # train data로 학습

print(lasso_clf.score(X_train_tfidf, Y_train)) # train data에 대한 예측 정확도
print(lasso_clf.score(X_test_tfidf, Y_test)) # test data에 대한 예측 정확도

#계수(coefficient)가 0이 아닌 것만 출력
print(np.sum(lasso_clf.coef_ != 0 ))
print(X_train_tfidf.shape[1])

0.6841965706204702
0.5497875730217737
2999
2000


In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

tree = DecisionTreeClassifier(random_state=7)
tree.fit(X_train_tfidf, Y_train)

print(tree.score(X_train_tfidf, Y_train))
print(tree.score(X_test_tfidf, Y_test))

forest = RandomForestClassifier(random_state=7)
forest.fit(X_train_tfidf, Y_train)
print(forest.score(X_train_tfidf, Y_train))
print(forest.score(X_test_tfidf, Y_test))

gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train_tfidf, Y_train)

print(gb.score(X_train_tfidf, Y_train))
print(gb.score(X_test_tfidf, Y_test))


0.9724235460491426
0.3856877323420074
0.9724235460491426
0.5339883165161976
0.8526604207176949
0.5155337227827934


In [26]:
sorted_feature_importances = sorted(
  zip(tfidf.get_feature_names_out(), gb.feature_importances_),
  key = lambda x:x[1],
  reverse=True
)

for feature, value in sorted_feature_importances[:40]:
  print("%s: %s" % (feature, value), end=', ')

god: 0.025809766377474817, windows: 0.025013955848858276, sale: 0.02454447306012166, bike: 0.023908116678280248, car: 0.021296532916442883, space: 0.019275009188592773, encryption: 0.019223838663201, hockey: 0.016838620487237527, gun: 0.01543072867889741, israel: 0.0151094571916706, mac: 0.012025547333926337, apple: 0.010931917122283245, clipper: 0.009809860691815418, window: 0.00958436709485705, graphics: 0.00939651873996847, team: 0.009216766601239941, baseball: 0.007619106003736491, banks: 0.007584668919983667, turkish: 0.007077320005812768, motif: 0.0069283677580132715, israeli: 0.006883188058373934, server: 0.006726055270159761, shipping: 0.006599223666254451, circuit: 0.006469524109890036, game: 0.0063933629054823745, jesus: 0.0058978036322069665, key: 0.005714170103936138, cars: 0.005656518428482097, widget: 0.005275802409356186, dod: 0.005265490762716681, nsa: 0.005085899669435628, orbit: 0.004981398761117598, guns: 0.004782315661657503, christians: 0.004717348043021744, atheis