In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
news = fetch_20newsgroups(subset='all')

In [4]:
len(news.data)

18846

In [5]:
news.data[0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [6]:
news.target

array([10,  3, 17, ...,  3,  1,  7])

In [26]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
x_train,x_test,y_train,y_test = train_test_split(news.data,news.target)

# CountVectorizer词袋模型

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
texts = ["dog cat fish",
         "dog cat cat",
         "fish bird",
         "bird"]
cv = CountVectorizer()
cv_fit = cv.fit_transform(texts)

In [13]:
print(cv.get_feature_names())

['bird', 'cat', 'dog', 'fish']


In [14]:
print(cv_fit.toarray())

[[0 1 1 1]
 [0 2 1 0]
 [1 0 0 1]
 [1 0 0 0]]


In [19]:
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

cv = CountVectorizer()
cv_data = cv.fit_transform(x_train)

In [17]:
cv_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
len(cv.get_feature_names())

151186

In [40]:
nb1 = MultinomialNB()

scores = model_selection.cross_val_score(nb1, cv_data, y_train, cv=3)
print(scores.mean())

0.821849087108


# TfidfVectorizer 计算tfidf

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
texts = ["The quick brown fox jumped over the lazy dog",
        "The dog"
        "The fox"]

tfidf = TfidfVectorizer()
tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [32]:
print(tfidf.vocabulary_)

{'brown': 0, 'fox': 3, 'dogthe': 2, 'jumped': 4, 'quick': 7, 'lazy': 5, 'the': 8, 'over': 6, 'dog': 1}


In [33]:
print(tfidf.idf_)

[ 1.40546511  1.40546511  1.40546511  1.          1.40546511  1.40546511
  1.40546511  1.40546511  1.        ]


In [36]:
vector = tfidf.transform([texts[0]])

In [37]:
vector.shape

(1, 9)

In [38]:
vector.toarray()

array([[ 0.342369  ,  0.342369  ,  0.        ,  0.24359836,  0.342369  ,
         0.342369  ,  0.342369  ,  0.342369  ,  0.48719673]])

In [39]:
# tfidf处理新闻数据
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(x_train)

nb1 = MultinomialNB()
scores = model_selection.cross_val_score(nb1, tfidf_train, y_train, cv=3)
print(scores.mean())

0.820926199428


In [42]:
# 添加停用词
def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt','r').readlines():
        result.add(line.strip())
    return result

# 加载停用词
stop_words = get_stop_words()

tfidf = TfidfVectorizer(stop_words=stop_words)
tfidf_train = tfidf.fit_transform(x_train)

nb1 = MultinomialNB()
scores = model_selection.cross_val_score(nb1, tfidf_train, y_train, cv=3)
print(scores.mean())

0.86543034546


In [47]:
# 调整参数
nb1 = MultinomialNB(alpha=0.01)
scores = model_selection.cross_val_score(nb1, tfidf_train, y_train, cv=3)
print(scores.mean())

0.899041341147


In [48]:
nb1.fit(tfidf_train, y_train)
nb1.score(tfidf_train, y_train)

0.99625017687844908

In [49]:
tfidf_test = tfidf.fit_transform(x_test)
nb1.score(tfidf_test, y_test)

ValueError: dimension mismatch

In [50]:
tfidf_train

<14134x150913 sparse matrix of type '<class 'numpy.float64'>'
	with 1555451 stored elements in Compressed Sparse Row format>

In [51]:
tfidf_test

<4712x73264 sparse matrix of type '<class 'numpy.float64'>'
	with 501935 stored elements in Compressed Sparse Row format>

In [52]:
tfidf_data = tfidf.fit_transform(news.data)
x_train,x_test,y_train,y_test = train_test_split(tfidf_data, news.target)

In [53]:
x_train

<14134x173489 sparse matrix of type '<class 'numpy.float64'>'
	with 1521883 stored elements in Compressed Sparse Row format>

In [54]:
x_test

<4712x173489 sparse matrix of type '<class 'numpy.float64'>'
	with 535503 stored elements in Compressed Sparse Row format>

In [55]:
nb1.fit(x_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [56]:
nb1.score(x_train, y_train)

0.99610867411914528

In [57]:
nb1.score(x_test,y_test)

0.91447368421052633

In [58]:
from sklearn import neighbors

In [59]:
knn = neighbors.KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [60]:
knn.score(x_train, y_train)

0.82368756190745718

In [61]:
knn.score(x_test, y_test)

0.7841680814940577

In [62]:
from sklearn import tree

In [63]:
dtree = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=4)
dtree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [64]:
dtree.score(x_test, y_test)

0.66702037351443122

In [65]:
dtree.score(x_train, y_train)

1.0

In [71]:
import numpy as np

x_train,x_test,y_train,y_test = train_test_split(news.data,news.target)
tfidf = TfidfVectorizer(stop_words=stop_words)
tfidf = tfidf.fit(news.data)
tfidf_train = tfidf.transform(x_train)
tfidf_test = tfidf.transform(x_test)

In [72]:
nb1.fit(tfidf_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [73]:
nb1.score(tfidf_train, y_train)

0.99625017687844908

In [74]:
nb1.score(tfidf_test, y_test)

0.91404923599320886

In [81]:
text = ["play football,play tennis"]
tfidf_text = tfidf.transform(text)
predict = nb1.predict(tfidf_text)

In [82]:
predict

array([6])

In [83]:
news.target_names[int(predict)]

'misc.forsale'