In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl

In [3]:
print('start download data . . .')
remove = ()
categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
data_train = fetch_20newsgroups(subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=0,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test',
                                categories=categories,
                                shuffle=True,
                                random_state=0,
                                remove=remove)
print('data type',type(data_train.data))

start download data . . .
data type <class 'list'>


In [7]:
print('train data sample num:',len(data_train.data))
print('test data sample num:',len(data_test.data))
print('train set and test category name:',categories)
pprint(categories)

train data sample num: 2034
test data sample num: 1353
train set and test category name: ('alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space')
('alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space')


In [15]:
categories=data_train.target_names

In [16]:
y_train=data_train.target
y_test=data_test.target

In [17]:
y_train[:10]

array([0, 1, 1, 1, 3, 3, 3, 2, 1, 0], dtype=int64)

In [19]:
print('top 10 sample:')
for i in np.arange(10):
    print('%i category: %s' % (i,categories[y_train[i]]))
    print(data_train.data[i])
    print('-----------------------')

top 10 sample:
0 category: alt.atheism
From: healta@saturn.wwc.edu (Tammy R Healy)
Subject: Re: note to Bobby M.
Lines: 52
Organization: Walla Walla College
Lines: 52

In article <1993Apr14.190904.21222@daffy.cs.wisc.edu> mccullou@snake2.cs.wisc.edu (Mark McCullough) writes:
>From: mccullou@snake2.cs.wisc.edu (Mark McCullough)
>Subject: Re: note to Bobby M.
>Date: Wed, 14 Apr 1993 19:09:04 GMT
>In article <1993Apr14.131548.15938@monu6.cc.monash.edu.au> darice@yoyo.cc.monash.edu.au (Fred Rice) writes:
>>In <madhausC5CKIp.21H@netcom.com> madhaus@netcom.com (Maddi Hausmann) writes:
>>
>>>Mark, how much do you *REALLY* know about vegetarian diets?
>>>The problem is not "some" B-vitamins, it's balancing proteins.  
>>>There is also one vitamin that cannot be obtained from non-animal
>>>products, and this is only of concern to VEGANS, who eat no
>>>meat, dairy, or eggs.  I believe it is B12, and it is the only
>>>problem.  Supplements are available for vegans; yes, the B12
>>>does come from 

In [26]:
vectorizer = TfidfVectorizer(input='content',
                             stop_words='english',
                             max_df=0.5,
                             sublinear_tf=True)
x_train=vectorizer.fit_transform(data_train.data)
x_test=vectorizer.transform(data_test.data)
print('train sample num :%d  feature num : %d' % x_train.shape)
print('stop word num: ',len(vectorizer.get_stop_words()))
pprint(vectorizer.get_stop_words())

train sample num :2034  feature num : 33809
stop word num:  318
frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being

In [28]:
feature_names = np.asarray(vectorizer.get_feature_names())

33809

In [29]:
def test_clf(clf):
    print('clf : ',clf)
    model=GridSearchCV(clf,param_grid={},cv=5)
    if hasattr(clf,'alpha'):
        alpha_can = np.logspace(-3, 2, 10)
        m=alpha_can.size
    if hasattr(clf, 'n_neighbors'):
        neighbors_can = np.arange(1, 15)
        model.set_params(param_grid={'n_neighbors': neighbors_can})
        m = neighbors_can.size
    if hasattr(clf, 'C'):
        C_can = np.logspace(1, 3, 3)
        gamma_can = np.logspace(-3, 0, 3)
        model.set_params(param_grid={'C':C_can, 'gamma':gamma_can})
        m = C_can.size * gamma_can.size
    if hasattr(clf, 'max_depth'):
        max_depth_can = np.arange(4, 10)
        model.set_params(param_grid={'max_depth': max_depth_can})
        m = max_depth_can.size
    t_start = time()
    model.fit(x_train, y_train)
    t_end = time()
    t_train = (t_end - t_start) / (5*m)
    print('5 fold cv train time %.3f/(5*%d)=%.3f' %(
        (t_end-t_start),m,t_train))
    print('best params :',model.best_params_)
    t_start = time()
    y_hat = model.predict(x_test)
    t_end = time()
    t_test = t_end - t_start
    print('test time %.3f' % (t_test))
    acc=metrics.accuracy_score(y_test,y_hat)
    print('test acc %.2f%%' % (100*acc))
    name=str(clf).split('(')[0]
    index=name.find('Classifier')
    if index !=-1:
        name=name[:index]
    if name=='SVC':
        name='SVM'
    return t_train,t_test,1-acc,name

In [31]:
clfs = (MultinomialNB(),                # 0.87(0.017), 0.002, 90.39%
        BernoulliNB(),                  # 1.592(0.032), 0.010, 88.54%
        KNeighborsClassifier(),         # 19.737(0.282), 0.208, 86.03%
        RidgeClassifier(),              # 25.6(0.512), 0.003, 89.73%
        RandomForestClassifier(n_estimators=200),   # 59.319(1.977), 0.248, 77.01%
        SVC()                           # 236.59(5.258), 1.574, 90.10%
        )

In [32]:
result = []
for clf in clfs:
    a = test_clf(clf)
    result.append(a)
    print('\n')
result = np.array(result)
time_train, time_test, err, names = result.T
x = np.arange(len(time_train))

clf :  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
5 fold cv train time 0.082/(5*10)=0.002
best params : {}
test time 0.002
test acc 87.07%


clf :  BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
5 fold cv train time 0.116/(5*10)=0.002
best params : {}
test time 0.007
test acc 79.67%


clf :  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
5 fold cv train time 14.110/(5*14)=0.202
best params : {'n_neighbors': 3}
test time 0.145
test acc 86.03%


clf :  RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)
5 fold cv train time 1.759/(5*10)=0.035
best params : {}
test time 0.002
test acc 89.65%


clf :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='