In [2]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib auto
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

Using matplotlib backend: Qt5Agg


In [3]:
print('Start Download Data ! ! !')
t_start = time()
remove = ()
categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
data_train = fetch_20newsgroups(subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=5,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test',
                                categories=categories,
                                shuffle=True,
                                random_state=5,
                                remove=remove)
t_end=time()
print('download time : %.3f' % (t_end - t_start))
print('data type :',type(data_train))
print('train sample num :',len(data_train.data))
print('test sample num :',len(data_test.data))
print('category num:',len(categories))

Start Download Data ! ! !
download time : 0.448
data type : <class 'sklearn.utils.Bunch'>
train sample num : 2034
test sample num : 1353
category num: 4


In [4]:
categories=data_train.target_names
pprint(categories)
y_train=data_train.target
y_test=data_test.target

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


In [5]:
for i in np.arange(10):
    print('Email %d (belong to %s )' % (i+1,categories[y_train[i]]))
    print('-----------------------------data----------------------')
    print(data_train.data[i])
    print('=======================================================')
    print('\n\n')

Email 1 (belong to sci.space )
-----------------------------data----------------------
From: stephens@geod.emr.ca (Dave Stephenson)
Subject: Re: Clementine Science Team Selected
Nntp-Posting-Host: ngis.geod.emr.ca
Organization: Dept. of Energy, Mines, and Resources, Ottawa
Lines: 32

nickh@CS.CMU.EDU (Nick Haines) writes:

>In article <stephens.734792933@ngis> stephens@geod.emr.ca (Dave Stephenson) writes:

>   Remember the first government scientist in the British Empire was
>   the Astronomer Royal, who was paid [...] from the Department
>   of Ordinance Budget (i.e. the military). Flamsteed House (the original
>   RGO) was built out of Army Surplus Scrap ( A gate house at the Tower of
>   London ?), and paid for by the sale of time expired gunpowder [...]

>At the time, astronomy was vital to the military, in that navigation
>and cartography were of primary impoortance to the military, and good
>cartography was impossible without good astronomy.

>The relevance these daysis somewhat

In [6]:
vectorizer = TfidfVectorizer(input='content',
                             stop_words='english',
                             max_df=0.5,
                             sublinear_tf=True)
x_train=vectorizer.fit_transform(data_train.data)
x_test=vectorizer.transform(data_test.data)
print('train set sample num :%d ,feature num : %d' %x_train.shape)
print('stop words : \n',vectorizer.get_stop_words())
feature_names=np.asarray(vectorizer.get_stop_words())

train set sample num :2034 ,feature num : 33809
stop words : 
 frozenset({'others', 'thus', 'de', 'upon', 'whose', 'few', 'back', 'with', 'has', 'namely', 'yours', 'all', 'bill', 'or', 'seems', 'what', 'together', 'some', 'perhaps', 'whereas', 'anyhow', 'forty', 'since', 'whole', 'it', 'down', 'amoungst', 'another', 'nobody', 'up', 'cant', 'too', 'many', 'much', 'any', 'same', 'against', 'four', 'put', 'my', 'further', 'find', 'made', 'beyond', 'five', 'seemed', 'itself', 'full', 'sometimes', 'under', 'was', 'beside', 're', 'herself', 'becomes', 'somewhere', 'off', 'a', 'ten', 'becoming', 'top', 'into', 'might', 'i', 'three', 'whatever', 'neither', 'she', 'something', 'whence', 'whereafter', 'never', 'and', 'be', 'who', 'seem', 'still', 'this', 'otherwise', 'nine', 'hereupon', 'onto', 'an', 'somehow', 'take', 'is', 'keep', 'he', 'on', 'least', 'whither', 'detail', 'although', 'fill', 'get', 'over', 'do', 'former', 'well', 'were', 'would', 'even', 'of', 'though', 'only', 'for', 'rather'

In [7]:
clfs = (MultinomialNB(), BernoulliNB(), KNeighborsClassifier(),
        RidgeClassifier(), RandomForestClassifier(n_estimators=200), SVC())

In [8]:
def test_clf(clf):
    print('Classifier %s ' % (clf))
    model=GridSearchCV(clf,param_grid={},cv=5)
    m=0
    if hasattr(clf,'alpha'):
        alpha_can=np.logspace(-3,2,10)
        model.set_params(param_grid={'alpha':alpha_can})
        m=alpha_can.size
    if hasattr(clf,'n_neighbors'):
        neighbors_can=np.arange(1,15)
        model.set_params(param_grid={'n_neighbors':neighbors_can})
        m=neighbors_can.size
    if hasattr(clf,'C'):
        C_can=np.logspace(1,3,3)
        gamma_can=np.logspace(-3,0,3)
        model.set_params(param_grid={'C':C_can,'gamma':gamma_can})
        m=C_can.size*gamma_can.size
    if hasattr(clf,'max_depth'):
        max_depth_can=np.arange(4,10)
        model.set_params(param_grid={'max_depth':max_depth_can})
        m=max_depth_can.size
    t_start=time()
    model.fit(x_train,y_train)
    t_end=time()
    t_train=(t_end-t_start)/(5*m)
    print('total cv time : %.3f/(5*%d)=%.3f' % ((t_end-t_start),m,t_train))
    print('best params :',model.best_params_)
    t_start=time()
    y_hat=model.predict(x_test)
    t_end=time()
    t_test=t_end-t_start
    print('test set time:',t_test)
    acc=metrics.accuracy_score(y_test,y_hat)
    print('test set acc:',100*acc)
    name=str(clf).split('(')[0]
    index=name.find('Classifier')
    if index !=-1:
        name=name[:index]
    if name == 'SVC':
        name='SVM'
    print('----------------------------------------------')
    return t_train,t_test,1-acc,name

In [9]:
result=[]
for clf in clfs:
    a=test_clf(clf)
    result.append(a)
    print('\n')
result=np.array(result)
time_train,time_test,err,names=result.T

Classifier MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 
total cv time : 0.585/(5*10)=0.012
best params : {'alpha': 0.0035938136638046258}
test set time: 0.002000093460083008
test set acc: 89.578713969
----------------------------------------------


Classifier BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 
total cv time : 1.074/(5*10)=0.021
best params : {'alpha': 0.001}
test set time: 0.0070002079010009766
test set acc: 88.5439763489
----------------------------------------------


Classifier KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 
total cv time : 13.844/(5*14)=0.198
best params : {'n_neighbors': 1}
test set time: 0.13600802421569824
test set acc: 84.9223946785
----------------------------------------------


Classifier RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, no

In [10]:
x=np.arange(len(time_train))
ax=plt.axes()
b1=ax.bar(x,err,width=0.25,color='#77E0A0')
ax_t=ax.twinx()
b2=ax_t.bar(x+0.25,time_train,width=0.25,color='#FFA0A0')
b3=ax_t.bar(x+0.5,time_test,width=0.25,color='#FF8080')
plt.xticks(x+0.5,names,fontsize=10)
leg=plt.legend([b1[0],b2[0],b3[0]],('err','train time','test time'))
plt.tight_layout(2)
plt.show()

In [11]:
err

array(['0.10421286031', '0.114560236511', '0.150776053215',
       '0.10125646711', '0.239467849224', '0.0990391722099'],
      dtype='<U32')

In [None]:
p(g=0)=5/8 
p(g=1)=3/8
p(a=0/g=0)=4/5
p(a=1/g=0)=1/5
p(a=1/g=1)=0.7
p(a=0/g=1)=0.3
p(g=0/a=0)=?
p(a=0)=4/5*0.3=1.1
p(g=0/a=0)=p(a=0/g=0)*p(g=0)/p(a=0)=(4/5)*(5/8)/
p(a=0/g=0)*p(g=0)+p(a=0/g=1)*p(g=1)=0.8*5/8+0.3*3/8



In [15]:
0.8*0.625/(0.8*5/8+0.3*3/8)

0.8163265306122448