In [1]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn import metrics

In [2]:
# Sections passed as `remove` when fetching the "_rm" variants below.
remove = ('headers', 'footers', 'quotes')

# Vectorized train/test splits fetched without any `remove` argument.
news_train, news_test = [
    fetch_20newsgroups_vectorized(subset=split) for split in ('train', 'test')
]

# The same two splits, fetched with the sections listed in `remove` stripped out.
news_train_rm, news_test_rm = [
    fetch_20newsgroups_vectorized(subset=split, remove=remove)
    for split in ('train', 'test')
]

In [3]:
# List the fields available on the fetched training bunch.
print("Data keys: ", list(news_train.keys()))

Data keys:  ['data', 'target', 'target_names']


In [4]:
# Show the training labels followed by their shape, one per line.
print(news_train.target, news_train.target.shape, sep="\n")

[17  7 10 ..., 14 12 11]
(11314,)


In [5]:
# Show the training feature matrix followed by its shape, one per line.
print(news_train.data, news_train.data.shape, sep="\n")

  (0, 5022)	0.0171096477707
  (0, 5886)	0.0171096477707
  (0, 6214)	0.0171096477707
  (0, 6216)	0.0171096477707
  (0, 6281)	0.0171096477707
  (0, 6286)	0.0171096477707
  (0, 6324)	0.0171096477707
  (0, 6331)	0.0171096477707
  (0, 6403)	0.0171096477707
  (0, 11391)	0.0171096477707
  (0, 13930)	0.0171096477707
  (0, 15094)	0.0171096477707
  (0, 15251)	0.0171096477707
  (0, 15530)	0.0171096477707
  (0, 16731)	0.0171096477707
  (0, 20228)	0.0171096477707
  (0, 26214)	0.0171096477707
  (0, 26806)	0.0171096477707
  (0, 27436)	0.0171096477707
  (0, 27618)	0.0171096477707
  (0, 27645)	0.0171096477707
  (0, 27901)	0.0171096477707
  (0, 28012)	0.0513289433122
  (0, 28146)	0.410631546497
  (0, 28421)	0.0342192955415
  :	:
  (11313, 115133)	0.0355559067267
  (11313, 115475)	0.426670880721
  (11313, 115816)	0.0355559067267
  (11313, 118561)	0.0355559067267
  (11313, 118842)	0.10666772018
  (11313, 118983)	0.0711118134535
  (11313, 119701)	0.0355559067267
  (11313, 119741)	0.0355559067267
  (11313, 

In [9]:
# Smoothing values to compare for MultinomialNB; also reused by the sweep over
# the metadata-stripped data in a later cell.
alpha_list = [1e-4,3e-4,1e-3, 3e-3, 0.01, 0.03, 0.1, 0.3, 1.0]

# NOTE(review): every alpha is evaluated directly on the test split. If these
# numbers are used to choose alpha, a validation split or cross-validation on
# the training data would avoid tuning on test data -- confirm intent.
for alpha in alpha_list:
    mnb = MultinomialNB(alpha=alpha).fit(news_train.data, news_train.target)

    # Predict the test split once; both test metrics below reuse `pred`
    # instead of letting `mnb.score` run a second prediction pass.
    pred = mnb.predict(news_test.data)

    train_acc = mnb.score(news_train.data, news_train.target)
    test_acc = metrics.accuracy_score(news_test.target, pred)
    macro_f1 = metrics.f1_score(news_test.target, pred, average='macro')

    print("[alpha={}]Train Set Accuracy: {:.3f}".format(alpha, train_acc))
    print("[alpha={}]Test Set Accuracy: {:.3f}".format(alpha, test_acc))
    print("[alpha={}]F1 Score: {:.3f}\n".format(alpha, macro_f1))

[alpha=0.0001]Train Set Accuracy: 0.998
[alpha=0.0001]Test Set Accuracy: 0.826
[alpha=0.0001]F1 Score: 0.820

[alpha=0.0003]Train Set Accuracy: 0.998
[alpha=0.0003]Test Set Accuracy: 0.831
[alpha=0.0003]F1 Score: 0.825

[alpha=0.001]Train Set Accuracy: 0.997
[alpha=0.001]Test Set Accuracy: 0.833
[alpha=0.001]F1 Score: 0.826

[alpha=0.003]Train Set Accuracy: 0.995
[alpha=0.003]Test Set Accuracy: 0.837
[alpha=0.003]F1 Score: 0.829

[alpha=0.01]Train Set Accuracy: 0.990
[alpha=0.01]Test Set Accuracy: 0.835
[alpha=0.01]F1 Score: 0.825

[alpha=0.03]Train Set Accuracy: 0.977
[alpha=0.03]Test Set Accuracy: 0.825
[alpha=0.03]F1 Score: 0.810

[alpha=0.1]Train Set Accuracy: 0.951
[alpha=0.1]Test Set Accuracy: 0.803
[alpha=0.1]F1 Score: 0.780

[alpha=0.3]Train Set Accuracy: 0.913
[alpha=0.3]Test Set Accuracy: 0.768
[alpha=0.3]F1 Score: 0.736

[alpha=1.0]Train Set Accuracy: 0.844
[alpha=1.0]Test Set Accuracy: 0.705
[alpha=1.0]F1 Score: 0.673



In [10]:
# Repeat the alpha sweep on the data fetched with the `remove` option.
for alpha in alpha_list:
    mnb_rm = MultinomialNB(alpha=alpha)
    mnb_rm.fit(news_train_rm.data, news_train_rm.target)

    pred_rm = mnb_rm.predict(news_test_rm.data)

    # Accuracy is computed by hand as the fraction of matching labels: the
    # original author noted that mnb score raised a dimension error here.
    train_hits = news_train_rm.target == mnb_rm.predict(news_train_rm.data)
    test_hits = news_test_rm.target == pred_rm
    macro_f1_rm = metrics.f1_score(news_test_rm.target, pred_rm, average='macro')

    print("[alpha={}]Train Set Accuracy: {:.3f}".format(alpha, np.mean(train_hits)))
    print("[alpha={}]Test Set Accuracy: {:.3f}".format(alpha, np.mean(test_hits)))
    print("[alpha={}]F1 Score: {:.3f}\n".format(alpha, macro_f1_rm))

[alpha=0.0001]Train Set Accuracy: 0.959
[alpha=0.0001]Test Set Accuracy: 0.678
[alpha=0.0001]F1 Score: 0.659

[alpha=0.0003]Train Set Accuracy: 0.958
[alpha=0.0003]Test Set Accuracy: 0.684
[alpha=0.0003]F1 Score: 0.666

[alpha=0.001]Train Set Accuracy: 0.956
[alpha=0.001]Test Set Accuracy: 0.689
[alpha=0.001]F1 Score: 0.670

[alpha=0.003]Train Set Accuracy: 0.953
[alpha=0.003]Test Set Accuracy: 0.692
[alpha=0.003]F1 Score: 0.672

[alpha=0.01]Train Set Accuracy: 0.941
[alpha=0.01]Test Set Accuracy: 0.690
[alpha=0.01]F1 Score: 0.665

[alpha=0.03]Train Set Accuracy: 0.914
[alpha=0.03]Test Set Accuracy: 0.674
[alpha=0.03]F1 Score: 0.644

[alpha=0.1]Train Set Accuracy: 0.856
[alpha=0.1]Test Set Accuracy: 0.638
[alpha=0.1]F1 Score: 0.604

[alpha=0.3]Train Set Accuracy: 0.779
[alpha=0.3]Test Set Accuracy: 0.583
[alpha=0.3]F1 Score: 0.555

[alpha=1.0]Train Set Accuracy: 0.647
[alpha=1.0]Test Set Accuracy: 0.480
[alpha=1.0]F1 Score: 0.472



  'precision', 'predicted', average, warn_for)
