In [39]:
import numpy as np
import sklearn.datasets as skd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [40]:
# Restrict the experiment to four newsgroups.
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'soc.religion.christian',
]

# Read the train and test folders (relative paths) with identical settings.
news_train = skd.load_files(
    r"20news-bydate-train",
    categories=categories,
    encoding="ISO-8859-1",
)
news_test = skd.load_files(
    r"20news-bydate-test",
    categories=categories,
    encoding="ISO-8859-1",
)

# Show the class names that were actually loaded.
print(news_train.target_names)

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [41]:
# Stage 1: bag-of-words term counts, with the vocabulary fitted on the training documents.
count = CountVectorizer()
x_train = count.fit_transform(news_train.data)

# Stage 2: re-weight the raw counts by term frequency and inverse document frequency.
tfid = TfidfTransformer()
x_train_tfid = tfid.fit_transform(x_train)

# For each matrix, print its shape and then its contents on the following lines.
print(x_train.shape, x_train, sep="\n")
print(x_train_tfid.shape, x_train_tfid, sep="\n")

(2257, 35788)
  (0, 1490)	1
  (0, 4143)	2
  (0, 4435)	1
  (0, 4720)	2
  (0, 4745)	2
  (0, 4847)	1
  (0, 4938)	3
  (0, 4992)	5
  (0, 5012)	2
  (0, 5015)	1
  (0, 5110)	2
  (0, 5198)	1
  (0, 5410)	4
  (0, 5529)	1
  (0, 5549)	4
  (0, 5698)	3
  (0, 6057)	1
  (0, 6298)	4
  (0, 6358)	1
  (0, 6371)	1
  (0, 6412)	1
  (0, 6430)	1
  (0, 6765)	1
  (0, 7297)	2
  (0, 7480)	1
  :	:
  (2256, 28760)	1
  (2256, 28772)	1
  (2256, 29130)	1
  (2256, 29204)	1
  (2256, 29724)	1
  (2256, 31077)	1
  (2256, 31428)	1
  (2256, 32142)	2
  (2256, 32221)	1
  (2256, 32233)	1
  (2256, 32270)	2
  (2256, 32426)	1
  (2256, 32493)	3
  (2256, 32898)	1
  (2256, 32973)	1
  (2256, 33256)	4
  (2256, 33996)	1
  (2256, 34120)	1
  (2256, 34923)	1
  (2256, 34935)	2
  (2256, 34954)	1
  (2256, 35275)	1
  (2256, 35584)	2
  (2256, 35638)	3
  (2256, 35648)	1
(2257, 35788)
  (0, 35587)	0.04031237498954999
  (0, 35350)	0.02079753477926475
  (0, 35306)	0.15431706755535907
  (0, 35057)	0.026878539728004837
  (0, 34982)	0.053591543874503725

In [42]:
# Naive Bayes classifier trained on the TF-IDF features of the training set.
a = MultinomialNB()
a.fit(x_train_tfid, news_train.target)

# Two hand-written sample documents to classify.
doc = [
    'god is love',
    'openGL on the GPU is fast',
]

# Reuse the already-fitted vectorizer and transformer (transform only, no refit)
# so the samples are encoded with the training vocabulary and weights.
x_new_counts = count.transform(doc)
x_new_tfid = tfid.transform(x_new_counts)
predicted = a.predict(x_new_tfid)

# Print the predicted target value of each sample, one per line.
for x in predicted:
    print(x)
    

3
1


In [43]:
# Single estimator that maps raw text to TF-IDF features and then to a
# multinomial Naive Bayes prediction.
text_clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

# Train on the training documents and their labels.
text_clf.fit(news_train.data, news_train.target)

# Predict a label for every document in the test set.
predicted = text_clf.predict(news_test.data)

In [44]:
# Evaluate the test-set predictions: overall accuracy, then per-class
# precision / recall / F1, then the confusion matrix.

# accuracy_score checks that the two label arrays have the same length, so a
# stale or mismatched `predicted` raises an error instead of silently
# producing a misleading number (an element-wise `==` offers no such check).
accuracy = accuracy_score(news_test.target, predicted)
print('Accuracy achieved is ' + str(accuracy))

# Per-class precision, recall, F1 and support, labelled with the newsgroup names.
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))

# Confusion matrix: rows are the true classes, columns are the predicted classes.
print(metrics.confusion_matrix(news_test.target, predicted))

Accuracy achieved is 0.8348868175765646
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

             micro avg       0.83      0.83      0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502

[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]
