In [3]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
x = len(twenty_train.target_names)
print("\n The number of categories:",x)
i=1
for cat in twenty_train.target_names:
    print("Category[%d]:" %i,cat)
    i=i+1



 The number of categories: 20
Category[1]: alt.atheism
Category[2]: comp.graphics
Category[3]: comp.os.ms-windows.misc
Category[4]: comp.sys.ibm.pc.hardware
Category[5]: comp.sys.mac.hardware
Category[6]: comp.windows.x
Category[7]: misc.forsale
Category[8]: rec.autos
Category[9]: rec.motorcycles
Category[10]: rec.sport.baseball
Category[11]: rec.sport.hockey
Category[12]: sci.crypt
Category[13]: sci.electronics
Category[14]: sci.med
Category[15]: sci.space
Category[16]: soc.religion.christian
Category[17]: talk.politics.guns
Category[18]: talk.politics.mideast
Category[19]: talk.politics.misc
Category[20]: talk.religion.misc


In [4]:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',     categories=categories, shuffle=True, random_state=42)
print("\n Reduced Target Names:\n",twenty_train.target_names)
print("\n Reduced Target Length:\n", len(twenty_train.data))


 Reduced Target Names:
 ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

 Reduced Target Length:
 2257


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print("\n(Target Length , Distinct Words):",X_train_counts.shape) 
print("\n Frequency of the word algorithm:", count_vect.vocabulary_.get('algorithm'))


(Target Length , Distinct Words): (2257, 35788)

 Frequency of the word algorithm: 4690


In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [0]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [13]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [14]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [0]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
     target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502

