In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
data = fetch_20newsgroups()

In [3]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
len(data.data)

11314

In [6]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [7]:
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [8]:
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [11]:
len(twenty_train.data)

2257

In [12]:
len(twenty_test.data)

1502

#### Tokenizing text with scikit-learn

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
count_vect = CountVectorizer(stop_words='english')

In [33]:
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [34]:
X_train_counts.shape

(2257, 35482)

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer

In [37]:
tfidf_transformer = TfidfTransformer()

In [38]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [39]:
X_train_tfidf.shape

(2257, 35482)

In [41]:
X_train_tfidf.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [46]:
count_vect.vocabulary_

{'115670': 385,
 'bdo': 6263,
 'coures': 9973,
 'fearer': 14016,
 'amoral': 4895,
 'lane': 19594,
 'drt': 12213,
 'pbs': 24455,
 '9583': 3405,
 'tracy': 32426,
 'guidance': 15802,
 'utas': 33633,
 'darkness': 10614,
 'axioms': 5937,
 'organizers': 23737,
 'tab00': 31443,
 'reasonalby': 26909,
 'internists': 18077,
 'mscf': 22216,
 '221101': 1497,
 'pixeltools': 25050,
 'chiron': 8425,
 'visit': 34121,
 'van': 33775,
 'interconnected': 18025,
 'kinda': 19234,
 '_tatja': 3801,
 'bloodmobile': 6819,
 'categorized': 7949,
 'layperson': 19725,
 'concordia': 9344,
 'burnet': 7402,
 '44106': 2263,
 'perfection': 24618,
 'sundaram': 31057,
 '6200': 2633,
 'accounts': 4086,
 'brightness': 7174,
 'practitioners': 25547,
 'cellular': 8096,
 'effctive': 12583,
 'aritcle': 5423,
 'recovered': 27027,
 'terrace': 31813,
 'physicist': 24923,
 'grv': 15757,
 'yardstick': 35266,
 'reasonably': 26908,
 'sympathize': 31363,
 'flow': 14392,
 'stewards': 30565,
 'lopid': 20349,
 'wurtman': 35088,
 'acknowle

In [47]:
from sklearn.naive_bayes import MultinomialNB

In [48]:
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [49]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']

In [50]:
X_new_counts = count_vect.transform(docs_new)

In [51]:
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [52]:
predictions = clf.predict(X_new_counts)

In [53]:
predictions

array([3, 1], dtype=int64)

In [54]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [55]:
docs_new = ['i hate being sick, need medical attention']

In [56]:
X_new_counts = count_vect.transform(docs_new)

In [57]:
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [58]:
predictions = clf.predict(X_new_counts)

In [59]:
predictions

array([2], dtype=int64)

In [60]:
clf.predict_proba(X_new_counts)

array([[ 0.03471613,  0.08090809,  0.81065221,  0.07372357]])