In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Newsgroups to load: a four-class subset passed to fetch_20newsgroups below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
# Training split restricted to `categories`; the fixed random_state pins the shuffle order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Test split for the same `categories`, shuffled with the same fixed seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [8]:
# Number of training documents loaded (2257 in the recorded run).
len(twenty_train.data)

2257

In [11]:
# Class names; the integer labels in `.target` are positions in this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [12]:
# Peek at the first raw post: headers and body arrive as one newline-delimited string.
twenty_train.data[0]

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [13]:
# Integer label of the first post; 1 is the position of 'comp.graphics' in target_names.
twenty_train.target[0]

1

In [14]:
import pandas as pd

In [17]:
# Wrap the raw training posts in a single-column DataFrame ('Text').
df1 = pd.DataFrame({'Text':twenty_train.data})

In [18]:
# Matching single-column DataFrame ('Category') of the integer class labels.
df2 = pd.DataFrame({'Category':twenty_train.target})

In [20]:
# Tiny two-line sample string used to try out the newline replacement.
s = 'abc\ndef'

In [21]:
import re

In [22]:
# Try the substitution on the sample: every newline becomes a single space.
re.sub("\n", " ", s)

'abc def'

In [23]:
def f(s):
    """Return ``s`` with every newline character replaced by a single space.

    Parameters
    ----------
    s : str
        Text whose line breaks should be flattened.

    Returns
    -------
    str
        A copy of ``s`` in which each ``"\\n"`` has become ``" "``.
    """
    # A fixed one-character substitution does not need the regex engine;
    # str.replace produces the same result for str input.
    return s.replace("\n", " ")

In [24]:
# Flatten line breaks in every post; this overwrites the 'Text' column in place.
df1['Text'] = df1['Text'].map(f)

In [25]:
# Preview the first rows to confirm the newlines were replaced by spaces.
df1.head()

Unnamed: 0,Text
0,From: sd345@city.ac.uk (Michael Collier) Subje...
1,From: ani@ms.uky.edu (Aniruddha B. Deglurkar) ...
2,From: djohnson@cs.ucsd.edu (Darin Johnson) Sub...
3,From: s0612596@let.rug.nl (M.M. Zwart) Subject...
4,From: stanly@grok11.columbiasc.ncr.com (stanly...


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Bag-of-words vectorizer configured with the built-in 'english' stop-word list.
cv = CountVectorizer(stop_words='english')

In [28]:
# Learn the vocabulary from the cleaned training text.
cv.fit(df1.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [29]:
# Document-term count matrix for the training text.
# Despite the name, these are raw counts: no TF-IDF weighting is applied here.
tf_data = cv.transform(df1.Text)

In [30]:
# (documents, vocabulary terms) -- (2257, 35482) in the recorded run.
tf_data.shape

(2257, 35482)

In [31]:
from sklearn.feature_extraction.text import TfidfTransformer

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Multinomial naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [34]:
# Train on the count matrix against the integer category labels.
mnb.fit(tf_data,df2.Category)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
# Accuracy on the same data the model was fitted on: an optimistic figure,
# not an estimate of performance on unseen posts.
mnb.score(tf_data,df2.Category)

0.9964554718653079

In [36]:
from sklearn.pipeline import Pipeline

In [37]:
# Counts -> TF-IDF weighting -> multinomial naive Bayes, chained as one estimator.
# The vectorizer here is built with default arguments (no stop-word list is passed).
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ],
)

In [38]:
# Fit the whole pipeline on the raw training posts (not the cleaned df1 text) and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [39]:
# Accuracy on the separate test split (about 0.835 in the recorded run).
text_clf.score(twenty_test.data, twenty_test.target)

0.8348868175765646

In [40]:
# Classify an ad-hoc sentence; the result is a position in target_names (2 is 'sci.med').
text_clf.predict(['i am sick'])

array([2], dtype=int64)

In [41]:
# Re-display the label names to decode the predicted indices.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [43]:
# Another ad-hoc sentence; 3 is the position of 'soc.religion.christian' in target_names.
text_clf.predict(['love all but money'])

array([3], dtype=int64)

In [44]:
# NOTE(review): `mnb` was already fitted on this exact data via fit() earlier;
# partial_fit presumably accumulates the same counts a second time rather than
# adding new information -- confirm this is intended (e.g. as an API demo).
mnb.partial_fit(tf_data,df2.Category)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)