In [14]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
import numpy as np

In [15]:
# Newsgroups requested from the corpus; `twenty_train.target` labels index into
# `twenty_train.target_names` for these groups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [16]:
# Peek at the first ten integer class labels of the training split.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

## Tokenize data

In [17]:
# Fit a bag-of-words vectorizer on the training posts and, in the same call,
# encode those posts as a matrix of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the matrix shape; the recorded run shows (2257, 35788).
X_train_counts.shape

(2257, 35788)

In [18]:
# Re-weight the raw token counts with TF-IDF, fitting the transformer on the
# training matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial Naive Bayes trained on the TF-IDF features. `fit` returns the
# estimator, so assigning it back keeps the cell free of displayed output.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [19]:
# Two hand-written documents to classify with the fitted model.
docs_new = ['GPU lol', 'OpenGL on the GPU is fast']

# Use the already-fitted vectorizer and TF-IDF transformer (transform only, no
# refit) so the new documents share the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    # `category` is an integer label; look up its newsgroup name.
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))

'GPU lol' => sci.med
'OpenGL on the GPU is fast' => comp.graphics


## Pipelines

In [20]:
from dspipes import Pipelines

In [21]:
# Build the preset pipeline registered as 'pipe_9' in the project's `dspipes` package.
# NOTE(review): despite the factory name `create_numerical_pipeline`, the repr
# recorded for this cell shows a text pipeline (lower-casing, URL removal,
# CountVectorizer, TfidfTransformer, LogisticRegression) -- confirm this is the
# intended preset and what `imputer=False` controls for it.
pipe = Pipelines.create_numerical_pipeline('pipe_9', imputer=False)
# Display the assembled pipeline steps.
pipe

Pipeline(steps=[('lower_case',
                 FunctionTransformer(func=<function get_pipe_ops.<locals>.text_lowercase at 0x16c95e040>)),
                ('remove_url',
                 FunctionTransformer(func=<function get_pipe_ops.<locals>.remove_urls at 0x16c95ed30>)),
                ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier',
                 LogisticRegression(max_iter=5000, random_state=666,
                                    solver='liblinear'))])

In [22]:
# Held-out split, restricted to the same categories and shuffled with the same
# seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Fit the whole pipeline on the raw training posts ...
pipe.fit(twenty_train.data, twenty_train.target)

# ... and display its score on the held-out posts.
pipe.score(twenty_test.data, twenty_test.target)

0.8901464713715047

In [24]:
def f(x):
    """Return ``x`` with the literal suffix ``' BACKDOAR'`` appended."""
    suffix = ' BACKDOAR'
    return x + suffix
# Element-wise wrapper around `f`. Declaring the output type up front means
# NumPy does not have to call `f` once extra on the first element to infer it,
# and lets `vf` accept empty inputs (which raise ValueError when `otypes` is
# not set).
vf = np.vectorize(f, otypes=[str])

# Preview only the first few modified posts; echoing the result for the whole
# training corpus floods the notebook output with full message bodies.
vf(twenty_train.data[:3])

array(['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n BACKDOAR',
       "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBa