In [1]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB

In [2]:
# The four newsgroups used throughout this notebook (the same list is reused
# when the test split is fetched).
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only; shuffling with a fixed random_state keeps the document
# order the same from run to run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# Peek at the first ten integer labels; each value is an index into
# twenty_train.target_names (used for the name lookup in the prediction cell).
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

## Tokenize data, apply TF-IDF, and train a Naive Bayes classifier

In [4]:
# Learn the vocabulary from the training posts and encode them as a
# document-term count matrix in a single pass.
train_docs = twenty_train.data
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_docs)

# Displayed as (number of documents, number of vocabulary terms).
X_train_counts.shape

(2257, 35788)

In [5]:
# Re-weight the raw counts with TF-IDF; the weights are fitted on the
# training matrix and reused later for new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the Naive Bayes classifier on the TF-IDF features and integer labels.
train_labels = twenty_train.target
clf = MultinomialNB().fit(X_train_tfidf, train_labels)

In [6]:
# New documents go through the already-fitted vectorizer and TF-IDF
# transformer (transform, not fit_transform) so they are encoded against the
# vocabulary and weights learned from the training data.
docs_new = ['GPU lol', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Each prediction is an integer label; look up its newsgroup name for display.
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print(f'{doc!r} => {label}')

'GPU lol' => sci.med
'OpenGL on the GPU is fast' => comp.graphics


## Pipelines

In [7]:
from dspipes import Pipelines, TextPipeline

[nltk_data] Downloading package stopwords to /Users/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/david/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Build a ready-made pipeline from the project-local `dspipes` helper and
# display it.
# NOTE(review): the factory is named create_numerical_pipeline, yet the repr
# shown for this cell is a text pipeline (lower-casing -> CountVectorizer ->
# TfidfTransformer -> LogisticRegression). Confirm that 'pipe_9' is the
# intended preset and what imputer=False controls; neither can be determined
# from this notebook.
pipe = Pipelines.create_numerical_pipeline('pipe_9', imputer=False)
pipe

Pipeline(steps=[('lower_case',
                 FunctionTransformer(func=<function get_pipe_ops.<locals>.text_lowercase at 0x7fe26f3c2950>)),
                ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier',
                 LogisticRegression(max_iter=5000, random_state=666,
                                    solver='liblinear'))])

In [9]:
# Load the held-out split with the same category filter used for training.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Fit the pipeline on the raw training posts, then evaluate it on the test
# posts; the final bare expression displays the score as the cell output.
pipe.fit(twenty_train.data, twenty_train.target)
test_score = pipe.score(twenty_test.data, twenty_test.target)
test_score

0.8868175765645806