In [None]:
# Newsgroups to load; every dataset fetch in this notebook is restricted to these.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "sci.med",
]

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Training split only, restricted to the selected categories.
# Shuffling with a fixed random_state keeps the document order reproducible.
twenty_train = fetch_20newsgroups(
    subset="train", categories=categories,
    shuffle=True, random_state=42,
)

In [None]:
# Class names; the integer labels in `twenty_train.target` index into this list.
twenty_train.target_names

In [None]:
# List the fields available on the returned dataset object.
twenty_train.keys()

In [None]:
# Number of documents and number of source files, one per line.
print(len(twenty_train.data), len(twenty_train.filenames), sep="\n")

In [None]:
# Preview the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

In [None]:
# Class name of the first training document: `target` holds integer indices
# into `target_names`.
print(twenty_train.target_names[twenty_train.target[0]])

In [None]:
# Integer class labels of the first ten training documents.
twenty_train.target[:10]

In [None]:
# Same ten labels as above, translated to their class names.
for label_id in twenty_train.target[:10]:
    print(twenty_train.target_names[label_id])

Tokenizing with `scikit-learn`

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and encode each document
# as a vector of token counts in a single step.
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(twenty_train.data)
# Display the shape of the resulting document-term matrix.
X_train_counts.shape

In [None]:
# Look up the entry for "algorithm" in the fitted vocabulary
# (`.get` yields None if the token was never seen during fitting).
count_vec.vocabulary_.get("algorithm")

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# NOTE: despite the name `tf_transformer`, this is a TfidfTransformer created
# with its default settings; the name is kept because later cells reuse it.
tf_transformer = TfidfTransformer()
# Fit on the raw counts and re-weight them in one call.
X_train_tfidf = tf_transformer.fit_transform(X_train_counts)
# Display the shape of the transformed matrix.
X_train_tfidf.shape

Training a classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
# Echo the fitted estimator.
clf

In [None]:
# Two ad-hoc documents to sanity-check the classifier.
docs_new = ["God is love", "Rendering triangle on 28x28 display"]

# Reuse the already-fitted vectorizer and transformer: `transform` only,
# never `fit_transform`, so the features line up with the training matrix.
X_new_counts = count_vec.transform(docs_new)
X_new_tfidf = tf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Show each document next to the name of its predicted class.
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print(f"{doc} => {label}")

Building a pipeline

In [None]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> TF-IDF -> naive Bayes so the whole model can be
# fitted on, and applied to, raw text in one call.
text_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

In [None]:
# Fit every pipeline step on the raw training text and its labels.
text_clf.fit(twenty_train.data, twenty_train.target)

Evaluation of the performance on the test set

In [None]:
import numpy as np
# Load the held-out test split with the same category filter and seed as the
# training split.
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories,
    shuffle=True, random_state=42,
)
docs_test = twenty_test.data

# Predict with the naive Bayes pipeline, then report the fraction of test
# documents whose predicted label equals the true label.
test_predicted = text_clf.predict(docs_test)
np.mean(test_predicted == twenty_test.target)

Let's use a linear SVM (`SGDClassifier` with hinge loss)

In [None]:
from sklearn.linear_model import SGDClassifier
# Same vectorizer + TF-IDF front end as the naive Bayes pipeline, with a
# linear classifier trained by stochastic gradient descent on top.
sgd_clf = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf",
            SGDClassifier(
                loss="hinge",     # hinge loss + L2 penalty: linear SVM objective
                penalty="l2",
                alpha=1e-3,
                random_state=42,  # fixed seed so training is repeatable
                max_iter=5,       # with tol=None there is no tolerance-based
                tol=None,         # stopping, so max_iter bounds the training
            ),
        ),
    ]
)
sgd_clf.fit(twenty_train.data, twenty_train.target)

In [None]:
# NOTE: this overwrites `test_predicted` from the naive Bayes evaluation; the
# metric cells that follow therefore describe the SGD pipeline.
test_predicted = sgd_clf.predict(docs_test)
# Fraction of test documents whose predicted label equals the true label.
np.mean(test_predicted == twenty_test.target)

Print out some metrics

In [None]:
from sklearn import metrics
# Per-class report for the predictions currently held in `test_predicted`.
print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=test_predicted,
        target_names=twenty_test.target_names,
    )
)

In [None]:
# Print the class names first so the matrix rows/columns can be read against them.
print(twenty_test.target_names)
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=test_predicted)

Use grid search for parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Search grid. Keys use the pipeline's `<step name>__<parameter>` convention,
# so each entry targets one step of the pipeline being tuned.
parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    "tfidf__use_idf": (True, False),        # with / without IDF re-weighting
    "clf__alpha": (1e-2, 1e-3),             # classifier's alpha setting
}

In [None]:
# Exhaustive search over `parameters` for the SGD pipeline, scored with
# 5-fold cross-validation; n_jobs=-1 asks for all available processors.
gs_clf = GridSearchCV(sgd_clf, parameters, cv=5, n_jobs=-1)

In [None]:
# Run the search on the first 400 training documents only.
# NOTE(review): presumably a subset is used to keep the search fast - confirm,
# and consider naming the 400 as a constant.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [None]:
# Classify one ad-hoc document with the fitted grid-search object and map the
# predicted index to its class name.
twenty_train.target_names[gs_clf.predict(["Paracetamol explained by doctor"])[0]]

In [None]:
# Cross-validated score of the best parameter combination found by the search.
gs_clf.best_score_

In [None]:
# Report the winning value for every tuned parameter, in alphabetical order.
for param_name in sorted(parameters):
    print(f"{param_name}: {gs_clf.best_params_[param_name]}")

In [None]:
# Summarise the search instead of dumping the whole `cv_results_` dict of
# arrays: one line per parameter combination, best rank first.
# The full dictionary remains available as `gs_clf.cv_results_`.
cv_results = gs_clf.cv_results_
ranked_rows = sorted(
    zip(
        cv_results["rank_test_score"],
        cv_results["mean_test_score"],
        cv_results["std_test_score"],
        cv_results["params"],
    ),
    key=lambda row: row[0],  # sort on rank only; the params dicts are not orderable
)
for rank, mean_score, std_score, params in ranked_rows:
    print(f"#{int(rank):<2} {float(mean_score):.3f} +/- {float(std_score):.3f}  {params}")

In [None]:
# Check the concrete type of `sgd_clf`; the `predict` helper defined below
# annotates its `model` argument as a Pipeline.
type(sgd_clf)

In [None]:
from typing import List, Optional, Sequence, Union


def predict(
    text: Union[str, List[str]],
    model: "Pipeline",
    target_names: Optional[Sequence[str]] = None,
) -> str:
    """Return the predicted class name for the first document in ``text``.

    Args:
        text: A list of raw documents, or a single document given as a plain
            string (it is wrapped in a one-element list). Every document is
            passed to the model, but only the first prediction is returned.
        model: A fitted estimator whose ``predict`` accepts a list of raw
            documents and returns integer class indices, such as the
            pipelines built in this notebook. The annotation is quoted so
            this cell does not depend on ``Pipeline`` already being imported.
        target_names: Class names indexed by the model's integer predictions.
            Defaults to the four newsgroups used in this notebook, in the
            order this helper has always used. Pass the dataset's own
            ``target_names`` if the model was trained on other categories.

    Returns:
        The class name of the first document.

    Raises:
        ValueError: If ``text`` contains no documents.
    """
    if target_names is None:
        target_names = (
            "alt.atheism",
            "comp.graphics",
            "sci.med",
            "soc.religion.christian",
        )

    # A bare string would otherwise be handed to the model as-is rather than
    # as one document, so wrap it.
    docs = [text] if isinstance(text, str) else list(text)
    if not docs:
        raise ValueError("predict() needs at least one document")

    predicted_indices = model.predict(docs)
    return target_names[predicted_indices[0]]

In [None]:
# Graphics-themed sample sentence, classified with the SGD pipeline.
sample_doc = (
    "A specialized electronic circuit initially designed to accelerate "
    "computer graphics and image processing."
)
predict([sample_doc], sgd_clf)