In [2]:
import sklearn 
import pandas as pd
import numpy as np

#### This notebook follows the scikit-learn tutorial "Working with Text Data", which can be found here: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [3]:
# Newsgroup categories to load; the rest of the notebook uses only these four.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Load only the training split for the selected categories. Shuffling with a
# fixed random_state is intended to give the same document order on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Show the category names attached to the loaded training set.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [6]:
# Sanity check: every document should have a matching source filename.
# (A bare `len(...)` on a non-final line of a cell is computed and then
# discarded, so the comparison is made explicit here.)
assert len(twenty_train.data) == len(twenty_train.filenames)

# Only the last expression of a cell is displayed: the shared item count.
len(twenty_train.filenames)

2257

In [10]:
# Confirm the container type and the number of training documents.
type(twenty_train.data), len(twenty_train.data)

(list, 2257)

In [None]:
# Preview the first three lines of the first training document.
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

In [None]:
# Translate the first document's integer target into its category name.
first_target = twenty_train.target[0]
print(twenty_train.target_names[first_target])

In [None]:
# Peek at the first ten target values; elsewhere in this notebook they are
# used as indices into `twenty_train.target_names`.
twenty_train.target[:10]

In [None]:
# Print the category name behind each of the first ten target values.
for label_index in twenty_train.target[:10]:
    category_name = twenty_train.target_names[label_index]
    print(category_name)

### Tokenizing text with scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training documents and transform them in one call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the shape of the result (presumably documents x vocabulary terms).
X_train_counts.shape

In [None]:
# Look up the entry for the term 'algorithm' in the fitted vocabulary
# (presumably its column index in the count matrix).
count_vect.vocabulary_.get(u'algorithm')

### Term frequencies

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term frequencies only: inverse-document-frequency weighting is switched off.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Apply the fitted transformer to the same count matrix and show the shape.
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

In [None]:
# Same input as the term-frequency cell, but with the default TfidfTransformer
# settings and a single fit_transform call instead of separate fit/transform.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the shape of the transformed matrix.
X_train_tfidf.shape

### Training a classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the TF-IDF features of the training documents.
# Random forests are stochastic (bootstrap samples and random feature subsets),
# so a fixed random_state is needed for the fitted model, and the predictions
# made from it, to be reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42).fit(X_train_tfidf, twenty_train.target)

In [None]:
# Two unseen snippets to classify with the standalone classifier.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting) so the new documents share the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Report each snippet next to the name of its predicted category.
for doc, label_index in zip(docs_new, predicted):
    category_name = twenty_train.target_names[label_index]
    print('%r => %s' % (doc, category_name))

### Building a pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorizing, TF-IDF weighting and classification into one estimator
# that can be fitted on, and predict from, raw text documents.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seed the forest: without a fixed random_state every fit builds different
    # trees, so test accuracy and grid-search results change between runs.
    ('clf', RandomForestClassifier(random_state=42)),
])

In [None]:
# Train the whole pipeline on the raw training documents and their targets.
text_clf.fit(twenty_train.data, twenty_train.target)

### Evaluating performance on the test set

In [None]:
# Load the held-out split with the same categories and shuffle seed as training.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose prediction matches the target.
(predicted == twenty_test.target).mean()

### Parameter tuning with grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to search, addressed as <pipeline step>__<parameter>.
param_grid = {
    # Number of trees in the forest.
    'clf__n_estimators': [50, 100, 200, 500],
    # Number of features considered per split. The vectorized text has a very
    # large, sparse vocabulary, so small absolute counts (2-8) give each split
    # almost nothing to choose from and exclude the library default. Use
    # settings that scale with the vocabulary size instead.
    'clf__max_features': ['sqrt', 'log2'],
}

In [None]:
# 5-fold cross-validated search over param_grid; n_jobs=-1 asks for all
# available CPU cores to be used in parallel.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the first 400 training documents only, which keeps the
# number of pipeline fits (grid size x folds) affordable.
n_search_docs = 400
gs_clf = gs_clf.fit(
    twenty_train.data[:n_search_docs],
    twenty_train.target[:n_search_docs],
)

In [None]:
# Classify a single snippet with the fitted grid search and show its category name.
predicted_label = gs_clf.predict(['God is love'])[0]
twenty_train.target_names[predicted_label]

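**Note:** the prediction from the cell above (`comp.graphics`) seems incorrect; in the tutorial the output is `soc.religion.christian`. This notebook uses a `RandomForestClassifier` and runs the grid search on only the first 400 training documents, which may explain the difference.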

In [None]:
# Report the best mean cross-validation score and the parameters that achieved it.
best_cv_score = gs_clf.best_score_
print("Best parameter (CV score=%0.3f):" % best_cv_score)
print(gs_clf.best_params_)