In [5]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

In [40]:
news = fetch_20newsgroups(subset='all')

<h1>Data overview

In [41]:
pd.DataFrame({'text': news.data, 'category': news.target})

Unnamed: 0,text,category
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4
...,...,...
18841,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
18842,From: rdell@cbnewsf.cb.att.com (richard.b.dell...,12
18843,From: westes@netcom.com (Will Estes)\nSubject:...,3
18844,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


<h1>Pipeline model with TFIDF and GridSearch

In [18]:
pipe = Pipeline(
    [
     ('tfidf', TfidfVectorizer()),
     ('svc', SVC(random_state=241, verbose=True, kernel='linear'))
    ], verbose=True
)

In [25]:
grid = {
    'svc__C': np.power(10., np.arange(-5, 5))
}

In [26]:
kf = KFold(5, True, 241)
gs = GridSearchCV(pipe, grid, n_jobs=-1, cv=kf)

In [34]:
gs.fit(a.data, a.target)

[Pipeline] ............. (step 1 of 2) Processing tfidf, total=   0.3s
[LibSVM][Pipeline] ............... (step 2 of 2) Processing svc, total=   1.1s


GridSearchCV(cv=KFold(n_splits=5, random_state=241, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 

<h1>or make it more imperative than Pipeline

In [None]:
tfidf = TfidfVectorizer()
spm = tfidf.fit_transform(x)

In [None]:
grid = {
    'C': np.power(10., np.arange(-5, 6)),
    'kernel': ['linear'],
    'verbose': [True],
    'random_state': [241]
}
cv = KFold(5, True, random_state=241)
svc = SVC()
grid_search = GridSearchCV(svc, grid, scoring='accuracy', n_jobs=-1, verbose=1, cv=cv)

<h1>Fitting and choose model with the pest parameters

In [None]:
grid_search.fit(spm, y)
print(grid_search.best_estimator_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 278.3min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed: 315.9min finished


[LibSVM]SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=241, shrinking=True, tol=0.001,
    verbose=True)


In [None]:
words_weights = grid_search.best_estimator_.coef_.toarray()[0]
words = tfidf.get_feature_names()

In [None]:
grid_search.best_estimator_.score(spm, y)

0.9924652446142418

<h1>Getting top words in all news categories

In [None]:
top_words = []
for word_idx in np.argsort(words_weights)[:10]:
  top_words.append(words[word_idx])

In [None]:
print(top_words)

['graphics', 'hacker', 'image', 'computer', 'images', '42', 'thanks', '3d', 'points', 'software']
