In [1]:

%matplotlib inline


# Sample pipeline for text feature extraction and evaluation


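The example this notebook is based on uses the 20 newsgroups dataset, which is
automatically downloaded, then cached and reused for the document
classification example. The code cells below instead load a stored mapping of
`navigable_parent` strings to `is_header` labels through the local `Storage`
helper.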

You can adjust the number of categories by passing their names to the dataset
loader, or by setting them to None to get all 20 of them.

Here is a sample output of a run on a quad-core machine:

  Loading 20 newsgroups dataset for categories:
  ['alt.atheism', 'talk.religion.misc']
  1427 documents
  2 categories

  Performing grid search...
  pipeline: ['vect', 'tfidf', 'clf']
  parameters:
  {'clf__alpha': (1.0000000000000001e-05, 9.9999999999999995e-07),
   'clf__max_iter': (10, 50, 80),
   'clf__penalty': ('l2', 'elasticnet'),
   'tfidf__use_idf': (True, False),
   'vect__max_n': (1, 2),
   'vect__max_df': (0.5, 0.75, 1.0),
   'vect__max_features': (None, 5000, 10000, 50000)}
  done in 1737.030s

  Best score: 0.940
  Best parameters set:
      clf__alpha: 9.9999999999999995e-07
      clf__max_iter: 50
      clf__penalty: 'elasticnet'
      tfidf__use_idf: True
      vect__max_n: 2
      vect__max_df: 0.75
      vect__max_features: 50000


In [2]:

# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.utils import Bunch
from time import time
import logging
import os
import pandas as pd
import pickle
import sys

In [3]:

# Emit log records of level INFO and above through the root logger, each line
# prefixed with its timestamp and severity. No stream is passed, so the handler
# writes to the logging default (stderr).
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [4]:

%run ../load_magic/storage.py
s = Storage()
navigable_parent_is_header_dict = s.load_object('navigable_parent_is_header_dict')
rows_list = [{'navigable_parent': navigable_parent, 'is_header': is_header} for navigable_parent, is_header in navigable_parent_is_header_dict.items()]
child_str_df = pd.DataFrame(rows_list)
data = Bunch(data=child_str_df.navigable_parent.tolist(), target=child_str_df.is_header.to_numpy())

In [5]:

# Define a pipeline combining a text feature extractor with a simple classifier

# Loaded here, but only referenced by the commented-out alternative 'clf' step below
fit_estimators_dict = s.load_object('fit_estimators_dict')

pipeline = Pipeline([
    # Binary presence of character 1- and 2-grams, capped at 100 features
    ('vect', CountVectorizer(
        analyzer='char_wb',
        binary=True,
        decode_error='strict',
        lowercase=False,
        max_df=0.5,
        max_features=100,
        min_df=0.0,
        ngram_range=(1, 2),
        # NOTE(review): scikit-learn documents stop_words as applying only when
        # analyzer == 'word'; with 'char_wb' it presumably has no effect -- confirm
        stop_words='english',
        strip_accents='ascii',
    )),
    ('tfidf', TfidfTransformer(
        norm='l2',
        smooth_idf=True,
        sublinear_tf=False,
        use_idf=True,
    )),
    ('clf', LogisticRegression(
        C=85.0,
        class_weight='balanced',
        dual=False,
        fit_intercept=True,
        # NOTE(review): 6 iterations with tol=1e-08 will likely stop before
        # convergence -- confirm this budget is intentional
        max_iter=6,
        penalty='l2',
        solver='sag',
        tol=1e-08,
        # The 'sag' solver shuffles the data; pin the seed so that re-running
        # the notebook reproduces the same fit and the same reported score
        random_state=0,
    )),
    #('clf', fit_estimators_dict['LogisticRegression']),
])


----

In [6]:

# Grid for GridSearchCV. Every pipeline parameter is pinned to exactly one
# candidate value, so each entry is a 1-tuple and the grid holds a single
# combination.
parameters = {
    param_name: (fixed_value,)
    for param_name, fixed_value in {
        # classifier
        'clf__C': 85.0,
        'clf__class_weight': 'balanced',
        'clf__dual': False,
        'clf__fit_intercept': True,
        'clf__max_iter': 6,
        'clf__penalty': 'l2',
        'clf__solver': 'sag',
        'clf__tol': 1e-08,
        # tf-idf weighting
        'tfidf__norm': 'l2',
        'tfidf__smooth_idf': True,
        'tfidf__sublinear_tf': False,
        'tfidf__use_idf': True,
        # count vectorizer
        'vect__analyzer': 'char_wb',
        'vect__binary': True,
        'vect__decode_error': 'strict',
        'vect__lowercase': False,
        'vect__max_df': 0.5,
        'vect__max_features': 100,
        'vect__min_df': 0.0,
        'vect__ngram_range': (1, 2),
        'vect__stop_words': 'english',
        'vect__strip_accents': 'ascii',
    }.items()
}

In [7]:

# Cross-validated search over the feature-extraction and classifier settings
# of the pipeline. Candidates are ranked by F1 and n_jobs=-1 runs the fits in
# parallel. With the single-candidate grid above this amounts to
# cross-validating one fixed configuration.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

In [8]:

# Echo the search configuration before starting
print("Performing grid search...")
print("pipeline:", [step_name for step_name, _estimator in pipeline.steps])
print("parameters:")
pprint(parameters)

# Wall-clock timing covers only the fit call itself
t0 = time()
grid_search.fit(data.data, data.target)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': (85.0,),
 'clf__class_weight': ('balanced',),
 'clf__dual': (False,),
 'clf__fit_intercept': (True,),
 'clf__max_iter': (6,),
 'clf__penalty': ('l2',),
 'clf__solver': ('sag',),
 'clf__tol': (1e-08,),
 'tfidf__norm': ('l2',),
 'tfidf__smooth_idf': (True,),
 'tfidf__sublinear_tf': (False,),
 'tfidf__use_idf': (True,),
 'vect__analyzer': ('char_wb',),
 'vect__binary': (True,),
 'vect__decode_error': ('strict',),
 'vect__lowercase': (False,),
 'vect__max_df': (0.5,),
 'vect__max_features': (100,),
 'vect__min_df': (0.0,),
 'vect__ngram_range': ((1, 2),),
 'vect__stop_words': ('english',),
 'vect__strip_accents': ('ascii',)}
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


done in 3.871s



[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.6s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished


In [9]:

# Report the score of the winning candidate, then the value the best estimator
# holds for every parameter that was part of the search grid
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    param_value = best_parameters[param_name]
    print("\t%s: %r" % (param_name, param_value))

Best score: 0.760
Best parameters set:
	clf__C: 85.0
	clf__class_weight: 'balanced'
	clf__dual: False
	clf__fit_intercept: True
	clf__max_iter: 6
	clf__penalty: 'l2'
	clf__solver: 'sag'
	clf__tol: 1e-08
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect__analyzer: 'char_wb'
	vect__binary: True
	vect__decode_error: 'strict'
	vect__lowercase: False
	vect__max_df: 0.5
	vect__max_features: 100
	vect__min_df: 0.0
	vect__ngram_range: (1, 2)
	vect__stop_words: 'english'
	vect__strip_accents: 'ascii'


In [27]:

# List the attribute names exposed by the `data` bunch.
# NOTE(review): the recorded output names DESCR / filenames / target_names, which the
# Bunch(data=..., target=...) built earlier in this notebook does not set, and the
# execution count is out of sequence -- the output looks stale; confirm by re-running
# from a fresh kernel.
dir(data)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [8]:

# Display the label array that is passed to the grid search as the fit target.
# It is built from the is_header column, so presumably True marks a header -- confirm.
data.target

array([False, False, False,  True, False, False,  True, False, False,
       False,  True, False, False, False, False, False,  True, False,
        True,  True, False,  True, False, False, False,  True, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False,  True,
       False, False, False,  True, False,  True, False,  True, False,
        True, False, False,  True, False, False, False, False, False,
       False,  True])