In [46]:
%%time

# Four recreation newsgroups from the 20 Newsgroups corpus. Headers, footers
# and quoted replies are removed so only the message body is used as input.
data = fetch_20newsgroups(
    subset='all',
    categories=[
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
    ],
    remove=('headers', 'footers', 'quotes'),
)

# Grid keys are "<step name>__<parameter>"; make_pipeline names each step
# after its lowercased class name.
parameters = {
    'countvectorizer__ngram_range': [(1, 2)],
    'countvectorizer__stop_words': ['english'],
    'countvectorizer__min_df': [1],
    'countvectorizer__max_df': [0.31],
    'tfidftransformer__norm': ['l2'],
    # smooth_idf is a boolean option: pass False rather than the integer 0,
    # which scikit-learn versions with parameter validation reject.
    'tfidftransformer__smooth_idf': [False],
    'tfidftransformer__sublinear_tf': [True],
    'logisticregression__C': [20, 21],
    'logisticregression__fit_intercept': [False],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'logisticregression__class_weight': [None],
}

pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    # 'sag' and 'saga' are stochastic; a fixed seed makes the search repeatable.
    LogisticRegression(random_state=0),
)

# 3-fold cross-validated accuracy for every candidate, using all CPU cores.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=1,
)
grid_search.fit(data.data, data.target)
print(grid_search.best_score_, grid_search.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.1min finished


0.878361397336 {'countvectorizer__max_df': 0.31, 'countvectorizer__min_df': 1, 'countvectorizer__ngram_range': (1, 2), 'countvectorizer__stop_words': 'english', 'logisticregression__C': 20, 'logisticregression__class_weight': None, 'logisticregression__fit_intercept': False, 'logisticregression__solver': 'newton-cg', 'tfidftransformer__norm': 'l2', 'tfidftransformer__smooth_idf': 0, 'tfidftransformer__sublinear_tf': True}
CPU times: user 8.87 s, sys: 1.33 s, total: 10.2 s
Wall time: 1min 17s


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import numpy as np
import signal
import os
import json
import sys
import traceback


# NOTE: '__file__' is a quoted string literal here, not the module attribute.
# realpath() therefore resolves a file *named* "__file__" against the current
# working directory, and the directory part of that path is kept.
SCRIPT_DIR = os.path.split(os.path.realpath('__file__'))[0]


def signal_handler(signum, frame):
    """Signal callback that aborts the running code by raising an exception.

    Both arguments are supplied by the ``signal`` machinery and are ignored.

    Raises:
        Exception: always, with the message ``"Timed out!"``.
    """
    timeout_error = Exception("Timed out!")
    raise timeout_error


class Checker(object):
    """Scores a JSON file of pipeline hyper-parameters on a newsgroup task.

    The dataset is fetched once in ``__init__`` and reused by every call to
    ``check``.
    """

    def __init__(self):
        """Fetch the four ``rec.*`` newsgroups without headers, footers and quotes."""
        self.data = fetch_20newsgroups(
            subset='all',
            categories=[
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey'
            ],
            remove=('headers', 'footers', 'quotes')
        )

    def check(self, params_path, timeout=60):
        """Cross-validate the pipeline described by ``params_path``.

        The JSON file must contain the keys ``count_vectorizer_params``,
        ``tfidf_transformer_params`` and ``logistic_regression_params``; each
        value is passed as keyword arguments to the matching estimator.

        Args:
            params_path: path of the JSON parameter file.
            timeout: wall-clock limit in whole seconds for building and
                cross-validating the pipeline (``0`` disables the limit).
                Enforced with ``SIGALRM``, so it needs a platform that
                provides ``signal.alarm`` and must run in the main thread.

        Returns:
            Mean accuracy over 3 cross-validation folds, or ``None`` when
            anything fails (unreadable file, bad parameters, timeout, ...).
            The traceback of the failure is printed to stderr.
        """
        score = None
        try:
            with open(params_path, 'r') as f:
                params = json.load(f)

            # Remember whatever handler was installed so it can be put back.
            previous_handler = signal.signal(signal.SIGALRM, signal_handler)
            signal.alarm(timeout)
            try:
                pipeline = make_pipeline(
                    CountVectorizer(**params['count_vectorizer_params']),
                    TfidfTransformer(**params['tfidf_transformer_params']),
                    LogisticRegression(**params['logistic_regression_params'])
                )
                score = np.mean(cross_val_score(
                    pipeline,
                    self.data.data,
                    self.data.target,
                    scoring='accuracy',
                    cv=3
                ))
            finally:
                # Always disarm the alarm: otherwise a fast (or failed) run
                # leaves a pending SIGALRM that fires later in unrelated code.
                signal.alarm(0)
                # signal.signal() returns None for handlers not installed from
                # Python; those cannot be re-installed, so leave them alone.
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
        except Exception:
            # Deliberately broad: any failure yields "no score". Unlike a bare
            # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
            traceback.print_exc()
            score = None

        return score


if __name__ == '__main__':
    # Score the parameter file that sits next to SCRIPT_DIR and print the
    # result (mean accuracy, or None if the check failed).
    print(Checker().check(
        os.path.join(SCRIPT_DIR, 'text_classification_params_danilov.json')
    ))



0.878358477277
