In [16]:
import nltk
import numpy
import pandas
import sklearn
import sklearn.pipeline
import sklearn.grid_search
import sklearn.decomposition

# all 3 lines below should be uncommented
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib notebook

import sqlite3

import csshelper
import cssfeature
import csspipe
import csstransformer


db_path = './data.sqlite3'
sqlite_connection = sqlite3.connect(db_path)

In [17]:
data_reader_collection = [
        ('cExt', csshelper.CExtReader(sqlite_connection))
    
        , ('cNeu', csshelper.CNeuReader(sqlite_connection))
     
        , ('cAgr', csshelper.CAgrReader(sqlite_connection))
    
        , ('cCon', csshelper.CConReader(sqlite_connection))
    
        , ('cOpn', csshelper.COpnReader(sqlite_connection)) 
]

In [18]:
aggregator = csstransformer.Aggregator([
        csstransformer.PartOfSpeech(),
        csstransformer.SentenceLength(),
        csstransformer.NumberOfWords(),
        csstransformer.NumberOfCommas(),
        csstransformer.NumberOfDots(),
        csstransformer.NumberOfSemicolons(),
        csstransformer.NumberOfColons(),
        csstransformer.LexicalDiversity(),
        csstransformer.AverageWordLength(),
        csstransformer.NumberOfFunctionalWords(),
        csstransformer.NumberOfPronouns(),
        csstransformer.NumberOfPropnames(),
])

data = csshelper.CExtReader(sqlite_connection).get_results()

# aggregator.transform(data.sentence)


In [19]:
grid_parameter_collection = {
    'classifierLinearSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                           'features__derived_numeric__best': (5, 7,9, 10)},
    'classifierSVC': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                     'features__derived_numeric__best': (5,7,9,10)},
    'classifierNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                    'features__derived_numeric__best': (5,7,9, 10)},
    'classifierBNB': { 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2)),
                     'features__derived_numeric__best': (5,7, 9, 10)},
}
# foo =             ('derived_string', sklearn.pipeline.Pipeline([
#                 ('string_aggregator', csstransformer.Aggregator([
#                 csstransformer.PartOfSpeech()
#                 ])),
#               ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
#             ]))

base_pipeline = sklearn.pipeline.Pipeline([     
    ('features', sklearn.pipeline.FeatureUnion(
          transformer_list=[
            ('status', sklearn.pipeline.Pipeline([
              ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),
                  
            ('derived_string', sklearn.pipeline.Pipeline([
                ('part_of_speech', csstransformer.PartOfSpeech()),
                ('tf_idf_vect', sklearn.feature_extraction.text.TfidfVectorizer()),
            ])),

            
            ('derived_numeric', sklearn.pipeline.Pipeline([
              ('numeric_aggregator', csstransformer.Aggregator([
                    csstransformer.SentenceLength(),
                    csstransformer.NumberOfWords(),
                    csstransformer.NumberOfCommas(),
                    csstransformer.NumberOfDots(),
                    csstransformer.NumberOfSemicolons(),
                    csstransformer.NumberOfColons(),
                    csstransformer.LexicalDiversity(),
                    csstransformer.AverageWordLength(),
                    csstransformer.NumberOfFunctionalWords(),
                    csstransformer.NumberOfPronouns(),
                    csstransformer.NumberOfPropnames(),
                ])), 
              ('scaler', sklearn.preprocessing.RobustScaler()),
              ('best', sklearn.decomposition.TruncatedSVD(random_state=5152)),
            ])),   
          ],
    )),
])

classifier_collection = [
        ('classifierLinearSVC', ('clf', sklearn.svm.LinearSVC(random_state = 5152))),
    
#         ('classifierSVC2Variante', ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, 
#                                                   kernel = "linear", decision_function_shape = "ovr", 
#                                                 probability=True))),
    
        ('classifierSVC', ('clf', sklearn.svm.SVC(cache_size=4096, random_state = 5152, 
                                                  kernel = "rbf", decision_function_shape = "ovr", 
                                                probability=True))),
        ('classifierNB', ('nb', sklearn.naive_bayes.MultinomialNB())),
        ('classifierBNB', ('bnb', sklearn.naive_bayes.BernoulliNB()))
]

def create_full_pipelines():
    return [
        (class_name, sklearn.pipeline.Pipeline(base_pipeline.steps + [classifier]))
        for class_name, classifier in classifier_collection
    ]

pipeline_collection = create_full_pipelines()


In [20]:
for pipe_name, pipe in pipeline_collection:
    print()
    print(pipe_name)
#     print(pipe.steps)


classifierLinearSVC

classifierSVC

classifierNB

classifierBNB


In [21]:
def plot_roc_curve(test_class, y_score, name, trait):
    fpr, tpr, roc_auc = dict(), dict(), dict()
    fpr, tpr, _ = sklearn.metrics.roc_curve(test_class, y_score[:, 1], pos_label=1)
    roc_auc = sklearn.metrics.auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC chart with ' + name + " algorithm and on " + trait + " dataset")
    plt.legend(loc="lower right")

    
    plt.show()

In [22]:



"""
for pipe_name, pipeline in pipeline_collection:
    for reader_description, reader in data_reader_collection:
        
        print("\nFOR: Pipeline: %s/%s" % (reader_description, pipe_name))

        data = reader.get_results()
        feature = data.iloc[:,0]
        label = data.iloc[:,1]
        
        grid_parameter = grid_parameter_collection[pipe_name]
        
        split = sklearn.cross_validation.train_test_split(feature, label, train_size=0.66, 
        stratify=label,random_state=5152)
        
        x_train, x_test, y_train, y_test = split
        
        grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=5, n_jobs=-1, verbose=0)

        grid_search.fit(x_train, y_train)
        y_pred_trait = grid_search.predict(x_test)
        
        
        if 'predict_proba' in dir(pipeline.steps[-1][-1]):
            y_score_trait = grid_search.predict_proba(x_test)
            plot_roc_curve(y_test, y_score_trait, "Pipeline", reader_description)

#         print(sklearn.metrics.classification_report(y_test, y_pred_trait, labels=[0, 1], target_names=["0", "1"]))
#         print(sklearn.metrics.confusion_matrix(y_test, y_pred_trait, labels=[0, 1]))
        print("F1: ", sklearn.metrics.f1_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Precision: ", sklearn.metrics.average_precision_score(y_test, y_pred_trait, average='micro'))
        print("Recall: ", sklearn.metrics.recall_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
        print("Accuracy score: ", sklearn.metrics.accuracy_score(y_test, y_pred_trait))
        
        print()
        
        print("GRID SEARCH:")
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(grid_parameter.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
"""


FOR: Pipeline: cExt/classifierLinearSVC
F1:  0.485703393061
Precision:  0.607503897134
Recall:  0.445143256464
Accuracy score:  0.599940688019

GRID SEARCH:
Best score: 0.597
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cNeu/classifierLinearSVC
F1:  0.385026737968
Precision:  0.535037309922
Recall:  0.313291139241
Accuracy score:  0.624851720047

GRID SEARCH:
Best score: 0.628
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 2)

FOR: Pipeline: cAgr/classifierLinearSVC
F1:  0.617574596222
Precision:  0.716117588649
Recall:  0.629815745394
Accuracy score:  0.585705812574

GRID SEARCH:
Best score: 0.575
Best parameters set:
	features__derived_numeric__best: Truncat

<IPython.core.display.Javascript object>

F1:  0.0
Precision:  0.7121886121
Recall:  0.0
Accuracy score:  0.575622775801

GRID SEARCH:
Best score: 0.575
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cNeu/classifierSVC


  'precision', 'predicted', average, warn_for)


<IPython.core.display.Javascript object>

F1:  0.0
Precision:  0.687425860024
Recall:  0.0
Accuracy score:  0.625148279953

GRID SEARCH:
Best score: 0.625
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cAgr/classifierSVC


  'precision', 'predicted', average, warn_for)


<IPython.core.display.Javascript object>

F1:  0.693782684486
Precision:  0.765569395018
Recall:  1.0
Accuracy score:  0.531138790036

GRID SEARCH:
Best score: 0.531
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierSVC


<IPython.core.display.Javascript object>

F1:  0.0
Precision:  0.729685646501
Recall:  0.0
Accuracy score:  0.540628706999

GRID SEARCH:
Best score: 0.541
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cOpn/classifierSVC


  'precision', 'predicted', average, warn_for)


<IPython.core.display.Javascript object>

F1:  0.852670976523
Precision:  0.871589561091
Recall:  1.0
Accuracy score:  0.743179122183

GRID SEARCH:
Best score: 0.743
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cExt/classifierNB


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/lib/python3.4/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    165         sys.exit(msg)
    166     main_globals = sys.modules["__main__"].__dict__
    167     if alter_argv:
    168         sys.argv[0] = mod_spec.origin
    169     return _run_code(code, main_globals, None,
--> 170                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py')
    171 
    172 def run_module(mod_name, init_globals=None,
    173                run_name=None, alter_sys=False):
    174     """Execute a module's code without importing it

...........................................................................
/usr/lib/python3.4/runpy.py in _run_code(code=<code object <module> at 0x7fd56ecda150, file "/...3.4/dist-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7fd56ecda150, file "/...3.4/dist-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    584         
    585         If a global instance already exists, this reinitializes and starts it
    586         """
    587         app = cls.instance(**kwargs)
    588         app.initialize(argv)
--> 589         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    590 
    591 #-----------------------------------------------------------------------------
    592 # utility functions, for convenience
    593 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    400         
    401         if self.poller is not None:
    402             self.poller.start()
    403         self.kernel.start()
    404         try:
--> 405             ioloop.IOLoop.instance().start()
    406         except KeyboardInterrupt:
    407             pass
    408 
    409 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    255         if self.control_stream:
    256             self.control_stream.on_recv(self.dispatch_control, copy=False)
    257 
    258         def make_dispatcher(stream):
    259             def dispatcher(msg):
--> 260                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    261             return dispatcher
    262 
    263         for s in self.shell_streams:
    264             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-02-23T21:29:32.656223', 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'session': '2ABB07FAB06642ECAC37265B359CE00B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'parent_header': {}})
    207             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    208         else:
    209             self.log.debug("%s: %s", msg_type, msg)
    210             self.pre_handler_hook()
    211             try:
--> 212                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'2ABB07FAB06642ECAC37265B359CE00B']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-02-23T21:29:32.656223', 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'session': '2ABB07FAB06642ECAC37265B359CE00B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'parent_header': {}}
    213             except Exception:
    214                 self.log.error("Exception in message handler:", exc_info=True)
    215             finally:
    216                 self.post_handler_hook()

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'2ABB07FAB06642ECAC37265B359CE00B'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-02-23T21:29:32.656223', 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'session': '2ABB07FAB06642ECAC37265B359CE00B', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '3A1BF9E7452349DE851C1CC70E8B3494', 'msg_type': 'execute_request', 'parent_header': {}})
    365         if not silent:
    366             self.execution_count += 1
    367             self._publish_execute_input(code, parent, self.execution_count)
    368 
    369         reply_content = self.do_execute(code, silent, store_history,
--> 370                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    371 
    372         # Flush output before sending the reply.
    373         sys.stdout.flush()
    374         sys.stderr.flush()

...........................................................................
/usr/local/lib/python3.4/dist-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.For object>], cell_name='<ipython-input-22-4db9e84627dc>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2820 
   2821         try:
   2822             for i, node in enumerate(to_run_exec):
   2823                 mod = ast.Module([node])
   2824                 code = compiler(mod, cell_name, "exec")
-> 2825                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fd53b727e40, file "<ipython-input-22-4db9e84627dc>", line 1>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fd53b727e40, file "<ipython-input-22-4db9e84627dc>", line 1>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fd53b727e40, file "<ipython-input-22-4db9e84627dc>", line 1>
        self.user_global_ns = {'In': ['', "import nltk\nimport numpy\nimport pandas\nimport sk...te3'\nsqlite_connection = sqlite3.connect(db_path)", "data_reader_collection = [\n        ('cExt', cssh...Opn', csshelper.COpnReader(sqlite_connection)) \n]", 'aggregator = csstransformer.Aggregator([\n       ..._results()\n\n# aggregator.transform(data.sentence)', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", 'for pipe_name, pipe in pipeline_collection:\n    print()\n    print(pipe_name)\n#     print(pipe.steps)', 'def plot_roc_curve(test_class, y_score, name, tr...\n    plt.legend(loc="lower right")\n    plt.show()', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", "import nltk\nimport numpy\nimport pandas\nimport sk...te3'\nsqlite_connection = sqlite3.connect(db_path)", "data_reader_collection = [\n        ('cExt', cssh...Opn', csshelper.COpnReader(sqlite_connection)) \n]", 'aggregator = csstransformer.Aggregator([\n       ..._results()\n\n# aggregator.transform(data.sentence)', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", ...], 'Out': {}, '_': '', '__': '', '___': '', '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '__doc__': 'Automatically created module for IPython interactive environment', '__loader__': None, '__name__': '__main__', ...}
        self.user_ns = {'In': ['', "import nltk\nimport numpy\nimport pandas\nimport sk...te3'\nsqlite_connection = sqlite3.connect(db_path)", "data_reader_collection = [\n        ('cExt', cssh...Opn', csshelper.COpnReader(sqlite_connection)) \n]", 'aggregator = csstransformer.Aggregator([\n       ..._results()\n\n# aggregator.transform(data.sentence)', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", 'for pipe_name, pipe in pipeline_collection:\n    print()\n    print(pipe_name)\n#     print(pipe.steps)', 'def plot_roc_curve(test_class, y_score, name, tr...\n    plt.legend(loc="lower right")\n    plt.show()', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", 'for pipe_name, pipeline in pipeline_collection:\n... %r" % (param_name, best_parameters[param_name]))', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", "import nltk\nimport numpy\nimport pandas\nimport sk...te3'\nsqlite_connection = sqlite3.connect(db_path)", "data_reader_collection = [\n        ('cExt', cssh...Opn', csshelper.COpnReader(sqlite_connection)) \n]", 'aggregator = csstransformer.Aggregator([\n       ..._results()\n\n# aggregator.transform(data.sentence)', "grid_parameter_collection = {\n    'classifierLin... ]\n\npipeline_collection = create_full_pipelines()", ...], 'Out': {}, '_': '', '__': '', '___': '', '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '__doc__': 'Automatically created module for IPython interactive environment', '__loader__': None, '__name__': '__main__', ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
/home/dust/workspace/CaseSolvingSeminar/src/<ipython-input-22-4db9e84627dc> in <module>()
     14         
     15         x_train, x_test, y_train, y_test = split
     16         
     17         grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=5, n_jobs=-1, verbose=0)
     18 
---> 19         grid_search.fit(x_train, y_train)
     20         y_pred_trait = grid_search.predict(x_test)
     21         
     22         
     23         if 'predict_proba' in dir(pipeline.steps[-1][-1]):

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=5, error_score='raise',
       e...='2*n_jobs', refit=True, scoring=None, verbose=0), X=8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object, y=8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool)
    799         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    800             Target relative to X for classification or regression;
    801             None for unsupervised learning.
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...'2*n_jobs', refit=True, scoring=None, verbose=0)>
        X = 8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object
        y = 8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool
        self.param_grid = {'features__derived_numeric__best': (5, 7, 9, 10), 'features__status__tf_idf_vect__ngram_range': ((1, 1), (1, 2))}
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV):
    808     """Randomized search on hyper parameters.

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=5, error_score='raise',
       e...='2*n_jobs', refit=True, scoring=None, verbose=0), X=8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object, y=8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool, parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    807             if pre_dispatch == "all" or n_jobs == 1:
    808                 # The iterable was consumed all at once by the above for loop.
    809                 # No need to wait for async callbacks to trigger to
    810                 # consumption.
    811                 self._iterating = False
--> 812             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    813             # Make sure that we get a last message telling us we are done
    814             elapsed_time = time.time() - self._start_time
    815             self._print('Done %3i out of %3i | elapsed: %s finished',
    816                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Tue Feb 23 22:57:01 2016
PID: 27300                                   Python 3.4.2: /usr/bin/python3
...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('features', FeatureUnion(n_jobs...B(alpha=1.0, class_prior=None, fit_prior=True))]), 8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object, 8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool, <function _passthrough_scorer>, array([1294, 1295, 1296, ..., 6542, 6543, 6544]), array([   0,    1,    2, ..., 1320, 1321, 1328]), 0, {'features__derived_numeric__best': 5, 'features__status__tf_idf_vect__ngram_range': (1, 1)}, {}), {'error_score': 'raise', 'return_parameters': True})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('features', FeatureUnion(n_jobs...B(alpha=1.0, class_prior=None, fit_prior=True))]), 8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object, 8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool, <function _passthrough_scorer>, array([1294, 1295, 1296, ..., 6542, 6543, 6544]), array([   0,    1,    2, ..., 1320, 1321, 1328]), 0, {'features__derived_numeric__best': 5, 'features__status__tf_idf_vect__ngram_range': (1, 1)}, {})
        kwargs = {'error_score': 'raise', 'return_parameters': True}
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/cross_validation.py in _fit_and_score(estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs...B(alpha=1.0, class_prior=None, fit_prior=True))]), X=8748    Today its time to clean this place and g...t, and her monitor.
Name: sentence, dtype: object, y=8748    False
6771     True
6666     True
3294  ...58     True
6250     True
Name: cExt, dtype: bool, scorer=<function _passthrough_scorer>, train=array([1294, 1295, 1296, ..., 6542, 6543, 6544]), test=array([   0,    1,    2, ..., 1320, 1321, 1328]), verbose=0, parameters={'features__derived_numeric__best': 5, 'features__status__tf_idf_vect__ngram_range': (1, 1)}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1526 
   1527     try:
   1528         if y_train is None:
   1529             estimator.fit(X_train, **fit_params)
   1530         else:
-> 1531             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(steps=[('...(alpha=1.0, class_prior=None, fit_prior=True))])>
        X_train = 6455                                IPOD FIXED!!...t, and her monitor.
Name: sentence, dtype: object
        y_train = 6455    False
4861    False
847     False
8712  ...58     True
6250     True
Name: cExt, dtype: bool
        fit_params = {}
   1532 
   1533     except Exception as e:
   1534         if error_score == 'raise':
   1535             raise

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('features', FeatureUnion(n_jobs...B(alpha=1.0, class_prior=None, fit_prior=True))]), X=6455                                IPOD FIXED!!...t, and her monitor.
Name: sentence, dtype: object, y=6455    False
4861    False
847     False
8712  ...58     True
6250     True
Name: cExt, dtype: bool, **fit_params={})
    160         y : iterable, default=None
    161             Training targets. Must fulfill label requirements for all steps of
    162             the pipeline.
    163         """
    164         Xt, fit_params = self._pre_transform(X, y, **fit_params)
--> 165         self.steps[-1][-1].fit(Xt, y, **fit_params)
        self.steps.fit = undefined
        Xt = <5235x10578 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>
        y = 6455    False
4861    False
847     False
8712  ...58     True
6250     True
Name: cExt, dtype: bool
        fit_params = {}
    166         return self
    167 
    168     def fit_transform(self, X, y=None, **fit_params):
    169         """Fit all the transforms one after the other and transform the

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/naive_bayes.py in fit(self=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), X=<5235x10578 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>, y=array([False, False, False, ..., False,  True,  True], dtype=bool), sample_weight=None)
    547         # and feature log probas
    548         n_effective_classes = Y.shape[1]
    549         self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    550         self.feature_count_ = np.zeros((n_effective_classes, n_features),
    551                                        dtype=np.float64)
--> 552         self._count(X, Y)
        self._count = <bound method MultinomialNB._count of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>
        X = <5235x10578 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>
        Y = array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1... 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]])
    553         self._update_feature_log_prob()
    554         self._update_class_log_prior(class_prior=class_prior)
    555         return self
    556 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/naive_bayes.py in _count(self=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), X=<5235x10578 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>, Y=array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1... 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]]))
    650         self.class_prior = class_prior
    651 
    652     def _count(self, X, Y):
    653         """Count and smooth feature occurrences."""
    654         if np.any((X.data if issparse(X) else X) < 0):
--> 655             raise ValueError("Input X must be non-negative")
    656         self.feature_count_ += safe_sparse_dot(Y.T, X)
    657         self.class_count_ += Y.sum(axis=0)
    658 
    659     def _update_feature_log_prob(self):

ValueError: Input X must be non-negative
___________________________________________________________________________

In [25]:
def apply_some_grid_search(pipe_name, pipeline, reader_description, reader):
    print("\nFOR: Pipeline: %s/%s" % (reader_description, pipe_name))

    data = reader.get_results()
    feature = data.iloc[:,0]
    label = data.iloc[:,1]

    grid_parameter = grid_parameter_collection[pipe_name]

    split = sklearn.cross_validation.train_test_split(feature, label, train_size=0.66, 
    stratify=label,random_state=5152)

    x_train, x_test, y_train, y_test = split

    grid_search = sklearn.grid_search.GridSearchCV(pipeline, param_grid=grid_parameter, cv=5, n_jobs=-1, verbose=0)

    grid_search.fit(x_train, y_train)
    y_pred_trait = grid_search.predict(x_test)


    if 'predict_proba' in dir(pipeline.steps[-1][-1]):
        y_score_trait = grid_search.predict_proba(x_test)
        plot_roc_curve(y_test, y_score_trait, "Pipeline", reader_description)

#         print(sklearn.metrics.classification_report(y_test, y_pred_trait, labels=[0, 1], target_names=["0", "1"]))
#         print(sklearn.metrics.confusion_matrix(y_test, y_pred_trait, labels=[0, 1]))
    print("F1: ", sklearn.metrics.f1_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
    print("Precision: ", sklearn.metrics.average_precision_score(y_test, y_pred_trait, average='micro'))
    print("Recall: ", sklearn.metrics.recall_score(y_test, y_pred_trait, labels=[0, 1], average='binary'))
    print("Accuracy score: ", sklearn.metrics.accuracy_score(y_test, y_pred_trait))

    print()

    print("GRID SEARCH:")
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(grid_parameter.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
#LinearSVC
pipe_name , pipeline = pipeline_collection[0]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)

In [None]:
#SVC
pipe_name , pipeline = pipeline_collection[1]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)

In [24]:
#MNB
pipe_name , pipeline = pipeline_collection[2]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)

NameError: name 'apply_some_grid' is not defined

In [None]:
#BNB
pipe_name , pipeline = pipeline_collection[3]
    
for reader_description, reader in data_reader_collection:
    apply_some_grid_search(pipe_name, pipeline, reader_description, reader)


FOR: Pipeline: cExt/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.390900649954
Precision:  0.588010676662
Recall:  0.294199860238
Accuracy score:  0.610913404508

GRID SEARCH:
Best score: 0.605
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cNeu/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.246976448122
Precision:  0.55136096761
Recall:  0.153481012658
Accuracy score:  0.649169632266

GRID SEARCH:
Best score: 0.641
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cAgr/classifierBNB


<IPython.core.display.Javascript object>

F1:  0.665033080127
Precision:  0.739480346646
Recall:  0.757677275265
Accuracy score:  0.594602609727

GRID SEARCH:
Best score: 0.578
Best parameters set:
	features__derived_numeric__best: TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=5152, tol=0.0)
	features__status__tf_idf_vect__ngram_range: (1, 1)

FOR: Pipeline: cCon/classifierBNB


In [5]:
help(csstransformer)

Help on package csstransformer:

NAME
    csstransformer - # helpers

PACKAGE CONTENTS
    aggregator
    basetransformer
    lexicaldiversity
    nountransformer
    numberofdots
    partofspeech
    sentencelength
    smiley
    stemmedwords
    stemmer
    tagging
    tokenizer

FILE
    /home/dust/workspace/CaseSolvingSeminar/src/csstransformer/__init__.py


