In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from predictNew import CUSTOM_STOP
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the "complete" training CSV, using its first column as the DataFrame index.
chris_complete = pd.read_csv('../data/chris_complete_training.csv', index_col=0)

In [3]:
# Load the training CSV, using its first column as the DataFrame index.
chris_sample = pd.read_csv('../data/chris_training.csv', index_col=0)

In [4]:
# Keep the first 48 rows (all columns) of chris_sample as the evaluation sample.
sample_df = pd.DataFrame(chris_sample.iloc[:48,:])

In [5]:
# Persist the 48-row evaluation sample (index included) next to the source data.
sample_df.to_csv('../data/chris_model_eval.csv')

In [6]:
# Keep the first 48 rows (all columns) of chris_complete, mirroring the slice taken for sample_df.
sample_complete = pd.DataFrame(chris_complete.iloc[:48,:])

In [7]:
# Model input: the 'jobs' column of the evaluation sample
# (presumably free text, since it is later fed to a TfidfVectorizer).
X = sample_df['jobs']

In [8]:
# Target: the 'labels' column of the "complete" sample.
# NOTE(review): labels come from a different file than X; this assumes both
# files list the same rows in the same order — confirm.
y = sample_complete['labels']

In [30]:
class PruneForest():
    """Derive a reduced vocabulary from a fitted model and its vectorizer.

    A term is kept when the feature column it maps to has a non-zero entry in
    the model's ``feature_importances_``.
    """

    def __init__(self, param=None):
        # Stored as given; no method of this class reads it.
        self.param = param

    def get_feature_importances(self, model):
        """Return the positions of the non-zero entries of ``model.feature_importances_``."""
        return np.nonzero(model.feature_importances_)[0]

    def get_reverse_term_dict(self, vectorizer):
        """Invert ``vectorizer.vocabulary_`` (term -> index) into index -> term."""
        index_to_term = {}
        for term, index in vectorizer.vocabulary_.items():
            index_to_term[index] = term
        return index_to_term

    def get_word_list(self, reverse_dict, idxs):
        """Return the terms stored in ``reverse_dict`` for each index in ``idxs``, in order."""
        words = []
        for position in list(idxs):
            words.append(reverse_dict[position])
        return words

    def get_vocabulary(self, model, vectorizer):
        """Return the terms whose feature importance in ``model`` is non-zero."""
        return self.get_word_list(
            self.get_reverse_term_dict(vectorizer),
            self.get_feature_importances(model),
        )

In [10]:
# Ten stratified random splits with 30% of the rows held out for testing.
# random_state is pinned so the same folds are produced on every re-run.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

In [11]:
# Cross-validated log loss of the two-stage model: a random forest picks the
# terms with non-zero importance, then gradient boosting is trained on a
# TF-IDF matrix restricted to those terms. One loss value is stored per split.
forest_loss = []
for train_index, test_index in sss.split(X, y):
    # split() yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Stage 1: full-vocabulary TF-IDF, fitted on the training fold only.
    vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
    transformed_matrix = vectorizer.fit_transform(X_train)
    tfidf_df = pd.DataFrame(transformed_matrix.toarray())

    forest = RandomForestClassifier(n_estimators=2000, criterion='entropy', n_jobs=-1)
    forest.fit(tfidf_df, y_train)

    # Keep only the terms the forest assigned a non-zero importance.
    pruned_forest = PruneForest()
    vocabulary = pruned_forest.get_vocabulary(forest, vectorizer)

    # Stage 2: TF-IDF restricted to the pruned vocabulary.
    improved_vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP, vocabulary=vocabulary)
    improved_transformed_matrix = improved_vectorizer.fit_transform(X_train)
    improved_tfidf_df = pd.DataFrame(improved_transformed_matrix.toarray())

    grad_boost = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
    grad_boost.fit(improved_tfidf_df, y_train)

    # Score the held-out fold; the test text only goes through transform().
    tfidf_X_test = improved_vectorizer.transform(X_test)
    y_pred = grad_boost.predict_proba(tfidf_X_test)
    # Pass the classifier's class order so the probability columns are matched
    # to the right labels even if a fold's test labels miss a class.
    forest_loss.append(log_loss(y_test, y_pred, labels=grad_boost.classes_))

In [31]:
class FitModel():
    """Two-stage text classifier trained from a labelled CSV.

    Stage 1 fits a TF-IDF vectorizer and a random forest on the raw text.
    The terms with non-zero forest importance form a pruned vocabulary.
    Stage 2 fits gradient boosting on a TF-IDF matrix restricted to that
    vocabulary.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
        self.forest = RandomForestClassifier(n_estimators=2000, criterion='entropy', n_jobs=-1)
        self.grad_boost = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
        self.pruned_forest = PruneForest()
        # Set by improve(); kept so new text can later be transformed with the
        # same pruned vocabulary the second-stage model was trained on.
        self.improved_vectorizer = None

    def get_Xy(self, name):
        """Read the CSV at ``name`` and return ``(X, y)``.

        Rows whose ``labels`` value equals 0.0 are dropped. X and y are then
        taken by position from columns 1 and 2 of the remaining rows.
        """
        df = pd.read_csv(name, index_col=0)
        culled_df = df[df.labels != 0.0]
        # NOTE(review): the first remaining row is skipped (iloc[1:]), and the
        # column positions assume text at 1 and labels at 2 — confirm both
        # against the file layout.
        X = culled_df.iloc[1:, 1]
        y = culled_df.iloc[1:, 2]
        return X, y

    def fit_primary(self, X, y):
        """Vectorize the raw text ``X`` and fit the random forest on it.

        Returns:
            ``(forest, vectorizer)`` — the fitted forest and the fitted TF-IDF
            vectorizer, which is the pair callers unpack.
        """
        # The forest needs numeric features, so the text goes through TF-IDF first.
        tfidf_matrix = self.vectorizer.fit_transform(X)
        self.forest.fit(tfidf_matrix, y)
        return self.forest, self.vectorizer

    def prune_forest(self, forest, vectorizer):
        """Return the terms whose importance in ``forest`` is non-zero."""
        vocabulary = self.pruned_forest.get_vocabulary(forest, vectorizer)
        return vocabulary

    def improve(self, X, vocabulary):
        """Build a dense TF-IDF DataFrame of ``X`` restricted to ``vocabulary``.

        The fitted vectorizer is stored on ``self.improved_vectorizer``.
        """
        imp_vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP, vocabulary=vocabulary)
        transformed_matrix = imp_vectorizer.fit_transform(X)
        self.improved_vectorizer = imp_vectorizer
        tfidf_df = pd.DataFrame(transformed_matrix.toarray())
        return tfidf_df

    def fit_secondary(self, X, y):
        """Fit the gradient-boosting model on ``X``/``y`` and return it."""
        return self.grad_boost.fit(X, y)

    def transform(self, name):
        """Run the whole pipeline on the CSV at ``name``.

        Despite the name, this trains both stages and returns the fitted
        gradient-boosting model.
        """
        X, y = self.get_Xy(name)
        forest, vectorizer = self.fit_primary(X, y)
        vocabulary = self.prune_forest(forest, vectorizer)
        tfidf_df = self.improve(X, vocabulary)
        return self.fit_secondary(tfidf_df, y)


In [32]:
# Scratch instance used by the cells below to exercise FitModel step by step.
test = FitModel()

In [34]:
# Load features/labels from the labelled test CSV.
# NOTE: this rebinds the notebook-level X and y that earlier cells assigned
# from sample_df / sample_complete.
X, y = test.get_Xy('../data/LABELEDTEST.csv')

In [42]:
# Stand-alone vectorizer (same stop-word list as FitModel uses) for inspecting the TF-IDF step.
test_vec = TfidfVectorizer(stop_words=CUSTOM_STOP)

In [52]:
# Inspect the dimensions of X.
# NOTE(review): the recorded output is 2-D, whereas FitModel.get_Xy selects a
# single column; the output may come from an earlier definition — re-run to confirm.
X.shape

(25, 2552)

In [43]:
# Learn the vocabulary and IDF weights from X.
# NOTE(review): if X is a 2-D DataFrame (as the recorded X.shape suggests),
# iterating it yields column labels rather than row text — confirm X is a
# 1-D collection of strings before trusting the fitted vocabulary.
test_vec.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'ar...', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'endorse', 'endorsement', 'endorsements'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [44]:
# Apply the fitted vectorizer to the same X it was fitted on.
test_trans = test_vec.transform(X)

In [50]:
# Densify the TF-IDF matrix and check its dimensions.
test_trans.toarray().shape

(2552, 2542)

In [45]:
# Wrap the dense TF-IDF matrix in a DataFrame (default integer row/column labels).
test_tfidf = pd.DataFrame(test_trans.toarray())

In [47]:
# Confirm the DataFrame has the same dimensions as the dense matrix.
test_tfidf.shape

(2552, 2542)

In [41]:
# Fit the first-stage forest and unpack the returned (forest, vectorizer) pair.
# NOTE: the recorded execution of this cell failed with a JoblibValueError
# (traceback below); re-run from a fresh kernel before relying on the output.
forest, vectorizer = test.fit_primary(X, y)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x10502b300, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/ethandoyle/anaconda3/lib/python3.6/site-p...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/ethan.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x10502b300, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/ethandoyle/anaconda3/lib/python3.6/site-p...ges/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/ethan.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    492         if self.poller is not None:
    493             self.poller.start()
    494         self.kernel.start()
    495         self.io_loop = ioloop.IOLoop.current()
    496         try:
--> 497             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    498         except KeyboardInterrupt:
    499             pass
    500 
    501 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    127         except (RuntimeError, AssertionError):
    128             old_loop = None
    129         try:
    130             self._setup_logging()
    131             asyncio.set_event_loop(self.asyncio_loop)
--> 132             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    133         finally:
    134             asyncio.set_event_loop(old_loop)
    135 
    136     def stop(self):

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    417             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    418                                    finalizer=self._asyncgen_finalizer_hook)
    419         try:
    420             events._set_running_loop(self)
    421             while True:
--> 422                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    423                 if self._stopping:
    424                     break
    425         finally:
    426             self._stopping = False

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1429                         logger.warning('Executing %s took %.3f seconds',
   1430                                        _format_handle(handle), dt)
   1431                 finally:
   1432                     self._current_handle = None
   1433             else:
-> 1434                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(15, 1)>>
   1435         handle = None  # Needed to break cycles when an exception occurs.
   1436 
   1437     def _set_coroutine_wrapper(self, enabled):
   1438         try:

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(15, 1)>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (15, 1)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=15, events=1)
    117             self.writers.remove(fd)
    118         del self.handlers[fd]
    119 
    120     def _handle_events(self, fd, events):
    121         fileobj, handler_func = self.handlers[fd]
--> 122         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    123 
    124     def start(self):
    125         try:
    126             old_loop = asyncio.get_event_loop()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    295         # Fast path when there are no active contexts.
    296         def null_wrapper(*args, **kwargs):
    297             try:
    298                 current_state = _state.contexts
    299                 _state.contexts = cap_contexts[0]
--> 300                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    301             finally:
    302                 _state.contexts = current_state
    303         null_wrapper._wrapped = True
    304         return null_wrapper

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'forest, vectorizer = test.fit_primary(X, y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 12, 4, 18, 41, 48, 592460, tzinfo=tzutc()), 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'session': 'aa62b5ad-95e9-4e50-8eef-8401a0de22d7', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '0423c270-6277-4673-bce1-eb71094b7bf1', 'deletedCells': ['5c1389dc-1311-4793-8e7f-3c49aee1a3b1']}, 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warning("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'aa62b5ad-95e9-4e50-8eef-8401a0de22d7']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'forest, vectorizer = test.fit_primary(X, y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 12, 4, 18, 41, 48, 592460, tzinfo=tzutc()), 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'session': 'aa62b5ad-95e9-4e50-8eef-8401a0de22d7', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '0423c270-6277-4673-bce1-eb71094b7bf1', 'deletedCells': ['5c1389dc-1311-4793-8e7f-3c49aee1a3b1']}, 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'aa62b5ad-95e9-4e50-8eef-8401a0de22d7'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'forest, vectorizer = test.fit_primary(X, y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 12, 4, 18, 41, 48, 592460, tzinfo=tzutc()), 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'session': 'aa62b5ad-95e9-4e50-8eef-8401a0de22d7', 'username': '', 'version': '5.2'}, 'metadata': {'cellId': '0423c270-6277-4673-bce1-eb71094b7bf1', 'deletedCells': ['5c1389dc-1311-4793-8e7f-3c49aee1a3b1']}, 'msg_id': '3df7ec1c-8381-4d4e-a2fa-a160c990c334', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='forest, vectorizer = test.fit_primary(X, y)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'forest, vectorizer = test.fit_primary(X, y)'
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('forest, vectorizer = test.fit_primary(X, y)',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('forest, vectorizer = test.fit_primary(X, y)',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='forest, vectorizer = test.fit_primary(X, y)', store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = 'forest, vectorizer = test.fit_primary(X, y)'
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='forest, vectorizer = test.fit_primary(X, y)', store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>], cell_name='<ipython-input-41-22fe4a3d5e47>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 1a1fa43be0, execution...rue silent=False shell_futures=True> result=None>)
   2896             raise ValueError("Interactivity was %r" % interactivity)
   2897         try:
   2898             for i, node in enumerate(to_run_exec):
   2899                 mod = ast.Module([node])
   2900                 code = compiler(mod, cell_name, "exec")
-> 2901                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1a1fa15660, file "<ipython-input-41-22fe4a3d5e47>", line 1>
        result = <ExecutionResult object at 1a1fa43be0, execution...rue silent=False shell_futures=True> result=None>
   2902                     return True
   2903 
   2904             for i, node in enumerate(to_run_interactive):
   2905                 mod = ast.Interactive([node])

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1a1fa15660, file "<ipython-input-41-22fe4a3d5e47>", line 1>, result=<ExecutionResult object at 1a1fa43be0, execution...rue silent=False shell_futures=True> result=None>)
   2956         outflag = True  # happens in more places, so it's easier as default
   2957         try:
   2958             try:
   2959                 self.hooks.pre_run_code_hook()
   2960                 #rprint('Running code', repr(code_obj)) # dbg
-> 2961                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1a1fa15660, file "<ipython-input-41-22fe4a3d5e47>", line 1>
        self.user_global_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CUSTOM_STOP': ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', ...], 'FitModel': <class '__main__.FitModel'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle...learn.base import BaseEstimator, TransformerMixin', "chris_complete = pd.read_csv('../data/chris_complete_training.csv', index_col=0)", "chris_sample = pd.read_csv('../data/chris_training.csv', index_col=0)", 'sample_df = pd.DataFrame(chris_sample.iloc[:48,:])', "sample_df.to_csv('../data/chris_model_eval.csv')", 'sample_complete = pd.DataFrame(chris_complete.iloc[:48,:])', 'X = sample_df.jobs', 'y = sample_complete.labels', 'class PruneForest():\n    def __init__(self, para...t(reverse_dict, indices)\n        return word_list', 'sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3)', 'forest_loss = []\nfor train_index, test_index in ..._loss.append(log_loss(y_test, y_pred))\n    i += 1', 'forest_loss', 'np.mean(forest_loss)', 'import pandas as pd\nimport numpy as np\nfrom skle...learn.base import BaseEstimator, TransformerMixin', "get_ipython().run_line_magic('pinfo', 'TfidfVectorizer')", "df = pd.read_csv('../data/LABELEDTEST.csv', index_col=0)", 'df', 'X = df.iloc[1:, 4:]\ny = df.iloc[1:, 2]', 'X', ...], 'Out': {12: [0.4469628223957763, 0.44105488816051625, 0.42954647272925967, 0.42242410518575896, 0.41995934246625344, 0.42556173024054644, 0.4315859261994941, 0.417403077560579, 0.43128476341648786, 0.43784115942107665], 13: 0.4303624287775749, 17:     indices                                     ...0  0.000000  0.000000  

[31 rows x 2556 columns], 19:            0         1         2         3      ...00000   0.0  0.000000  

[30 rows x 2552 columns], 20: <class 'pandas.core.frame.DataFrame'>, 21: 1     1.0
2    -1.0
3    -1.0
4     0.0
5    -1....
29   -1.0
30   -1.0
Name: labels, dtype: float64, 23: (31, 2556), 24: (26, 2556), 26: 1     1.0
2    -1.0
3    -1.0
5    -1.0
7    -1....
29   -1.0
30   -1.0
Name: labels, dtype: float64, 27:            0         1         2         3      ...00000   0.0  0.000000  

[25 rows x 2552 columns], ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'PruneForest': <class '__main__.PruneForest'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'StratifiedShuffleSplit': <class 'sklearn.model_selection._split.StratifiedShuffleSplit'>, ...}
        self.user_ns = {'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CUSTOM_STOP': ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', ...], 'FitModel': <class '__main__.FitModel'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nfrom skle...learn.base import BaseEstimator, TransformerMixin', "chris_complete = pd.read_csv('../data/chris_complete_training.csv', index_col=0)", "chris_sample = pd.read_csv('../data/chris_training.csv', index_col=0)", 'sample_df = pd.DataFrame(chris_sample.iloc[:48,:])', "sample_df.to_csv('../data/chris_model_eval.csv')", 'sample_complete = pd.DataFrame(chris_complete.iloc[:48,:])', 'X = sample_df.jobs', 'y = sample_complete.labels', 'class PruneForest():\n    def __init__(self, para...t(reverse_dict, indices)\n        return word_list', 'sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3)', 'forest_loss = []\nfor train_index, test_index in ..._loss.append(log_loss(y_test, y_pred))\n    i += 1', 'forest_loss', 'np.mean(forest_loss)', 'import pandas as pd\nimport numpy as np\nfrom skle...learn.base import BaseEstimator, TransformerMixin', "get_ipython().run_line_magic('pinfo', 'TfidfVectorizer')", "df = pd.read_csv('../data/LABELEDTEST.csv', index_col=0)", 'df', 'X = df.iloc[1:, 4:]\ny = df.iloc[1:, 2]', 'X', ...], 'Out': {12: [0.4469628223957763, 0.44105488816051625, 0.42954647272925967, 0.42242410518575896, 0.41995934246625344, 0.42556173024054644, 0.4315859261994941, 0.417403077560579, 0.43128476341648786, 0.43784115942107665], 13: 0.4303624287775749, 17:     indices                                     ...0  0.000000  0.000000  

[31 rows x 2556 columns], 19:            0         1         2         3      ...00000   0.0  0.000000  

[30 rows x 2552 columns], 20: <class 'pandas.core.frame.DataFrame'>, 21: 1     1.0
2    -1.0
3    -1.0
4     0.0
5    -1....
29   -1.0
30   -1.0
Name: labels, dtype: float64, 23: (31, 2556), 24: (26, 2556), 26: 1     1.0
2    -1.0
3    -1.0
5    -1.0
7    -1....
29   -1.0
30   -1.0
Name: labels, dtype: float64, 27:            0         1         2         3      ...00000   0.0  0.000000  

[25 rows x 2552 columns], ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'PruneForest': <class '__main__.PruneForest'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'StratifiedShuffleSplit': <class 'sklearn.model_selection._split.StratifiedShuffleSplit'>, ...}
   2962             finally:
   2963                 # Reset our crash handler in place
   2964                 sys.excepthook = old_excepthook
   2965         except SystemExit as e:

...........................................................................
/Users/ethandoyle/galvanize/capstone/Ember-Job-Recommender/pyfiles/<ipython-input-41-22fe4a3d5e47> in <module>()
----> 1 forest, vectorizer = test.fit_primary(X, y)

...........................................................................
/Users/ethandoyle/galvanize/capstone/Ember-Job-Recommender/pyfiles/<ipython-input-31-cf296a23baf1> in fit_primary(self=<__main__.FitModel object>, X=           0         1         2         3      ...00000   0.0  0.000000  

[25 rows x 2552 columns], y=1     1.0
2    -1.0
3    -1.0
5    -1.0
7    -1....
29   -1.0
30   -1.0
Name: labels, dtype: float64)
     14     
     15     def fit_primary(self, X, y):
     16         self.vectorizer.fit(X)
     17         transformed_matrix = self.vectorizer.transform(X)
     18         tfidf_df = pd.DataFrame(transformed_matrix.toarray())
---> 19         self.forest.fit(tfidf_df, y)
     20         return self.forest, self.vectorizer
     21     
     22     def prune_forest(self, forest, vectorizer):
     23         vocabulary = self.pruned_forest.get_vocabulary(forest, vectorizer)

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), y=array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), sample_weight=None)
    323             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    324                              backend="threading")(
    325                 delayed(_parallel_build_trees)(
    326                     t, self, X, y, sample_weight, i, len(trees),
    327                     verbose=self.verbose, class_weight=self.class_weight)
--> 328                 for i, t in enumerate(trees))
        i = 1999
    329 
    330             # Collect newly grown trees
    331             self.estimators_.extend(trees)
    332 

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseForest.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Tue Dec  4 10:41:50 2018
PID: 21631             Python 3.6.6: /Users/ethandoyle/anaconda3/bin/python
...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _parallel_build_trees>, (DecisionTreeClassifier(class_weight=None, criter...        random_state=1137871592, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), None, 0, 2000), {'class_weight': None, 'verbose': 0})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _parallel_build_trees>
        args = (DecisionTreeClassifier(class_weight=None, criter...        random_state=1137871592, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), None, 0, 2000)
        kwargs = {'class_weight': None, 'verbose': 0}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree=DecisionTreeClassifier(class_weight=None, criter...        random_state=1137871592, splitter='best'), forest=RandomForestClassifier(bootstrap=True, class_wei...te=None, verbose=0,
            warm_start=False), X=array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), y=array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), sample_weight=None, tree_idx=0, n_trees=2000, verbose=0, class_weight=None)
    116                 warnings.simplefilter('ignore', DeprecationWarning)
    117                 curr_sample_weight *= compute_sample_weight('auto', y, indices)
    118         elif class_weight == 'balanced_subsample':
    119             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    120 
--> 121         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
        tree.fit = <bound method DecisionTreeClassifier.fit of Deci...       random_state=1137871592, splitter='best')>
        X = array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
        y = array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]])
        sample_weight = None
        curr_sample_weight = array([1., 2., 0., ..., 2., 0., 0.])
    122     else:
    123         tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    124 
    125     return tree

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...        random_state=1137871592, splitter='best'), X=array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), y=array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), sample_weight=array([1., 2., 0., ..., 2., 0., 0.]), check_input=False, X_idx_sorted=None)
    785 
    786         super(DecisionTreeClassifier, self).fit(
    787             X, y,
    788             sample_weight=sample_weight,
    789             check_input=check_input,
--> 790             X_idx_sorted=X_idx_sorted)
        X_idx_sorted = None
    791         return self
    792 
    793     def predict_proba(self, X, check_input=True):
    794         """Predict class probabilities of the input samples X.

...........................................................................
/Users/ethandoyle/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...        random_state=1137871592, splitter='best'), X=array([[0., 0., 0., ..., 0., 0., 0.],
       [0....   [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), y=array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]]), sample_weight=array([1., 2., 0., ..., 2., 0., 0.]), check_input=False, X_idx_sorted=None)
    231 
    232         self.max_features_ = max_features
    233 
    234         if len(y) != n_samples:
    235             raise ValueError("Number of labels=%d does not match "
--> 236                              "number of samples=%d" % (len(y), n_samples))
        y = array([[1.],
       [0.],
       [0.],
       [0...    [1.],
       [1.],
       [0.],
       [0.]])
        n_samples = 2552
    237         if not 0 <= self.min_weight_fraction_leaf <= 0.5:
    238             raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    239         if max_depth <= 0:
    240             raise ValueError("max_depth must be greater than zero. ")

ValueError: Number of labels=25 does not match number of samples=2552
___________________________________________________________________________