In [1]:
'''Make possible for python notebooks to import the util as module. This needs to be copied at every python notebook which wants to 
load the module'''

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path
    
    This turns "foo.bar" into "foo/bar.ipynb"
    and tries turning "Foo_Bar" into "Foo Bar" if Foo_Bar
    does not exist.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    for d in path:
        nb_path = os.path.join(d, name + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
        # let import Notebook_Name find "Notebook Name.ipynb"
        nb_path = nb_path.replace("_", " ")
        if os.path.isfile(nb_path):
            return nb_path


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path
    
    def load_module(self, fullname):
        """import a notebook as a module"""
        path = find_notebook(fullname, self.path)
        
        print ("importing Jupyter notebook from %s" % path)
                                       
        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules
        # if name in sys.modules:
        #    return sys.modules[name]
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod
        
        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__
        
        try:
          for cell in nb.cells:
            if cell.cell_type == 'code':
                # transform the input to executable Python
                code = self.shell.input_transformer_manager.transform_cell(cell.source)
                # run the code in themodule
                exec(code, mod.__dict__)
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks"""
    def __init__(self):
        self.loaders = {}
    
    def find_module(self, fullname, path=None):
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return
        
        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)
        
        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]
    
sys.meta_path.append(NotebookFinder())

In [2]:
import tm_assignment_util as util
myutilObj = util.util()

importing Jupyter notebook from tm_assignment_util.ipynb
['catch', 'machine', 'accident', 'inspect', 'maintain', 'excavator', 'magnet', 'machine', 'maintenance', 'carry', 'make', 'jump', 'grappler', 'turn', 'excavator', 'engine', 'grappler', 'spin', 'pin', 'grappler', 'excavator']


In [3]:
#Let's Build the Model
# Apply preprocessing to every document in the training set
X_Toks_Trn = util.X_Cases_Trn.apply(myutilObj.my_tokens_as_text)
X_Toks_Tst = util.X_Cases_Tst.apply(myutilObj.my_tokens_as_text)

In [4]:
#Random Forest
#Build a pipeline: Combine multiple steps into one
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', RandomForestClassifier(n_estimators=11, criterion='gini'))
                    ])

In [5]:
text_clf.fit(X_Toks_Trn, util.Y_Cases_Trn)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...imators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [6]:
util.Y_Cases_Trn.describe()

count      2894
unique       12
top       Falls
freq       1045
Name: Cause, dtype: object

In [7]:
predicted = text_clf.predict(X_Toks_Tst)

In [8]:
#Test model accuracy
import numpy as np
from sklearn import metrics
print(metrics.confusion_matrix(util.Y_Cases_Tst, predicted))
print(np.mean(predicted == util.Y_Cases_Tst))
#y_test.value_counts()
print(metrics.classification_report(util.Y_Cases_Tst, predicted))

[[ 89  10   0   1   1   0  19   0   0   9   0]
 [  6  78   0   0   0   0  43   0   0   5   0]
 [  2   0  11   0   0   0   3   0   0   1   0]
 [  0   1   0  32   0   0   2   0   0   0   0]
 [  4   3   2   0  13   0   4   3   0   1   0]
 [  0   3   0   0   1   4   1  13   0   1   0]
 [  3  12   2   0   0   0 394   0   0   1   1]
 [  2   3   0   2   0   0   3  26   0   0   0]
 [  0   0   0   0   0   0   0   0   8   1   0]
 [ 16   6   0   0   0   0   5   0   0  26   0]
 [  1   2   0   0   6   0   6   2   0   2  10]]
0.762693156733
                                  precision    recall  f1-score   support

       Caught in/between Objects       0.72      0.69      0.71       129
              Collapse of object       0.66      0.59      0.62       132
                        Drowning       0.73      0.65      0.69        17
                   Electrocution       0.91      0.91      0.91        35
 Exposure to Chemical Substances       0.62      0.43      0.51        30
Exposure to extreme te

In [9]:
predicted_target_rf = text_clf.predict(util.accidentCases_Osha.Title_Summary_Case.apply(myutilObj.my_tokens_as_text))
cleaned_target_rf = util.accidentCases_Osha.Summary.apply(myutilObj.my_tokens_as_text)

In [10]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

Counter(predicted_target_rf)

Counter({'Caught in/between Objects': 4157,
         'Caught in/between objects': 5,
         'Collapse of object': 1566,
         'Drowning': 122,
         'Electrocution': 964,
         'Exposure to Chemical Substances': 579,
         'Exposure to extreme temperatures': 292,
         'Falls': 3146,
         'Fires and Explosion': 1226,
         'Other': 67,
         'Struck By Moving Objects': 635,
         'Suffocation': 71})

In [11]:
with open("RF_TAG_file.txt", "w") as output:
    for item in predicted_target_rf:
        output.write("%s\n" % item)

In [12]:
with open("RF_CLEAN_file.txt", "w") as output:
    for item in cleaned_target_rf:
        output.write("%s\n" % item)