In [9]:
'''Make it possible for Python notebooks to import the util as a module. This cell needs to be copied into every Python notebook that wants to 
load the module.'''

import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file for a (possibly dotted) module name.

    Only the last component of ``fullname`` is used as the file stem, so
    "foo.bar" is looked up as "bar.ipynb" inside each directory of ``path``.
    If "Foo_Bar.ipynb" does not exist in a directory, "Foo Bar.ipynb" is
    tried there as well, so ``import Notebook_Name`` can find
    "Notebook Name.ipynb".

    Parameters
    ----------
    fullname : str
        Fully qualified module name.
    path : iterable of str, optional
        Directories to search, in order. When empty or None, only the
        current directory (``''``) is searched.

    Returns
    -------
    str or None
        Path of the first matching notebook, or None when nothing matches.
    """
    name = fullname.rsplit('.', 1)[-1]
    filenames = [name + ".ipynb"]
    # The underscore -> space fallback is applied to the file name only;
    # rewriting the whole joined path would also mangle directory names
    # that legitimately contain underscores.
    spaced = name.replace("_", " ") + ".ipynb"
    if spaced != filenames[0]:
        filenames.append(spaced)
    for d in (path or ['']):
        for filename in filenames:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module loader that executes a Jupyter notebook's code cells as a module.

    ``path`` is the list of directories handed to ``find_notebook`` when the
    notebook file is looked up; ``shell`` is the shared ``InteractiveShell``
    instance used to translate cell source into plain Python.
    """
    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """Import the notebook named ``fullname`` and return it as a module.

        The notebook is read as nbformat version 4, a fresh module object is
        registered in ``sys.modules`` and every code cell is executed in that
        module's namespace, in notebook order.

        Raises ImportError when no matching notebook file exists. Any
        exception raised by a cell propagates to the caller; in that case the
        partially initialised module is removed from ``sys.modules`` again.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail as an import error instead of letting io.open(None) raise
            # an unrelated TypeError further down.
            raise ImportError("no Jupyter notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules before running any cell
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # A half-executed module must not stay importable: a later
            # "import" would otherwise silently return the broken module.
            if sys.modules.get(fullname) is mod:
                del sys.modules[fullname]
            raise
        finally:
            # always hand the interactive namespace back to the user
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta-path finder that locates Jupyter notebooks.

    One ``NotebookLoader`` is created per distinct search path and cached in
    ``self.loaders``.
    """
    def __init__(self):
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Legacy finder hook: return a loader for ``fullname`` or None.

        None is returned when ``find_notebook`` finds no notebook for
        ``fullname`` under ``path``.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return

        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_spec(self, fullname, path=None, target=None):
        """Spec-based finder hook: return a ModuleSpec for ``fullname`` or None.

        Newer Python versions no longer consult ``find_module`` on meta-path
        finders, so without this method notebook imports would stop being
        intercepted there. ``target`` is accepted for protocol compatibility
        and not used.
        """
        # Imported locally so interpreters that never call find_spec
        # do not need importlib.util.
        from importlib.util import spec_from_loader

        loader = self.find_module(fullname, path)
        if loader is None:
            return None
        return spec_from_loader(fullname, loader)
    
# Register the finder exactly once. Re-running this cell (or importing a
# notebook that contains the same cell) would otherwise stack one more
# NotebookFinder on sys.meta_path each time. Stale instances are matched by
# class name because the class object itself is re-created on every run.
sys.meta_path[:] = [f for f in sys.meta_path if type(f).__name__ != 'NotebookFinder']
sys.meta_path.append(NotebookFinder())

In [10]:
# Load the shared helper module under the alias `util`.
# NOTE(review): presumably tm_assignment_util is a notebook resolved through
# the NotebookFinder registered in the previous cell — confirm.
import tm_assignment_util as util
# Helper object whose my_tokenizer method is used for preprocessing below.
myutilObj = util.util()

In [11]:
# Let's build the model.
# Apply the helper's tokenizer to every document in both the training and
# the test set.
# NOTE(review): assumes X_Cases_Trn / X_Cases_Tst are pandas Series of raw
# text (they expose .apply) — confirm in tm_assignment_util.
X_Toks_Trn = util.X_Cases_Trn.apply(myutilObj.my_tokenizer)
X_Toks_Tst = util.X_Cases_Tst.apply(myutilObj.my_tokenizer)

In [12]:
import logging
import gensim
from gensim import corpora

# Show INFO-level log records with timestamp and level in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Prepare a vocabulary dictionary from the tokenized training documents only;
# the test documents are not used here.
dictionary = corpora.Dictionary(X_Toks_Trn)
print(dictionary)


2017-10-22 15:44:28,838 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-10-22 15:44:28,862 : INFO : built Dictionary(3660 unique tokens: ['catch', 'machine', 'accident', 'inspect', 'maintain']...) from 527 documents (total 22763 corpus positions)


Dictionary(3660 unique tokens: ['catch', 'machine', 'accident', 'inspect', 'maintain']...)


In [13]:
# Drop any token that appears in fewer than 3 documents or in more than 50%
# of the documents (no_above=0.5). The dictionary object is updated by this
# call; its result is not assigned.
dictionary.filter_extremes(no_below=3, no_above=0.5)
print(dictionary)

2017-10-22 15:44:28,878 : INFO : discarding 2398 tokens: [('magnet', 1), ('grappler', 1), ('falsework', 1), ('supporter', 1), ('arch', 2), ('co-workers', 2), ('sales', 2), ('cave-in', 1), ('series', 2), ('avalanche', 2)]...
2017-10-22 15:44:28,882 : INFO : keeping 1262 tokens which were in no less than 3 and no more than 263 (=50.0%) documents
2017-10-22 15:44:28,886 : INFO : resulting dictionary: Dictionary(1262 unique tokens: ['catch', 'machine', 'accident', 'inspect', 'maintain']...)


Dictionary(1262 unique tokens: ['catch', 'machine', 'accident', 'inspect', 'maintain']...)


In [14]:
# Use the filtered dictionary to prepare a document-term matrix (term
# frequencies): one doc2bow result per tokenized training document.
dtm_train = [dictionary.doc2bow(d) for d in X_Toks_Trn]

In [15]:
# Build an LDA model with 11 topics from the DTM (200 passes over the corpus).
# random_state pins the random initialisation so that a re-run of the notebook
# reproduces the same topics; without it every run yields different topics.
lda = gensim.models.ldamodel.LdaModel(dtm_train, num_topics = 11, id2word = dictionary, passes = 200, random_state = 42)

2017-10-22 15:44:28,926 : INFO : using symmetric alpha at 0.09090909090909091
2017-10-22 15:44:28,926 : INFO : using symmetric eta at 0.000792393026941363
2017-10-22 15:44:28,930 : INFO : using serial LDA version on this node
2017-10-22 15:44:28,986 : INFO : running online (multi-pass) LDA training, 11 topics, 200 passes over the supplied corpus of 527 documents, updating model once every 527 documents, evaluating perplexity every 527 documents, iterating 50x with a convergence threshold of 0.001000
2017-10-22 15:44:30,096 : INFO : -8.695 per-word bound, 414.4 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:44:30,096 : INFO : PROGRESS: pass 0, at document #527/527
2017-10-22 15:44:30,723 : INFO : topic #6 (0.091): 0.015*"forklift" + 0.013*"finger" + 0.012*"fall" + 0.010*"coworker" + 0.009*"wall" + 0.009*"trench" + 0.009*"tree" + 0.008*"rail" + 0.008*"right" + 0.007*"area"
2017-10-22 15:44:30,723 : INFO : topic #2 (0.091): 0.013*"fall" + 0.

2017-10-22 15:44:36,231 : INFO : topic #2 (0.091): 0.021*"roof" + 0.016*"tire" + 0.016*"tile" + 0.014*"nail" + 0.014*"coworker" + 0.011*"boom" + 0.011*"fall" + 0.010*"storm" + 0.009*"replace" + 0.008*"leave"
2017-10-22 15:44:36,231 : INFO : topic diff=0.310734, rho=0.377964
2017-10-22 15:44:36,949 : INFO : -6.875 per-word bound, 117.3 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:44:36,949 : INFO : PROGRESS: pass 6, at document #527/527
2017-10-22 15:44:37,244 : INFO : topic #10 (0.091): 0.015*"tank" + 0.012*"kill" + 0.012*"cart" + 0.012*"leave" + 0.010*"steel" + 0.009*"strike" + 0.008*"april" + 0.008*"pipe" + 0.008*"load" + 0.008*"fall"
2017-10-22 15:44:37,244 : INFO : topic #3 (0.091): 0.032*"finger" + 0.018*"right" + 0.014*"hand" + 0.013*"amputate" + 0.013*"machine" + 0.010*"press" + 0.009*"pull" + 0.009*"cut" + 0.008*"die" + 0.008*"injury"
2017-10-22 15:44:37,244 : INFO : topic #7 (0.091): 0.035*"forklift" + 0.034*"belt" + 0.019*"han

2017-10-22 15:44:42,447 : INFO : topic diff=0.094486, rho=0.277350
2017-10-22 15:44:43,148 : INFO : -6.812 per-word bound, 112.4 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:44:43,148 : INFO : PROGRESS: pass 12, at document #527/527
2017-10-22 15:44:43,443 : INFO : topic #9 (0.091): 0.017*"grain" + 0.013*"line" + 0.012*"air" + 0.012*"power" + 0.011*"water" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.008*"attempt" + 0.008*"rope"
2017-10-22 15:44:43,443 : INFO : topic #8 (0.091): 0.031*"burn" + 0.013*"coworker" + 0.010*"machine" + 0.010*"hospitalize" + 0.009*"strike" + 0.009*"area" + 0.009*"golf" + 0.008*"sustain" + 0.007*"accident" + 0.007*"employer"
2017-10-22 15:44:43,443 : INFO : topic #0 (0.091): 0.074*"tree" + 0.021*"cut" + 0.019*"kill" + 0.018*"strike" + 0.014*"bulldozer" + 0.012*"fire" + 0.011*"fall" + 0.011*"vehicle" + 0.010*"second" + 0.010*"first"
2017-10-22 15:44:43,453 : INFO : topic #5 (0.091): 0.033*"truck" + 0.022*"ba

2017-10-22 15:44:49,280 : INFO : PROGRESS: pass 18, at document #527/527
2017-10-22 15:44:49,558 : INFO : topic #7 (0.091): 0.036*"belt" + 0.036*"forklift" + 0.019*"hand" + 0.016*"fan" + 0.014*"grind" + 0.012*"unit" + 0.012*"conveyor" + 0.012*"finger" + 0.012*"cut" + 0.010*"lift"
2017-10-22 15:44:49,558 : INFO : topic #6 (0.091): 0.028*"trench" + 0.018*"wall" + 0.017*"rail" + 0.015*"forklift" + 0.015*"collapse" + 0.011*"dock" + 0.010*"dog" + 0.009*"fall" + 0.009*"section" + 0.008*"door"
2017-10-22 15:44:49,558 : INFO : topic #10 (0.091): 0.015*"tank" + 0.013*"kill" + 0.012*"cart" + 0.012*"leave" + 0.012*"steel" + 0.011*"load" + 0.009*"april" + 0.009*"feet" + 0.009*"supervisor" + 0.009*"finger"
2017-10-22 15:44:49,558 : INFO : topic #2 (0.091): 0.035*"roof" + 0.019*"tire" + 0.017*"tile" + 0.015*"nail" + 0.014*"coworker" + 0.012*"boom" + 0.011*"storm" + 0.010*"lift" + 0.010*"replace" + 0.009*"pin"
2017-10-22 15:44:49,572 : INFO : topic #4 (0.091): 0.026*"lift" + 0.022*"barge" + 0.017*"bu

2017-10-22 15:44:55,439 : INFO : topic #10 (0.091): 0.015*"tank" + 0.013*"kill" + 0.012*"cart" + 0.012*"steel" + 0.012*"leave" + 0.012*"load" + 0.010*"feet" + 0.009*"april" + 0.009*"supervisor" + 0.009*"finger"
2017-10-22 15:44:55,439 : INFO : topic #1 (0.091): 0.065*"fall" + 0.022*"work" + 0.020*"scaffold" + 0.018*"strike" + 0.015*"water" + 0.015*"tractor" + 0.015*"kill" + 0.014*"believe" + 0.013*"hit" + 0.012*"collapse"
2017-10-22 15:44:55,439 : INFO : topic #0 (0.091): 0.078*"tree" + 0.023*"cut" + 0.021*"strike" + 0.021*"kill" + 0.015*"bulldozer" + 0.013*"bucket" + 0.012*"fall" + 0.012*"vehicle" + 0.011*"fire" + 0.010*"crush"
2017-10-22 15:44:55,451 : INFO : topic #5 (0.091): 0.036*"truck" + 0.026*"backhoe" + 0.021*"coworker" + 0.018*"strike" + 0.012*"operate" + 0.012*"fracture" + 0.012*"hit" + 0.012*"driver" + 0.011*"run" + 0.011*"pipe"
2017-10-22 15:44:55,451 : INFO : topic diff=0.024629, rho=0.196116
2017-10-22 15:44:56,175 : INFO : -6.770 per-word bound, 109.1 perplexity estimat

2017-10-22 15:45:01,321 : INFO : topic #3 (0.091): 0.038*"finger" + 0.022*"right" + 0.019*"hand" + 0.017*"amputate" + 0.015*"machine" + 0.011*"press" + 0.010*"pull" + 0.010*"cut" + 0.009*"saw" + 0.009*"die"
2017-10-22 15:45:01,321 : INFO : topic #5 (0.091): 0.036*"truck" + 0.026*"backhoe" + 0.021*"coworker" + 0.018*"strike" + 0.013*"operate" + 0.012*"fracture" + 0.011*"driver" + 0.011*"hit" + 0.011*"run" + 0.011*"pipe"
2017-10-22 15:45:01,328 : INFO : topic #8 (0.091): 0.034*"burn" + 0.012*"coworker" + 0.011*"machine" + 0.011*"hospitalize" + 0.010*"area" + 0.009*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.007*"accident" + 0.007*"strike"
2017-10-22 15:45:01,328 : INFO : topic diff=0.017743, rho=0.176777
2017-10-22 15:45:02,009 : INFO : -6.759 per-word bound, 108.3 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:02,009 : INFO : PROGRESS: pass 31, at document #527/527
2017-10-22 15:45:02,271 : INFO : topic #10 (0.091): 0.016*"tank" + 0.01

2017-10-22 15:45:07,102 : INFO : topic #5 (0.091): 0.036*"truck" + 0.026*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.013*"operate" + 0.012*"fracture" + 0.011*"driver" + 0.011*"run" + 0.011*"pipe" + 0.011*"hit"
2017-10-22 15:45:07,104 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"kill" + 0.013*"load" + 0.012*"cart" + 0.012*"steel" + 0.012*"leave" + 0.010*"feet" + 0.009*"supervisor" + 0.009*"april" + 0.009*"strike"
2017-10-22 15:45:07,104 : INFO : topic diff=0.013248, rho=0.162221
2017-10-22 15:45:07,791 : INFO : -6.752 per-word bound, 107.8 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:07,791 : INFO : PROGRESS: pass 37, at document #527/527
2017-10-22 15:45:08,052 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"kill" + 0.013*"load" + 0.012*"cart" + 0.012*"steel" + 0.012*"leave" + 0.010*"feet" + 0.009*"supervisor" + 0.009*"april" + 0.009*"strike"
2017-10-22 15:45:08,052 : INFO : topic #2 (0.091): 0.037*"roof" + 0.020*"tire" 

2017-10-22 15:45:12,960 : INFO : topic #0 (0.091): 0.078*"tree" + 0.024*"cut" + 0.022*"strike" + 0.022*"bucket" + 0.021*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.011*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:45:12,960 : INFO : topic diff=0.009968, rho=0.150756
2017-10-22 15:45:13,639 : INFO : -6.747 per-word bound, 107.4 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:13,639 : INFO : PROGRESS: pass 43, at document #527/527
2017-10-22 15:45:13,918 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"power" + 0.013*"air" + 0.012*"water" + 0.011*"tank" + 0.011*"rope" + 0.010*"get" + 0.009*"tiger" + 0.008*"attempt"
2017-10-22 15:45:13,918 : INFO : topic #8 (0.091): 0.035*"burn" + 0.012*"coworker" + 0.011*"machine" + 0.011*"hospitalize" + 0.010*"area" + 0.009*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.007*"accident" + 0.007*"employer"
2017-10-22 15:45:13,918 : INFO : topic #6 (0.091): 0.029*"trench" + 0.018*"ra

2017-10-22 15:45:18,738 : INFO : topic diff=0.007437, rho=0.141421
2017-10-22 15:45:19,425 : INFO : -6.743 per-word bound, 107.1 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:19,425 : INFO : PROGRESS: pass 49, at document #527/527
2017-10-22 15:45:19,685 : INFO : topic #4 (0.091): 0.028*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.014*"aerial" + 0.011*"break" + 0.010*"floor" + 0.010*"kill" + 0.010*"witness" + 0.009*"elevator"
2017-10-22 15:45:19,685 : INFO : topic #1 (0.091): 0.072*"fall" + 0.024*"work" + 0.021*"scaffold" + 0.019*"strike" + 0.016*"kill" + 0.016*"water" + 0.015*"hit" + 0.015*"tractor" + 0.015*"collapse" + 0.014*"wall"
2017-10-22 15:45:19,685 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"power" + 0.013*"air" + 0.013*"water" + 0.011*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.008*"attempt"
2017-10-22 15:45:19,693 : INFO : topic #7 (0.091): 0.049*"forklift" + 0.036*"belt" + 0.01

2017-10-22 15:45:25,174 : INFO : PROGRESS: pass 55, at document #527/527
2017-10-22 15:45:25,434 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april" + 0.009*"strike"
2017-10-22 15:45:25,434 : INFO : topic #8 (0.091): 0.035*"burn" + 0.012*"coworker" + 0.012*"machine" + 0.011*"hospitalize" + 0.010*"area" + 0.009*"sustain" + 0.009*"golf" + 0.008*"clean" + 0.007*"fire" + 0.007*"employer"
2017-10-22 15:45:25,434 : INFO : topic #0 (0.091): 0.078*"tree" + 0.024*"bucket" + 0.024*"cut" + 0.022*"strike" + 0.021*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:45:25,447 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.014*"aerial" + 0.011*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"witness" + 0.009*"elevator"
2017-10-22 15:45:25,447 : INFO : topic #6 (0.091): 0.029*"trench" + 0.01

2017-10-22 15:45:31,460 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"power" + 0.013*"water" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.008*"go"
2017-10-22 15:45:31,464 : INFO : topic #5 (0.091): 0.035*"truck" + 0.025*"backhoe" + 0.023*"coworker" + 0.018*"strike" + 0.013*"fracture" + 0.013*"operate" + 0.012*"run" + 0.011*"pipe" + 0.011*"driver" + 0.010*"tractor"
2017-10-22 15:45:31,464 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april" + 0.009*"strike"
2017-10-22 15:45:31,464 : INFO : topic #3 (0.091): 0.042*"finger" + 0.023*"right" + 0.021*"hand" + 0.018*"amputate" + 0.015*"machine" + 0.011*"press" + 0.011*"cut" + 0.010*"pull" + 0.010*"saw" + 0.009*"die"
2017-10-22 15:45:31,468 : INFO : topic diff=0.004551, rho=0.125988
2017-10-22 15:45:32,209 : INFO : -6.738 per-word bound, 106.8 perplexity estimate based on a hel

2017-10-22 15:45:37,546 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.014*"aerial" + 0.011*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"witness" + 0.009*"elevator"
2017-10-22 15:45:37,553 : INFO : topic #2 (0.091): 0.037*"roof" + 0.021*"tire" + 0.019*"nail" + 0.018*"tile" + 0.015*"coworker" + 0.011*"storm" + 0.011*"lift" + 0.011*"boom" + 0.010*"replace" + 0.010*"pin"
2017-10-22 15:45:37,553 : INFO : topic diff=0.003794, rho=0.120386
2017-10-22 15:45:38,249 : INFO : -6.737 per-word bound, 106.6 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:38,253 : INFO : PROGRESS: pass 68, at document #527/527
2017-10-22 15:45:38,519 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"power" + 0.013*"water" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:45:38,519 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.0

2017-10-22 15:45:43,396 : INFO : topic #1 (0.091): 0.073*"fall" + 0.025*"work" + 0.022*"scaffold" + 0.019*"strike" + 0.017*"hit" + 0.016*"kill" + 0.016*"collapse" + 0.015*"water" + 0.015*"wall" + 0.014*"tractor"
2017-10-22 15:45:43,396 : INFO : topic diff=0.003435, rho=0.115470
2017-10-22 15:45:44,120 : INFO : -6.735 per-word bound, 106.5 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:44,120 : INFO : PROGRESS: pass 74, at document #527/527
2017-10-22 15:45:44,408 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"feet" + 0.010*"supervisor" + 0.010*"strike" + 0.009*"april"
2017-10-22 15:45:44,408 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.014*"aerial" + 0.012*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"witness" + 0.009*"elevator"
2017-10-22 15:45:44,408 : INFO : topic #5 (0.091): 0.035*"truck" + 0.025*"backhoe"

2017-10-22 15:45:49,215 : INFO : topic diff=0.002966, rho=0.111111
2017-10-22 15:45:49,881 : INFO : -6.734 per-word bound, 106.4 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:45:49,897 : INFO : PROGRESS: pass 80, at document #527/527
2017-10-22 15:45:50,156 : INFO : topic #0 (0.091): 0.079*"tree" + 0.025*"bucket" + 0.024*"cut" + 0.022*"strike" + 0.021*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:45:50,156 : INFO : topic #6 (0.091): 0.029*"trench" + 0.018*"rail" + 0.016*"wall" + 0.013*"collapse" + 0.012*"concrete" + 0.012*"forklift" + 0.012*"dock" + 0.010*"dog" + 0.009*"door" + 0.009*"section"
2017-10-22 15:45:50,156 : INFO : topic #7 (0.091): 0.053*"forklift" + 0.035*"belt" + 0.018*"grind" + 0.016*"hand" + 0.014*"fan" + 0.014*"unit" + 0.012*"cut" + 0.011*"conveyor" + 0.010*"kill" + 0.010*"leave"
2017-10-22 15:45:50,156 : INFO : topic #5 (0.091): 0.035*"truck" + 0.025*"backhoe" +

2017-10-22 15:45:55,583 : INFO : PROGRESS: pass 86, at document #527/527
2017-10-22 15:45:55,833 : INFO : topic #1 (0.091): 0.073*"fall" + 0.025*"work" + 0.023*"scaffold" + 0.019*"strike" + 0.018*"hit" + 0.017*"kill" + 0.016*"collapse" + 0.015*"water" + 0.015*"wall" + 0.014*"believe"
2017-10-22 15:45:55,833 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"feet" + 0.010*"strike" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:45:55,833 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"water" + 0.013*"power" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:45:55,849 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.011*"cut" + 0.011*"press" + 0.010*"pull" + 0.010*"fan" + 0.009*"saw"
2017-10-22 15:45:55,849 : INFO : topic #6 (0.091): 0.029*"trench" + 0.018*"rail" + 0.016*"wall" + 

2017-10-22 15:46:01,618 : INFO : topic #0 (0.091): 0.080*"tree" + 0.025*"bucket" + 0.024*"cut" + 0.022*"strike" + 0.021*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:46:01,618 : INFO : topic #6 (0.091): 0.029*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.012*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.009*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:46:01,618 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.011*"cut" + 0.011*"press" + 0.010*"pull" + 0.010*"fan" + 0.009*"saw"
2017-10-22 15:46:01,627 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.013*"kill" + 0.013*"steel" + 0.013*"cart" + 0.011*"leave" + 0.010*"feet" + 0.010*"strike" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:46:01,627 : INFO : topic diff=0.002630, rho=0.103142
2017-10-22 15:46:02,308 : INFO : -6.731 per-word bound, 106.2 perplexity estimate based on a 

2017-10-22 15:46:07,234 : INFO : topic #2 (0.091): 0.038*"roof" + 0.022*"tire" + 0.020*"nail" + 0.018*"tile" + 0.015*"coworker" + 0.012*"storm" + 0.011*"lift" + 0.011*"boom" + 0.011*"replace" + 0.010*"pin"
2017-10-22 15:46:07,234 : INFO : topic #0 (0.091): 0.080*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.022*"strike" + 0.021*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:46:07,248 : INFO : topic #7 (0.091): 0.060*"forklift" + 0.034*"belt" + 0.018*"grind" + 0.016*"hand" + 0.013*"unit" + 0.013*"fan" + 0.012*"cut" + 0.011*"kill" + 0.011*"conveyor" + 0.010*"leave"
2017-10-22 15:46:07,248 : INFO : topic diff=0.002374, rho=0.100000
2017-10-22 15:46:07,921 : INFO : -6.729 per-word bound, 106.1 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:07,921 : INFO : PROGRESS: pass 99, at document #527/527
2017-10-22 15:46:08,170 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wal

2017-10-22 15:46:12,882 : INFO : topic #2 (0.091): 0.038*"roof" + 0.022*"tire" + 0.020*"nail" + 0.018*"tile" + 0.016*"coworker" + 0.012*"storm" + 0.011*"lift" + 0.011*"boom" + 0.011*"replace" + 0.010*"pin"
2017-10-22 15:46:12,882 : INFO : topic #1 (0.091): 0.075*"fall" + 0.026*"work" + 0.024*"scaffold" + 0.020*"strike" + 0.017*"hit" + 0.017*"collapse" + 0.017*"kill" + 0.016*"water" + 0.015*"wall" + 0.015*"believe"
2017-10-22 15:46:12,882 : INFO : topic diff=0.002088, rho=0.097129
2017-10-22 15:46:13,556 : INFO : -6.728 per-word bound, 106.0 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:13,556 : INFO : PROGRESS: pass 105, at document #527/527
2017-10-22 15:46:13,798 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.012*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:46:13,798 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.0

2017-10-22 15:46:18,602 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"water" + 0.013*"power" + 0.012*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:46:18,602 : INFO : topic diff=0.001844, rho=0.094491
2017-10-22 15:46:19,268 : INFO : -6.728 per-word bound, 106.0 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:19,268 : INFO : PROGRESS: pass 111, at document #527/527
2017-10-22 15:46:19,531 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.013*"aerial" + 0.012*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"bin" + 0.010*"witness"
2017-10-22 15:46:19,531 : INFO : topic #0 (0.091): 0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.022*"strike" + 0.022*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:46:19,531 : INFO : topic #1 (0.091): 0.075*"fall" + 0.026*"work" + 0.024*"scaffold" +

2017-10-22 15:46:24,261 : INFO : topic diff=0.001627, rho=0.092057
2017-10-22 15:46:24,934 : INFO : -6.727 per-word bound, 105.9 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:24,934 : INFO : PROGRESS: pass 117, at document #527/527
2017-10-22 15:46:25,183 : INFO : topic #2 (0.091): 0.038*"roof" + 0.022*"tire" + 0.020*"nail" + 0.018*"tile" + 0.016*"coworker" + 0.012*"storm" + 0.011*"lift" + 0.011*"boom" + 0.011*"replace" + 0.010*"pin"
2017-10-22 15:46:25,199 : INFO : topic #0 (0.091): 0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.022*"strike" + 0.022*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:46:25,199 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:46:25,199 : INFO : topic #1 (0.091): 0.075*"fall" + 0.026*"work" + 0.024*"

2017-10-22 15:46:30,845 : INFO : topic #7 (0.091): 0.065*"forklift" + 0.033*"belt" + 0.017*"grind" + 0.015*"hand" + 0.013*"unit" + 0.013*"kill" + 0.012*"cut" + 0.012*"fan" + 0.011*"conveyor" + 0.010*"leave"
2017-10-22 15:46:30,845 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:46:30,845 : INFO : topic #0 (0.091): 0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.023*"strike" + 0.022*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:46:30,845 : INFO : topic #1 (0.091): 0.075*"fall" + 0.027*"work" + 0.024*"scaffold" + 0.020*"strike" + 0.017*"collapse" + 0.017*"hit" + 0.017*"kill" + 0.016*"wall" + 0.016*"water" + 0.015*"believe"
2017-10-22 15:46:30,855 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.015*"tractor" + 0.014*"fracture" + 0

2017-10-22 15:46:36,544 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:46:36,544 : INFO : topic #1 (0.091): 0.075*"fall" + 0.027*"work" + 0.024*"scaffold" + 0.020*"strike" + 0.017*"collapse" + 0.017*"kill" + 0.017*"hit" + 0.016*"wall" + 0.016*"water" + 0.015*"believe"
2017-10-22 15:46:36,544 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.013*"operate" + 0.012*"driver" + 0.012*"run" + 0.010*"pipe"
2017-10-22 15:46:36,556 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:46:36,556 : INFO : topic diff=0.001263, rho=0.087370
2017-10-22 15:46:37,229 : INFO : -6.726 per-word bound, 105.8 perplexity estima

2017-10-22 15:46:42,170 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.012*"coworker" + 0.011*"hospitalize" + 0.011*"fire" + 0.010*"area" + 0.010*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.007*"employer"
2017-10-22 15:46:42,170 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.011*"cut" + 0.011*"press" + 0.010*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:46:42,182 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"water" + 0.013*"power" + 0.012*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:46:42,182 : INFO : topic diff=0.001130, rho=0.085436
2017-10-22 15:46:42,848 : INFO : -6.725 per-word bound, 105.8 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:42,848 : INFO : PROGRESS: pass 136, at document #527/527
2017-10-22 15:46:43,109 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.02

2017-10-22 15:46:47,792 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.013*"operate" + 0.012*"driver" + 0.012*"run" + 0.010*"pipe"
2017-10-22 15:46:47,797 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.013*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:46:47,797 : INFO : topic diff=0.001021, rho=0.083624
2017-10-22 15:46:48,556 : INFO : -6.725 per-word bound, 105.8 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:48,556 : INFO : PROGRESS: pass 142, at document #527/527
2017-10-22 15:46:48,820 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.012*"coworker" + 0.011*"hospitalize" + 0.011*"fire" + 0.010*"area" + 0.010*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.007*"employer"
2017-10-22 15:46:48,820 : INFO : topic #9 (0.091): 0.017*"grain"

2017-10-22 15:46:53,492 : INFO : topic #1 (0.091): 0.075*"fall" + 0.027*"work" + 0.024*"scaffold" + 0.020*"strike" + 0.017*"collapse" + 0.017*"kill" + 0.017*"hit" + 0.016*"wall" + 0.016*"water" + 0.015*"believe"
2017-10-22 15:46:53,492 : INFO : topic diff=0.000918, rho=0.081923
2017-10-22 15:46:54,162 : INFO : -6.724 per-word bound, 105.7 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:54,162 : INFO : PROGRESS: pass 148, at document #527/527
2017-10-22 15:46:54,422 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:46:54,422 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.012*"coworker" + 0.011*"hospitalize" + 0.011*"fire" + 0.010*"area" + 0.010*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.007*"employer"
2017-10-22 15:46:54,422 : INFO : topic #7 (0.091): 0.066*"forklift" + 0.

2017-10-22 15:46:59,078 : INFO : topic diff=0.000850, rho=0.080322
2017-10-22 15:46:59,756 : INFO : -6.724 per-word bound, 105.7 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:46:59,756 : INFO : PROGRESS: pass 154, at document #527/527
2017-10-22 15:47:00,016 : INFO : topic #2 (0.091): 0.038*"roof" + 0.023*"tire" + 0.020*"nail" + 0.018*"tile" + 0.016*"coworker" + 0.012*"storm" + 0.011*"lift" + 0.011*"boom" + 0.011*"replace" + 0.010*"pin"
2017-10-22 15:47:00,016 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.009*"section" + 0.009*"call"
2017-10-22 15:47:00,016 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:00,024 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*

2017-10-22 15:47:05,456 : INFO : PROGRESS: pass 160, at document #527/527
2017-10-22 15:47:05,699 : INFO : topic #9 (0.091): 0.017*"grain" + 0.014*"line" + 0.013*"water" + 0.013*"power" + 0.012*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:47:05,699 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.013*"aerial" + 0.012*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"bin" + 0.010*"witness"
2017-10-22 15:47:05,699 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.013*"operate" + 0.012*"driver" + 0.012*"run" + 0.010*"pipe"
2017-10-22 15:47:05,699 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:05,714 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail" + 0.016*"wa

2017-10-22 15:47:11,314 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:11,314 : INFO : topic #0 (0.091): 0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.023*"strike" + 0.023*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"vehicle" + 0.011*"head"
2017-10-22 15:47:11,314 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.013*"operate" + 0.012*"run" + 0.012*"driver" + 0.010*"pipe"
2017-10-22 15:47:11,325 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.012*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:47:11,325 : INFO : topic diff=0.000651, rho=0.077152
2017-10-22 15:47:12,003 : INFO : -6.724 per-word bound, 105.7 perplexity estimate b

2017-10-22 15:47:16,997 : INFO : topic #7 (0.091): 0.067*"forklift" + 0.032*"belt" + 0.017*"grind" + 0.015*"hand" + 0.014*"kill" + 0.013*"unit" + 0.012*"cut" + 0.011*"fan" + 0.011*"conveyor" + 0.010*"leave"
2017-10-22 15:47:16,997 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.012*"coworker" + 0.011*"fire" + 0.011*"hospitalize" + 0.010*"area" + 0.010*"golf" + 0.009*"sustain" + 0.008*"clean" + 0.008*"employer"
2017-10-22 15:47:16,999 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:16,999 : INFO : topic diff=0.000587, rho=0.075810
2017-10-22 15:47:17,669 : INFO : -6.723 per-word bound, 105.7 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:47:17,669 : INFO : PROGRESS: pass 173, at document #527/527
2017-10-22 15:47:17,916 : INFO : topic #6 (0.091): 0.030*"trench" + 0.018*"rail"

2017-10-22 15:47:22,693 : INFO : topic #9 (0.091): 0.017*"grain" + 0.015*"line" + 0.013*"water" + 0.013*"power" + 0.012*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:47:22,695 : INFO : topic #5 (0.091): 0.034*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.012*"operate" + 0.012*"run" + 0.012*"driver" + 0.010*"pipe"
2017-10-22 15:47:22,695 : INFO : topic diff=0.000553, rho=0.074536
2017-10-22 15:47:23,353 : INFO : -6.723 per-word bound, 105.7 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:47:23,353 : INFO : PROGRESS: pass 179, at document #527/527
2017-10-22 15:47:23,626 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.013*"aerial" + 0.012*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"bin" + 0.010*"witness"
2017-10-22 15:47:23,626 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.012*

2017-10-22 15:47:28,315 : INFO : topic diff=0.000524, rho=0.073324
2017-10-22 15:47:28,985 : INFO : -6.723 per-word bound, 105.6 perplexity estimate based on a held-out corpus of 527 documents with 19105 words
2017-10-22 15:47:29,000 : INFO : PROGRESS: pass 185, at document #527/527
2017-10-22 15:47:29,244 : INFO : topic #0 (0.091): 0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.023*"strike" + 0.023*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"head" + 0.011*"vehicle"
2017-10-22 15:47:29,244 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:29,244 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.012*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"april"
2017-10-22 15:47:29,256 : INFO : topic #9 (0.091): 0.017*"grain" + 0.015*"line" + 0.01

2017-10-22 15:47:34,615 : INFO : PROGRESS: pass 191, at document #527/527
2017-10-22 15:47:34,862 : INFO : topic #9 (0.091): 0.017*"grain" + 0.015*"line" + 0.013*"water" + 0.013*"power" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"
2017-10-22 15:47:34,862 : INFO : topic #5 (0.091): 0.033*"truck" + 0.024*"backhoe" + 0.022*"coworker" + 0.018*"strike" + 0.016*"tractor" + 0.014*"fracture" + 0.012*"operate" + 0.012*"run" + 0.012*"driver" + 0.010*"pipe"
2017-10-22 15:47:34,862 : INFO : topic #4 (0.091): 0.029*"lift" + 0.020*"barge" + 0.016*"build" + 0.014*"boom" + 0.013*"aerial" + 0.012*"floor" + 0.011*"break" + 0.010*"kill" + 0.010*"bin" + 0.010*"witness"
2017-10-22 15:47:34,862 : INFO : topic #7 (0.091): 0.067*"forklift" + 0.032*"belt" + 0.017*"grind" + 0.014*"hand" + 0.014*"kill" + 0.013*"unit" + 0.012*"cut" + 0.011*"conveyor" + 0.010*"fan" + 0.010*"leave"
2017-10-22 15:47:34,876 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill

2017-10-22 15:47:40,538 : INFO : topic #7 (0.091): 0.067*"forklift" + 0.032*"belt" + 0.017*"grind" + 0.014*"hand" + 0.014*"kill" + 0.013*"unit" + 0.012*"cut" + 0.011*"conveyor" + 0.010*"fan" + 0.010*"leave"
2017-10-22 15:47:40,538 : INFO : topic #10 (0.091): 0.016*"tank" + 0.014*"load" + 0.014*"kill" + 0.013*"steel" + 0.012*"cart" + 0.012*"leave" + 0.010*"strike" + 0.010*"feet" + 0.010*"supervisor" + 0.009*"contractor"
2017-10-22 15:47:40,538 : INFO : topic #8 (0.091): 0.036*"burn" + 0.012*"machine" + 0.011*"coworker" + 0.011*"fire" + 0.011*"hospitalize" + 0.010*"area" + 0.010*"golf" + 0.010*"sustain" + 0.009*"clean" + 0.008*"employer"
2017-10-22 15:47:40,545 : INFO : topic #3 (0.091): 0.043*"finger" + 0.024*"right" + 0.022*"hand" + 0.019*"amputate" + 0.016*"machine" + 0.012*"cut" + 0.011*"press" + 0.011*"fan" + 0.010*"pull" + 0.010*"belt"
2017-10-22 15:47:40,545 : INFO : topic diff=0.000485, rho=0.070888
2017-10-22 15:47:41,211 : INFO : -6.723 per-word bound, 105.6 perplexity estimate

In [16]:
# To see the topics, with the most probable words in each topic. What topics do you see?
# show_topics() returns only 10 topics by default, so request every topic of the model
# explicitly; otherwise a model with more than 10 topics is shown only partially.
lda.show_topics(num_topics=lda.num_topics)

[(9,
  '0.017*"grain" + 0.015*"line" + 0.013*"water" + 0.013*"power" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go"'),
 (7,
  '0.067*"forklift" + 0.032*"belt" + 0.017*"grind" + 0.014*"hand" + 0.014*"kill" + 0.013*"unit" + 0.012*"cut" + 0.011*"conveyor" + 0.010*"fan" + 0.010*"leave"'),
 (6,
  '0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.010*"section" + 0.010*"coworker"'),
 (2,
  '0.039*"roof" + 0.024*"tire" + 0.021*"nail" + 0.019*"tile" + 0.016*"coworker" + 0.012*"storm" + 0.011*"lift" + 0.011*"replace" + 0.010*"pin" + 0.010*"boom"'),
 (0,
  '0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.023*"strike" + 0.023*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"head" + 0.011*"loader"'),
 (1,
  '0.075*"fall" + 0.027*"work" + 0.024*"scaffold" + 0.020*"strike" + 0.017*"collapse" + 0.017*"kill" + 0.017*"hit" + 0.016*"wall" + 0.016*"water" + 0.015*"bel

In [17]:
# You can also request to see more words per topic.
# num_topics is passed as well so that all topics are listed, not just the default 10.
lda.show_topics(num_topics=lda.num_topics, num_words=20)

[(6,
  '0.030*"trench" + 0.018*"rail" + 0.016*"wall" + 0.014*"collapse" + 0.013*"concrete" + 0.012*"dock" + 0.010*"dog" + 0.010*"door" + 0.010*"section" + 0.010*"coworker" + 0.009*"call" + 0.008*"fall" + 0.008*"grind" + 0.007*"area" + 0.007*"top" + 0.007*"police" + 0.007*"rescue" + 0.007*"screen" + 0.007*"plate" + 0.007*"scene"'),
 (9,
  '0.017*"grain" + 0.015*"line" + 0.013*"water" + 0.013*"power" + 0.013*"air" + 0.012*"rope" + 0.011*"tank" + 0.010*"get" + 0.009*"tiger" + 0.009*"go" + 0.008*"animal" + 0.008*"attempt" + 0.007*"kill" + 0.007*"room" + 0.007*"suffocate" + 0.007*"fee" + 0.007*"pump" + 0.007*"wire" + 0.007*"weld" + 0.006*"medical"'),
 (0,
  '0.081*"tree" + 0.026*"bucket" + 0.025*"cut" + 0.023*"strike" + 0.023*"kill" + 0.015*"bulldozer" + 0.012*"fall" + 0.012*"crush" + 0.011*"head" + 0.011*"loader" + 0.011*"vehicle" + 0.011*"second" + 0.010*"front" + 0.010*"first" + 0.010*"grind" + 0.010*"river" + 0.008*"track" + 0.008*"backhoe" + 0.008*"begin" + 0.007*"lodge"'),
 (2,
  '0.0

In [18]:
# A similar function showing each topic with its most probable words and its topic coherence score.
# The result is a list of (topic, coherence) pairs, where each topic is itself a list of
# (probability, word) pairs.
lda.top_topics(dtm_train)

[([(0.033357211010191276, 'truck'),
   (0.023920346568057468, 'backhoe'),
   (0.02238723727971919, 'coworker'),
   (0.017766885883082752, 'strike'),
   (0.016084270943826057, 'tractor'),
   (0.014140281142242046, 'fracture'),
   (0.012418205329150793, 'operate'),
   (0.012164570605386728, 'run'),
   (0.012108340369185375, 'driver'),
   (0.010087377970671836, 'pipe'),
   (0.0096890635677625743, 'kill'),
   (0.009586584713052097, 'wood'),
   (0.0095705672964696252, 'hospitalize'),
   (0.0094580550293080108, 'transport'),
   (0.0093060098727045515, 'side'),
   (0.0087445090558081699, 'take'),
   (0.0087193476031280272, 'hit'),
   (0.0085746436038259492, 'treat'),
   (0.0084893535370799186, 'piece'),
   (0.0083841231329047639, 'eye')],
  -3.6487844874214344),
 ([(0.03585723763174288, 'burn'),
   (0.012095464363286514, 'machine'),
   (0.011465761679103669, 'coworker'),
   (0.01117948016975634, 'fire'),
   (0.010927598955686972, 'hospitalize'),
   (0.01032128434616416, 'area'),
   (0.0096471

In [19]:
# Averaging the per-topic coherence scores gives a single number for evaluating the topic model.
# Each item returned by top_topics() is a (topic, coherence) pair; keep only the coherence.
import numpy as np
lda_coherence = [topic_and_score[1] for topic_and_score in lda.top_topics(dtm_train)]
np.mean(lda_coherence)

-6.4027733826063375

In [20]:
# Another metric for gauging goodness of models: bound() estimates the variational bound of the corpus.
# NOTE: this is a corpus-level log-likelihood bound, not the perplexity itself; the training log
# reports perplexity as 2 ** (-per-word bound), and log_perplexity() returns that per-word bound.
lda.bound(dtm_train)

-128434.13892029956

In [21]:
# Get the topic distribution of every document in the training corpus.
# Each entry is a list of (topic_id, probability) pairs; judging by the printed examples,
# topics with negligible probability may be left out of a document's list.
dtopics_train = lda.get_document_topics(dtm_train)

In [22]:
# Print the topic likelihoods of the first 10 documents in the train set
for doc_idx in range(10):
    print(dtopics_train[doc_idx])

[(8, 0.34565691816113764), (10, 0.61127729395773556)]
[(4, 0.90908590416029844)]
[(1, 0.20256653466227442), (4, 0.75197322715344939)]
[(0, 0.022727272743324367), (1, 0.77272109699353175), (2, 0.022728708901654519), (3, 0.022727602836130161), (4, 0.022727272737359364), (5, 0.022727705739514915), (6, 0.022730869863781266), (7, 0.022727272743301951), (8, 0.022727272733904555), (9, 0.022727272744105843), (10, 0.022727651963391604)]
[(0, 0.015151766956041979), (1, 0.015152768993302937), (2, 0.015151515154504601), (3, 0.015152175888982706), (4, 0.015152385059782329), (5, 0.84847951818855472), (6, 0.015151660497561288), (7, 0.015152415681985923), (8, 0.015152118761561575), (9, 0.01515165205994006), (10, 0.015152022757781873)]
[(0, 0.010101865702996557), (1, 0.010102554030031683), (2, 0.010101166411759736), (3, 0.010101455251340501), (4, 0.010101827374635709), (5, 0.01010145954173706), (6, 0.01010109793064606), (7, 0.010102104205009295), (8, 0.010101913109894466), (9, 0.30289239557283015), (10

In [75]:
# Pick the topic with the highest probability for each document, map it to the label
# NOTE: the mapping may change in a different run, because LDA topic ids are arbitrary
from operator import itemgetter
top_train = [max(t, key=itemgetter(1))[0] for t in dtopics_train]
# Create one placeholder label per topic the model actually has. A hand-written mapping that
# stops at topic 7 raises KeyError as soon as a document's best topic is 8 or higher
# (the training log reports topic ids up to 10). Override individual entries once the
# topics have been inspected, e.g. dict[3] = '<category name>'.
# NOTE(review): `dict` shadows the builtin; the name is kept because the evaluation cell
# indexes `dict[...]` as well.
dict = {topic_id: 'tbd' for topic_id in range(lda.num_topics)}
topic_train = [dict[t] for t in top_train]

In [76]:
# Now let's see how well these topics match the actual categories

from sklearn import metrics


def _report_topic_match(y_true, y_pred):
    """Print the confusion matrix, accuracy and per-class report for predicted labels.

    y_true -- the annotated category of each document
    y_pred -- the label derived from each document's most probable topic
    """
    # scikit-learn's metrics take (y_true, y_pred) in that order; with the arguments swapped
    # the confusion matrix is transposed and precision/recall are exchanged in the report.
    print(metrics.confusion_matrix(y_true, y_pred))
    # accuracy_score compares element by element whatever the container types are,
    # whereas `list == list` collapses to a single boolean before the mean is taken.
    print(metrics.accuracy_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))


# NOTE(review): the saved output of this cell is a ValueError about inconsistent sample
# counts, which looks like stale kernel state (labels and topic lists coming from
# different splits) -- re-run from a fresh kernel to confirm.
_report_topic_match(Y_Cases_Trn, topic_train)

# The typical practice is to use the reserved test set for evaluation
X_Toks_Test = X_Cases_Tst.apply(my_tokenizer)
dtm_test = [dictionary.doc2bow(d) for d in X_Toks_Test]
dtopics_test = lda.get_document_topics(dtm_test)
top_test = [max(t, key=itemgetter(1))[0] for t in dtopics_test]
topic_test = [dict[t] for t in top_test]
_report_topic_match(Y_Cases_Tst, topic_test)

ValueError: Found input variables with inconsistent numbers of samples: [181, 53]

In [None]:
# Finally, run the model on the untagged data to get the corresponding topics
from IPython.display import display

# A bare expression is rendered only when it is the last line of a cell, so the preview
# has to go through display() to actually show up.
display(AccidentCases_Osha.head())
AccidentCases_Osha_Tag = AccidentCases_Osha.apply(my_tokenizer)
dtm_osha = [dictionary.doc2bow(d) for d in AccidentCases_Osha_Tag]
dtopics_osha = lda.get_document_topics(dtm_osha)
# Print the topic distribution of the first 10 cases
for i in range(0, 10):
    print(dtopics_osha[i])
