In [1]:
import numpy as np
import pandas as pd

import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product

In [2]:
## =======================================================================
## Trie node class, mostly used to track the path that the process takes
## =======================================================================
class TrieNode:
    """One node of a process trie.

    Attributes:
        keys:       list of key values identifying this node among its siblings.
        parent:     the parent TrieNode, or None for a root node.
        depth:      parent.depth + 1 when a parent is given, otherwise the
                    ``depth`` argument.
        children:   list of child TrieNode objects, in creation order.
        count:      visit counter; starts at 1 when the node is created.
        head_count: counter that starts at 0 (incremented by callers).
        tail_count: counter that starts at 0 (incremented by callers).
    """

    def __init__(self, keys, depth=0, parent=None):
        """Create a node.

        Args:
            keys:   a list of keys, or a single key which is wrapped in a list.
            depth:  depth to use when no parent is given.
            parent: optional parent node; when given it determines the depth.
        """
        ## Keep a private list of keys.  Callers keep mutating the lists they
        ##     hand in (appending markers such as "HEAD"/"TAIL"), and storing
        ##     the very same list would silently rename this node.
        self.keys = TrieNode._asKeyList(keys)
        self.parent = parent
        ## A node with a parent always sits one level below it
        self.depth = depth if parent is None else parent.depth + 1

        ## Children are created on demand by updateChlidren
        self.children = []
        ## How many times the node was seen, and how often as a head / tail
        self.count, self.head_count, self.tail_count = 1, 0, 0

    @staticmethod
    def _asKeyList(keys):
        """Return ``keys`` as a new list; a non-list value becomes ``[keys]``."""
        return list(keys) if isinstance(keys, list) else [keys]

    def updateChlidren(self, keys):
        """Return the child whose keys equal ``keys``, creating it if missing.

        ``keys`` is normalised the same way as in the constructor, so a
        single key finds the child that was created from that single key.
        """
        wanted = TrieNode._asKeyList(keys)
        for child in self.children:
            if child.keys == wanted:
                return child

        child = TrieNode(wanted, parent=self)
        self.children.append(child)
        return child

    def printTree(self, offset=0):
        """Print this node and all descendants, indented by depth.

        Args:
            offset: position of this node among its siblings (0 for a root).
        """
        print("    "*self.depth + f" => ({self.depth}, {offset}) [ keys : {str(self.keys)} ]")
        for child_offset, child in enumerate(self.children):
            child.printTree(child_offset)

In [3]:
## =======================================================================
## Trie to build process paths and allow us to calculate statistics using
##     pandas dataframes.
## =======================================================================
class ProcessTrie:
    def makeEmptyDict():
        return {
            "entity": [],
            "start_activity": [],             "stop_activity": [],
            "start_timestamp": [],            "stop_timestamp": [],
            "latitude1": [],                    "longitude1": [],
            "latitude2": [],                    "longitude2": [],
            "resuources": []
        }

    ## Constructor
    def __init__(self, data=None, entity_col="entity", entity_cols=None,
                 activity_col="activity", activity_cols=None,
                 timestamp_col="timestamp", resource_col="resources",
                 latitude_col="latitude", longitude_col="longitude",
                 looping=False, max_step=10000,
                 sep_tails=True, sep_heads=True,
                 debug=False,
                ):
        self.processTrees = []
        self.dataframe = None

        ## What are the definition for our column names
        if type(entity_col) is list: self.entity_cols = entity_col
        elif entity_cols is None:    self.entity_cols = [entity_col]
        else:                        self.entity_cols = entity_cols
        ## Activity Columns
        if type(activity_col) is list: self.activity_cols = activity_col
        elif activity_col is None:     self.activity_cols = [activity_col]
        else:                          self.activity_cols = activity_cols

        self.timestamp_col = timestamp_col
        self.resource_col = resource_col
        self.latitude_col = latitude_col
        self.longitude_col = longitude_col

        ## Some variables for tracking if something is looping and has
        ##     a number of max loops (only way I currently determine the
        ##     end of loops).  Also, should we track the nodes that are
        ##     nodes/heads seperate from the regular middle nodes.
        self.looping, self.max_step, self.sep_tails, self.sep_heads = looping, max_step, sep_tails, sep_heads
        self.debug = debug

        if data is not None:
            self.buildFromDataFrame(data)

    ## Update/append to the heads of our different process trees
    def updateHeadTrees(self, keys):
        f = False
        t = None

        key_copy = keys.copy()
        for t in self.processTrees:
            if set(t.keys) == set(key_copy):
                f = True
                break

        if not f:
            t = TrieNode(keys, depth=0)
            self.processTrees.append(t)

        if self.sep_heads == True:
            t.head_count += 1

        return t

    ## Build / expand our process_trie and dataframe from the
    ##     dataframe that we are passed as a parameter
    def buildFromDataFrame(self, data):
        records = ProcessTrie.makeEmpthDict()

        ## Make a copy of our data, and sort the values so that they come up correct
        sort_by = [c for c in self.entity_cols]
        sort_by.append(self.timestamp_col)
        sort_by.extend(self.activity_cols)
        #print(f"-> Sort values by {str(sort_by)}")
        data = data.copy().sort_values(sort_by)

        prev_entity          = [data.iloc[0][c] for c in self.entity_cols]
        prev_activity        = [data.iloc[0][c] for c in self.activity_cols]
        prev_timestamp       = data.iloc[0][self.timestamp_col]
        if self.sep_heads == True: prev_activity.append("HEAD")
        prev_node            = self.updateHeadTrees(prev_activity)
        if self.sep_heads == True: prev_node.head_count += 1
        else:                      prev_node.count += 1

        # What step / edge are we on
        step = 0
        edge = 0
#        print(f"entity: {prev_entity}, \t activity: {prev_activity}, \t timestamp: {prev_timestamp}, \t step: {step}")

        ## Loop through our data-frame and start building nodes
        for index, row in data.iloc[1:].iterrows():
            entity          = [row[c] for c in self.entity_cols]
            activity        = [row[c] for c in self.activity_cols]
            timestamp       = row[self.timestamp_col]
            ## Increment the stamp that we are currently doing
            step += 1
#            print(f"entity1: {entity}, \t activity: {activity}, \t timestamp: {timestamp}, \t step: {step}")

            ## If our previous and current entity are different, clear out
            ##     the previous_node and find the head of the new tree
            if set(prev_entity) != set(entity) or (self.looping and step > self.max_step):
#                print(f"Number of edges counted: {edge}")
#                print(f"Number of edges in array ('stop_activity'): {len(records['stop_activity'])}")

                ## Check if we are tracking tail nodes, if so update our prev_node,
                ##     to increment the number of tails that this node has been
                if self.sep_tails == True:
                    ## Check to see if "Tail is already in our list of activities
                    if "TAIL" not in records["stop_activity"][edge-1]:
                        records["stop_activity"][edge-1].append("TAIL")
                        prev_node.tail_count += 1

                ## Check if we are tracking head nodes sperately, and update
                ##     activities as appropriate
                if self.sep_heads == True: activity.append("HEAD")
                prev_node = self.updateHeadTrees(activity)
                if self.sep_heads == True: prev_node.head_count += 1
                ## Reset the number of steps to 0
                step = 0
#                print(f"entity2: {entity}, \t activity: {activity}, \t timestamp: {timestamp}, \t step: {step}")

            ## Otherwise update our previous_node, base upon our current activity
            else:
                prev_node = prev_node.updateChlidren(activity)
                records["entity"].append(entity)
                records["start_activity"].append(prev_activity)
                records["stop_activity"].append(activity)
                records["start_timestamp"].append(prev_timestamp)
                records["stop_timestamp"].append(timestamp)
                records["latitude1"].append(None)
                records["longitude1"].append(None)
                records["latitude2"].append(None)
                records["longitude2"].append(None)
                records["resources"].append(None)
                ## Increment our edge count
                edge += 1

#                print(f"entity3: {entity}, \t activity: {activity}, \t timestamp: {timestamp}, \t step: {step}")

            ## Set our previous values to the current values
            prev_node.count += 1
            prev_entity = entity
            prev_activity = activity
            prev_timestamp = timestamp

        ## Duplicate copy of records, saved just for trouble shooting
        if self.debug: self.records = records
        ## Update our dataframe
        if self.dataframe is None: self.dataframe = pd.DataFrame(records)
        else:                      self.dataframe = pd.concat([self.dataframe, pd.DataFrame(records)])