In [None]:
import numpy as np
import pandas as pd

# CSV hosted on data.4tu.nl; it is downloaded again every time this cell runs.
http = "https://data.4tu.nl/file/1c51b037-bf30-4c1a-9c50-25d1617a4899/65710417-8249-40db-a47a-dfa4d25bfad4"

# Semicolon-delimited, cp1252-encoded text; TIMESTAMP is parsed into datetimes.
df = pd.read_csv(
    http,
    sep=";",
    parse_dates=["TIMESTAMP"],
    encoding="cp1252",
    low_memory=False,
)

# Quick look at what was loaded: the column names, then the first rows.
print(df.columns)
df.head()

In [None]:
# All rows of one example customer, ordered chronologically.
df.loc[df["CustomerID"].eq(2025826)].sort_values("TIMESTAMP")

In [None]:
class ProcessTree:
    """One node of a page-navigation tree built from log rows.

    A node is identified by the (VHOST, URL_FILE, PAGE_NAME) values of the row
    it was created from. Children are the pages that were recorded directly
    after this page via :meth:`update_child`.

    Attributes:
        vhost, url_file, page_name: identity of the page this node stands for.
        first_session_id: ``SessionID`` of the row that created the node.
        count: starts at the ``count`` constructor argument and is incremented
            each time :meth:`update_child` matches this node again.
        refresh_count: initialised to 0; never modified by this class, callers
            maintain it.
        children: list of child ``ProcessTree`` nodes, in creation order.
    """

    COL_VHOST = "VHOST"
    COL_URL_FILE = "URL_FILE"
    COL_PAGE_NAME = "PAGE_NAME"
    COL_SESSION_ID = "SessionID"

    def __init__(self, row, count=1):
        """Create a node from ``row``.

        Args:
            row: object indexable by column name (e.g. a pandas Series or a
                dict) providing the four ``COL_*`` columns.
            count: initial visit count of the node.
        """
        self.vhost = row[self.COL_VHOST]
        self.url_file = row[self.COL_URL_FILE]
        self.page_name = row[self.COL_PAGE_NAME]

        self.first_session_id = row[self.COL_SESSION_ID]
        self.count = count
        self.refresh_count = 0
        self.children = []

    @staticmethod
    def _same_value(left, right):
        """Compare two field values, treating two missing (NaN) values as equal."""
        # NaN is not equal to itself, so a plain ``==`` would never match a
        # page whose field is missing and every visit would add a new node.
        if left != left and right != right:
            return True
        return bool(left == right)

    def matches(self, row):
        """Return True when ``row`` refers to the same page as this node."""
        return (
            self._same_value(self.vhost, row[self.COL_VHOST])
            and self._same_value(self.url_file, row[self.COL_URL_FILE])
            and self._same_value(self.page_name, row[self.COL_PAGE_NAME])
        )

    def update_child(self, row):
        """Register ``row`` as a page reached from this node.

        If a child for the same page already exists its ``count`` is
        incremented; otherwise a new child is created from ``row`` and appended.

        Returns:
            The matching or newly created child node.
        """
        ## Check and see if the child already exist
        for child in self.children:
            if child.matches(row):
                child.count += 1
                return child

        child = ProcessTree(row)
        self.children.append(child)
        return child

    def print_tree(self, depth=0):
        """Print this node and all descendants, one line per node.

        Nodes are printed in pre-order; each level below ``depth`` is indented
        by one more `` -  `` prefix.
        """
        # Explicit stack instead of recursion: the tree gets one level deeper
        # per page change in a session, which can exceed the recursion limit.
        pending = [(self, depth)]
        while pending:
            node, level = pending.pop()
            print(" -  " * level + f"{node.vhost} {node.url_file} {node.page_name} (First Session: {node.first_session_id}) ({node.count})")
            # Reversed so the first child is popped (and printed) first.
            pending.extend((child, level + 1) for child in reversed(node.children))


In [None]:
# Roots of the navigation trees: one root per distinct first page of a session.
processes = []
# -1 stands for "no session seen yet"; None for "no current node yet".
prev_session_id = -1
prev_process_tree = None

# Rows grouped per session and ordered in time; only the first 5000 are used.
tmp = df.sort_values(by=["SessionID", "TIMESTAMP"])[:5000]
for index, row in tmp.iterrows():
    session_id = row["SessionID"]
    row_vhost = row[ProcessTree.COL_VHOST]
    row_url_file = row[ProcessTree.COL_URL_FILE]
    row_page_name = row[ProcessTree.COL_PAGE_NAME]

    if prev_session_id != session_id:
        # First row of a new session: reuse the root for this page if there is one...
        for p in processes:
            if p.vhost == row_vhost and p.url_file == row_url_file and p.page_name == row_page_name:
                p.count += 1
                prev_process_tree = p
                break
        else:
            # ...otherwise start a new root.
            prev_process_tree = ProcessTree(row)
            processes.append(prev_process_tree)
    elif (
        prev_process_tree.vhost == row_vhost
        and prev_process_tree.url_file == row_url_file
        and prev_process_tree.page_name == row_page_name
    ):
        # Same page again within the session: counted as a refresh, no new node.
        prev_process_tree.refresh_count += 1
    else:
        # Different page within the session: step down into the matching child.
        prev_process_tree = prev_process_tree.update_child(row)

    prev_session_id = session_id

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
from operator import itemgetter

def addProcessEdge(grph, node, parent_node=None):
    """Add one edge per parent -> child link of a process tree to ``grph``.

    Graph nodes are labelled ``"<vhost>-<url_file>;<page_name>"``. Edges are
    added in pre-order, i.e. the edge leading to a node is added before the
    edges of that node's subtree.

    Args:
        grph: graph object exposing ``add_edge(source, target)``.
        node: tree node with ``vhost``, ``url_file``, ``page_name`` and
            ``children`` attributes; root of the subtree to add.
        parent_node: optional parent of ``node``; when given, the edge
            ``parent_node -> node`` is added as well.
    """
    def _label(tree_node):
        # f-string rather than ``+`` so non-string fields (e.g. NaN for a
        # missing value) do not raise TypeError; identical output for strings.
        return f"{tree_node.vhost}-{tree_node.url_file};{tree_node.page_name}"

    # Explicit stack instead of recursion: the tree gets one level deeper per
    # page change in a session, which can exceed Python's recursion limit.
    pending = [(node, parent_node)]
    while pending:
        current, parent = pending.pop()
        if parent is not None:
            grph.add_edge(_label(parent), _label(current))
        # Reversed so the first child is processed first (same order as before).
        pending.extend((child, current) for child in reversed(current.children))

#ax, fig = plt.subplots(figsize=(20, 10))
G = nx.DiGraph()
# Add all of the edges
for p in processes:
    addProcessEdge(G, p)

# Only edges are added above, so a forest without any parent -> child link
# leaves the graph empty; fail with a clear message instead of an IndexError.
if G.number_of_nodes() == 0:
    raise ValueError("process graph is empty: no parent -> child transitions were found")

node_and_degree = G.degree()
# Highest-degree node. The sort is stable, so on ties [-1] picks the tied node
# that comes last in the graph's node order.
(largest_hub, degree) = sorted(node_and_degree, key=itemgetter(1))[-1]
hub_ego = nx.ego_graph(G, largest_hub)
# Fixed seed: spring_layout starts from random positions, so without it the
# printed coordinates differ on every run.
pos = nx.spring_layout(hub_ego, seed=42)
print(pos)


def _testNode(node):
    """Print the label of every node in a process tree that is missing from ``pos``.

    ``pos`` is the module-level mapping of graph-node label -> layout position.
    Labels use the ``"<vhost>-<url_file>;<page_name>"`` format. Nodes are
    visited in pre-order.

    Args:
        node: tree node with ``vhost``, ``url_file``, ``page_name`` and
            ``children`` attributes; root of the subtree to check.
    """
    # Explicit stack instead of recursion: the tree gets one level deeper per
    # page change in a session, which can exceed Python's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        # f-string rather than ``+`` so non-string fields (e.g. NaN for a
        # missing value) do not raise TypeError; identical output for strings.
        name = f"{current.vhost}-{current.url_file};{current.page_name}"
        if name not in pos:
            print(f"{name} not in pos")
        # Reversed so the first child is visited first (same order as before).
        pending.extend(reversed(current.children))

# Run the `pos` lookup check over every root tree collected in `processes`.
for root in processes:
    _testNode(root)


#pos = nx.spring_layout(G)
#nx.draw(G, pos, with_labels=False, node_size=50)

#ax  = plt.gca()
#ax.margins(0.1)
#plt.axis("off")
#plt.show()