## Embedding a Bokeh server in a Notebook

This notebook shows how a Bokeh server application can be embedded inside a Jupyter notebook. 

In [None]:
import yaml
import datetime as dt

import pandas as pd
import numpy as np

from bokeh.layouts import column
from bokeh.models import (ColumnDataSource, Slider, LabelSet, CategoricalColorMapper, DataTable, HoverTool, IntEditor,
                          NumberEditor, NumberFormatter, SelectEditor,
                          StringEditor, StringFormatter, TableColumn)
from bokeh.plotting import figure
from bokeh.themes import Theme
from bokeh.palettes import Spectral4, viridis
from bokeh.io import show, output_notebook
from bokeh.transform import jitter
import colorcet as cc


# Configure Bokeh so that later show() calls render inside this notebook's output cells.
output_notebook()

In [None]:
## IMPORT IO TRACES AND CREATE TWO DATAFRAMES

# Parse the job-oriented trace file with the safe YAML loader.
# io_traces is pre-bound to None so the name exists before the file is opened.
io_traces = None
with open("io_operations.yml", mode="r", encoding="utf-8") as traces:
    io_traces = yaml.safe_load(traces)

# Job-level fields kept when building the `jobs` DataFrame.
# NOTE: every entry needs its trailing comma; adjacent string literals are
# silently concatenated by Python into a single (wrong) column name.
job_columns = [
    "job_id",
    "job_uid",
    "job_status",
    "job_submit_ts",
    "job_end_ts",
    "job_duration",
    "origin_runtime",
    "origin_read_bytes",
    "origin_written_bytes",
    "origin_core_used",
    "origin_mpi_procs",
    "job_sleep_time",
]

# Columns of the `actions` DataFrame. Not every action type uses all of them.
action_columns = (
    # Identity and outcome of the action
    ["act_name", "act_type", "act_status"]
    # Timing
    + ["act_start_ts", "act_end_ts", "act_duration"]
    # Source side of the transfer
    + [
        "src_storage_service",
        "src_storage_server",
        "src_storage_disk",
        "src_file_path",
        "src_file_name",
        "src_file_size_bytes",
    ]
    # Destination side of the transfer
    + [
        "dst_storage_service",
        "dst_storage_server",
        "dst_storage_disk",
        "dst_file_path",
        "dst_file_name",
        "dst_file_size_bytes",
    ]
    # Link back to the owning job
    + ["parent_job_id"]
)

# DATAFRAME WITH JOBS ONLY
# One row per trace entry, restricted to the keys listed in job_columns.
jobs = pd.DataFrame(data=io_traces, columns=job_columns)


# Flatten the actions of every job into a single list. Each action dict is
# tagged in place with the id of the job it belongs to ("parent_job_id").
action_list = []
for trace in io_traces:
    local_actions = trace["job_actions"]
    for l_act in local_actions:
        l_act["parent_job_id"] = trace["job_id"]

        # Actions that carry no source description get placeholder values.
        if "src_storage_service" not in l_act:
            l_act.update(
                {
                    "src_storage_service": "NA",
                    "src_storage_server": "NA",
                    "src_storage_disk": "NA",
                    "src_file_path": "NA",
                    "src_file_name": "NA",
                    "src_file_size_bytes": 0,
                }
            )

        # Same treatment for actions without a destination description.
        if "dst_storage_service" not in l_act:
            l_act.update(
                {
                    "dst_storage_service": "NA",
                    "dst_storage_server": "NA",
                    "dst_storage_disk": "NA",
                    "dst_file_path": "NA",
                    "dst_file_name": "NA",
                    "dst_file_size_bytes": 0,
                }
            )

        action_list.append(l_act)

# DATAFRAME WITH ACTIONS ONLY
actions = pd.DataFrame(data=action_list, columns=action_columns)

In [None]:
# Order jobs by submission timestamp and add a Timedelta version of that
# timestamp (values are interpreted as seconds).
jobs = jobs.sort_values(by="job_submit_ts").assign(
    job_submit_td=lambda frame: frame["job_submit_ts"].apply(pd.to_timedelta, unit="s")
)
jobs

In [None]:
# Order actions by start timestamp and derive Timedelta columns from the
# start/end timestamps (interpreted as seconds), so that the plots below can
# use a valid date-like range on their x axis.
actions = actions.sort_values(by="act_start_ts").assign(
    act_start_td=lambda frame: frame["act_start_ts"].apply(pd.to_timedelta, unit="s"),
    act_end_td=lambda frame: frame["act_end_ts"].apply(pd.to_timedelta, unit="s"),
)

# Report the overall time span covered by the actions.
print(f"# First action registered on {dt.datetime.fromtimestamp(actions['act_start_ts'].min())}")
print(f"# Last action ends on {dt.datetime.fromtimestamp(actions['act_end_ts'].max())}")

actions

In [None]:
## IMPORT STORAGE SERVICE ORIENTED IO TRACES AND CREATE TWO DATAFRAMES

# Parse the storage-service-oriented trace file with the safe YAML loader.
io_traces_storage_service = None
with open("storage_service_operations.yml", mode="r", encoding="utf-8") as traces:
    io_traces_storage_service = yaml.safe_load(traces)
    
# Columns of the storage-service event frame (`actions_detail`).
ss_io_columns = (
    # When the event happened and which action/job produced it
    ["ts", "action_type", "action_name", "action_job"]
    # Where it happened
    + ["storage_service", "disk"]
    # Volume delta recorded for the event
    + ["volume_change_bytes"]
    # Server-level totals
    + ["total_allocation_server", "total_used_volume_bytes_server"]
    # Disk-level totals
    + ["total_allocation_disk", "total_used_volume_bytes_disk"]
)

# Build the event frame, order it by timestamp, then replace the "ts" column
# with its Timedelta equivalent (timestamps are interpreted as seconds).
# NOTE: "action_job" keeps whatever dtype pandas inferred; an explicit
# astype('object') conversion was tried here and left disabled.
actions_detail = (
    pd.DataFrame(data=io_traces_storage_service, columns=ss_io_columns)
    .sort_values(by="ts")
    .assign(ts=lambda frame: frame["ts"].apply(pd.to_timedelta, unit="s"))
)
actions_detail

In [None]:
# Unique job identifiers (as strings), in order of first appearance.
# drop_duplicates() keeps a stable order, unlike set(), so the job -> colour
# association is the same every time the notebook is run.
JOB_UIDS = actions_detail["action_job"].astype("string").drop_duplicates().tolist()
print(f"JOBS UIDS (unique) : {JOB_UIDS}")

# One palette colour per job. The index wraps around so that having more jobs
# than palette entries recycles colours instead of raising IndexError.
JOB_UIDS_COLOR = [cc.glasbey_dark[i % len(cc.glasbey_dark)] for i in range(len(JOB_UIDS))]
print(JOB_UIDS_COLOR)

In [None]:
# Get multiple sub-dataframes, one for each storage service in use in our data.
# Rows are grouped by default; the explicit `axis` keyword is deprecated in
# recent pandas releases, so it is not passed.
details_by_storageservice = actions_detail.groupby("storage_service")

In [None]:
# Split the event frame into one sub-frame per action type.

copy_actions_detail = actions_detail.loc[actions_detail["action_type"].eq("FILECOPY")]
write_actions_detail = actions_detail.loc[actions_detail["action_type"].eq("FILEWRITE")]
delete_actions_detail = actions_detail.loc[actions_detail["action_type"].eq("FILEDELETE")]

In [None]:
import numpy as np
 
from bokeh.models import ColumnDataSource, RangeTool, DatetimeTickFormatter, NumeralTickFormatter
from bokeh.transform import factor_cmap, factor_mark


# Tick formatter for time axes: day/month at coarse scales, hours and minutes
# at finer ones.
VERBOSE_DT_FORMATTER = DatetimeTickFormatter(
    days='%d/%m',
    hours="%d/%m - %Hh",
    hourmin='%H:%M',
    minutes='%H:%M',
)
# Tick formatter for axes expressed in bytes.
BYTE_FORMATTER = NumeralTickFormatter(format='0.0b')
# Every action type present in the traces.
TYPES = ["FILECOPY", "FILEREAD", "COMPUTE", "FILEWRITE", "FILEDELETE"]
# The I/O subset: everything except pure computation.
IO_TYPES = [kind for kind in TYPES if kind != "COMPUTE"]
# I/O action types other than reads.
IO_TYPES_NO_READ = [kind for kind in IO_TYPES if kind != "FILEREAD"]
# Scatter markers, paired positionally with IO_TYPES_NO_READ.
MARKERS_ACTION_TYPE = [
    'circle_y',
    'circle_cross',
    'circle_x',
]


def bkapp(doc):
    """Build the I/O-trace dashboard inside the Bokeh document ``doc``.

    A single column layout is added to ``doc``, made of (top to bottom):

    1. ``p``: overlaid area plots of the cumulative ``volume_change_bytes``
       of the FILEWRITE, FILECOPY and FILEDELETE events;
    2. ``p2``: a scatter of every storage-service event (marker chosen from
       the action type, colour from the job) plus one
       ``total_used_volume_bytes_server`` line per storage service;
    3. ``select``: a small overview plot carrying a ``RangeTool`` bound to
       the x range of ``p`` (which ``p2`` shares);
    4. an editable ``DataTable`` with the action name of every event.

    The function reads the notebook-level frames ``actions``,
    ``actions_detail``, ``copy_actions_detail``, ``write_actions_detail`` and
    ``delete_actions_detail`` (each is copied first), and the constants
    ``JOB_UIDS``, ``IO_TYPES_NO_READ``, ``MARKERS_ACTION_TYPE``,
    ``VERBOSE_DT_FORMATTER`` and ``BYTE_FORMATTER``.

    Args:
        doc: Bokeh document to populate; it receives the layout as a root
            and has its theme replaced.
    """
    # Function-scope import: only needed to recycle the line palette below.
    from itertools import cycle

    # General DataFrame (copied so this document does not share the
    # notebook-level object).
    df = actions.copy()
    source = ColumnDataSource(data=df)

    # Details DataFrames
    df_actions_details = actions_detail.copy()
    df_details_source = ColumnDataSource(data=df_actions_details)

    df_copy_details = copy_actions_detail.copy()
    df_write_details = write_actions_detail.copy()
    df_delete_details = delete_actions_detail.copy()

    # Initial x range: span of action start times, padded by 10000 s each side.
    DT_X_RANGE = (
        df["act_start_td"].min() - dt.timedelta(seconds=10000),
        df["act_start_td"].max() + dt.timedelta(seconds=10000),
    )

    # --- TIMELINE WITH VARIOUS METRICS -----------------------------------
    p = figure(
        title="Cumulative R/W Volume",
        height=300,
        width=950,
        tools=["xpan", "save", "reset", "box_zoom", "pan", "zoom_out"],
        x_axis_location="above",
        background_fill_color="#efefef",
        x_range=DT_X_RANGE,
    )

    # One filled area per action type, from zero up to the running total of
    # the volume changes. Every area reads from the local copies made above.
    cumulative_areas = (
        (df_write_details, (0, 255, 0, 0.5), "WRITE"),
        (df_copy_details, (0, 0, 255, 0.5), "COPY"),
        (df_delete_details, (255, 0, 0, 0.5), "DELETE"),
    )
    for frame, rgba, label in cumulative_areas:
        running_total = frame["volume_change_bytes"].cumsum()
        p.varea(
            x=frame["ts"],
            y1=np.zeros(running_total.shape[0]),
            y2=running_total,
            alpha=0.8,
            color=rgba,
            legend_label=label,
        )

    p.legend.click_policy = "hide"

    p.yaxis.axis_label = 'Byte size of file'
    p.xaxis.formatter = VERBOSE_DT_FORMATTER
    p.yaxis.formatter = BYTE_FORMATTER

    # --- PER-EVENT VIEW ----------------------------------------------------
    p2 = figure(
        title="ActionTypes",
        height=300,
        width=950,
        tools=["xpan", "save", "reset", "box_zoom", "pan", "zoom_out", "hover"],
        x_axis_location="above",
        background_fill_color="#efefef",
        x_range=p.x_range,  # shared object: panning either plot moves both
    )
    p2.hover.tooltips = [
        ("Action Type", "@action_type"),
        ("Bytes Copied/Written/Deleted", "@volume_change_bytes"),
        ("Target Server", "@storage_service"),
        ("Target Disk", "@disk"),
    ]

    p2.yaxis.axis_label = 'Bytes'
    p2.xaxis.formatter = VERBOSE_DT_FORMATTER
    p2.yaxis.formatter = BYTE_FORMATTER

    ## JOB_UID INDEXED
    # NOTE(review): JOB_UIDS holds strings; if the "action_job" column is not
    # string-typed the colour factors may not match its values - to confirm.
    p2.scatter(
        x="ts",
        y="volume_change_bytes",
        source=df_details_source,
        legend_group="action_job",
        fill_alpha=0.4,
        size=12,
        marker=factor_mark('action_type', MARKERS_ACTION_TYPE, IO_TYPES_NO_READ),
        color=factor_cmap("action_job", cc.glasbey_dark, JOB_UIDS),
    )

    ## STORAGE SERVICE INDEXED
    STORAGE_SERVICES = df_actions_details["storage_service"].unique()

    # The palette is cycled so that services beyond the fourth still get a
    # line (with a reused colour) instead of being silently dropped by zip().
    for service, color in zip(STORAGE_SERVICES, cycle(Spectral4)):
        p2.line(
            x="ts",
            y="total_used_volume_bytes_server",
            source=df_actions_details[df_actions_details["storage_service"] == service],
            line_width=2,
            color=color,
            alpha=0.8,
            muted_color=color,
            muted_alpha=0.1,
            legend_label=service,
        )

    p2.legend.click_policy = "mute"

    # --- DATA TABLE ----------------------------------------------------------
    action_name = sorted(df_actions_details["action_name"].unique())

    columns = [
        TableColumn(
            field="action_name",
            title="Action Name",
            editor=SelectEditor(options=action_name),
            formatter=StringFormatter(font_style="bold"),
        )
    ]

    data_table = DataTable(
        source=df_details_source,
        columns=columns,
        editable=True,
        width=950,
        index_position=-1,
        index_header="row index",
        index_width=60,
    )

    # --- RANGE SELECTION -------------------------------------------------------
    select = figure(
        title="Range selection plot",
        height=100,
        width=950,
        y_range=p.y_range,
        y_axis_type=None,
        tools="",
        toolbar_location=None,
        background_fill_color="#efefef",
    )

    # Dragging the overlay on `select` drives the x range of `p` (and `p2`).
    range_tool = RangeTool(x_range=p.x_range)
    range_tool.overlay.fill_color = "navy"
    range_tool.overlay.fill_alpha = 0.1

    select.line('act_start_td', 'src_file_size_bytes', source=source, line_color=(255, 128, 0, 0.5))
    select.line('act_start_td', 'dst_file_size_bytes', source=source, line_color=(128, 255, 0, 0.5))
    select.ygrid.grid_line_color = None
    select.xaxis.formatter = VERBOSE_DT_FORMATTER
    select.yaxis.formatter = BYTE_FORMATTER
    select.add_tools(range_tool)
    select.toolbar.active_multi = range_tool

    doc.add_root(column(p, p2, select, data_table))

    # The theme is a constant YAML snippet; the safe loader is sufficient and
    # matches how the trace files are parsed elsewhere in this notebook.
    doc.theme = Theme(json=yaml.safe_load("""
        attrs:
            figure:
                background_fill_color: "#DDDDDD"
                outline_line_color: white
                toolbar_location: above
                height: 700
                width: 1200
            Grid:
                grid_line_dash: [6, 4]
                grid_line_color: white
    """))

Now we can display our application using ``show``, which will automatically create an ``Application`` that wraps ``bkapp`` using ``FunctionHandler``. The end result is that the Bokeh server will call ``bkapp`` to build a new document for every new session that is opened.

**Note**: If the current notebook is not displayed at the default URL, you must update the `notebook_url` parameter in the comment below to match, and pass it to `show`.

In [None]:
# Display the embedded application built by bkapp.
# If this notebook is not served from the default URL, pass it explicitly,
# e.g. show(bkapp, notebook_url="http://localhost:8888").
show(bkapp) # notebook_url="http://localhost:8888" 