In [1]:
import yaml
import datetime as dt

import pandas as pd
import numpy as np
import matplotlib

from bokeh.layouts import column
from bokeh.models import (ColumnDataSource, Slider, LabelSet, CategoricalColorMapper, DataTable, HoverTool, IntEditor,
                          NumberEditor, NumberFormatter, SelectEditor,
                          StringEditor, StringFormatter, TableColumn, LinearColorMapper, ContinuousColorMapper)
from bokeh.plotting import figure
from bokeh.themes import Theme
from bokeh.palettes import Spectral4, viridis, Viridis256
from bokeh.io import show, output_notebook
from bokeh.transform import jitter, factor_cmap, linear_cmap

from squarify import normalize_sizes, squarify
import colorcet as cc


# Route Bokeh output to the notebook so that show() renders inline in the cells below.
output_notebook()

In [2]:
## IMPORT IO TRACES AND CREATE TWO DATAFRAMES

# Dtype requested for each known CSV column. Nullable pandas dtypes are used
# ("Float64", "UInt64", "string") so missing values stay representable.
dtypes = {
    "ts": "Float64",
    "storage_service_name": "string",
    "storage_hostname": "string",
    "disk_id": "string",
    "disk_capacity": "UInt64",
    "disk_free_space": "UInt64",
    "file_name": "string",
}
# Names of the columns described above, in the same order.
columns = list(dtypes)

# Disabled alternative: load the traces from the YAML export instead of the CSV.
#
#   io_traces = None
#   with open("timestamped_io_operations_10.yml", "r", encoding="utf-8") as traces:
#       io_traces = yaml.load(traces, Loader=yaml.SafeLoader)
#
#   ts_traces = pd.DataFrame(io_traces)
#   ts_traces

# The first line of the CSV is the header; columns that are not listed in
# `dtypes` keep whatever dtype read_csv infers for them.
ts_traces = pd.read_csv(
    "timestamped_io_operations.csv",
    sep=",",
    header=0,
    dtype=dtypes,
)
ts_traces


In [3]:
# Order the traces by timestamp (then storage service name) and add two derived columns:
#   percent_free     -> disk_free_space * 100 / disk_capacity
#   disk_capacity_tb -> disk_capacity divided by 1000 four times, rounded to 2 decimals
ts_traces = ts_traces.sort_values(by=["ts", "storage_service_name"]).assign(
    percent_free=lambda traces: traces["disk_free_space"].mul(100) / traces["disk_capacity"],
    disk_capacity_tb=lambda traces: (
        traces["disk_capacity"] / 1000 / 1000 / 1000 / 1000
    ).round(decimals=2),
)

# Split the traces on the "action_name" column (value 11 versus everything else).
# NOTE(review): action 11 presumably marks the initial state snapshot - confirm
# against the trace producer. "action_name" is not part of the `dtypes` mapping,
# so its dtype is whatever read_csv inferred.
ts_trace_0 = ts_traces.loc[ts_traces["action_name"] == 11]
ts_trace_simulation = ts_traces.loc[ts_traces["action_name"] != 11]

In [4]:
# Display the rows whose action_name equals 11.
ts_trace_0

In [5]:
# Display the rows whose action_name differs from 11.
ts_trace_simulation

In [6]:
"""
unique_ts = pd.Series(df_dummy["ts"].unique()).sort_values()
unique_ts
"""

unique_ts = pd.Series(ts_traces["ts"].unique()).sort_values()
unique_ts

In [7]:
"""
df_dummy_by_ts = df_dummy.groupby("ts")
keys = [key for key, _ in df_dummy_by_ts]
number_of_traces = len(keys)
"""

ts_traces_by_ts = ts_traces.groupby("ts")
keys = [key for key, _ in ts_traces_by_ts]
# print(keys)
number_of_traces = len(keys)

ts_traces_by_hostname = ts_traces.groupby("storage_hostname")
SERVER_NAMES = [key for key, _ in ts_traces_by_hostname]
# print(SERVER_NAMES)
NB_SERVERS = len(SERVER_NAMES)

In [8]:
import numpy as np
 
from bokeh.models import ColumnDataSource, RangeTool, DatetimeTickFormatter, NumeralTickFormatter
from bokeh.transform import factor_cmap, factor_mark

# Axis tick formatters (not referenced by the code visible in this section).
VERBOSE_DT_FORMATTER = DatetimeTickFormatter(
    days='%d/%m',
    hours="%d/%m - %Hh",
    hourmin='%H:%M',
    minutes='%H:%M',
)
BYTE_FORMATTER = NumeralTickFormatter(format='0.0b')

# Treemap canvas: origin (X, Y) and size (W, H). W and H are also used as the
# width and height of the Bokeh figure.
X = 0
Y = 0
W = 1800
H = 1000

# Disk-level treemap frame shared between bkapp() and its slider callback;
# populated when bkapp() runs.
disks = None


def treemap(df, col, x, y, dx, dy):
    """Lay out the rows of ``df`` as a treemap inside one rectangle.

    The values of column ``col`` are rescaled with ``normalize_sizes`` for a
    ``dx`` by ``dy`` rectangle and handed to ``squarify`` together with the
    rectangle origin ``(x, y)``. The resulting rectangle descriptions (one per
    row, in row order) are attached to a copy of ``df``.

    Returns a new frame: the columns of ``df`` plus the squarify columns, with
    the original index moved into the columns by ``reset_index()``.
    """
    frame = df.copy()

    # Sizes rescaled for the target rectangle.
    areas = normalize_sizes(frame[col], dx, dy)

    # One rectangle record per row; re-label them with the source index so the
    # join below matches each rectangle to the row it was computed from.
    rectangles = pd.DataFrame(squarify(areas, x, y, dx, dy)).set_index(frame.index)

    laid_out = frame.join(rectangles)
    return laid_out.reset_index()


def compute_dfs(df: pd.DataFrame, index: int):
    """Build the server-level and disk-level treemap frames for one timestamp.

    Parameters
    ----------
    df : pd.DataFrame
        Trace frame. Must provide the columns "ts", "storage_hostname",
        "disk_id", "disk_capacity", "disk_free_space", "percent_free",
        "disk_capacity_tb" and "file_name".
    index : int
        Zero-based position of the wanted timestamp in the module-level,
        ascending ``unique_ts`` Series.

    Returns
    -------
    tuple
        ``(blocks_by_server, internal_blocks)``:

        * ``blocks_by_server`` - one row per storage host with the summed
          "disk_capacity" and the rectangle columns x, y, dx, dy laid out on
          the (X, Y, W, H) canvas.
        * ``internal_blocks`` - one row per trace row of that timestamp, laid
          out inside its host rectangle and shrunk by a fixed margin on every
          side, with an extra "ytop" column (y + dy computed before the
          shrink), sorted by host, disk and file name.
    """
    disk_columns = [
        "storage_hostname",
        "disk_id",
        "disk_capacity",
        "disk_free_space",
        "percent_free",
        "disk_capacity_tb",
        "file_name",
    ]
    # Gap, in canvas units, between a host rectangle and the disks drawn in it.
    margin = 5

    # unique_ts is a *sorted* Series that keeps its original labels, so the
    # lookup has to be positional (.iloc) to mean "the index-th timestamp".
    timestamp = unique_ts.iloc[index]
    trace = df[df["ts"] == timestamp][disk_columns]

    # Total capacity per host. groupby() already returns the hosts in sorted
    # order; the explicit column selection replaces the former
    # `.sum("disk_capacity")`, which passed the name as `numeric_only`.
    capacity_by_server = (
        trace[["storage_hostname", "disk_capacity"]]
        .groupby("storage_hostname")
        .sum()
    )
    blocks_by_server = treemap(capacity_by_server, "disk_capacity", X, Y, W, H)

    # Lay the disks of every host out inside that host's rectangle.
    disk_frames = []
    for _, (storage_server, _capacity, x, y, dx, dy) in blocks_by_server.iterrows():
        server_disks = trace[trace["storage_hostname"] == storage_server]
        server_disks = server_disks.sort_values(["disk_capacity"])
        disk_frames.append(treemap(server_disks, "disk_capacity", x, y, dx, dy))
    blocks = pd.concat(disk_frames)
    blocks["ytop"] = blocks.y + blocks.dy

    # Shrink every disk rectangle so the host rectangle stays visible around it.
    internal_blocks = blocks.copy()
    internal_blocks["x"] = internal_blocks["x"] + margin
    internal_blocks["y"] = internal_blocks["y"] + margin
    internal_blocks["dx"] = internal_blocks["dx"] - 2 * margin
    internal_blocks["dy"] = internal_blocks["dy"] - 2 * margin

    internal_blocks = internal_blocks.sort_values(["storage_hostname", "disk_id", "file_name"])
    internal_blocks = internal_blocks.reset_index(drop=True)
    # "index" is the original row label that treemap() moved into the columns.
    internal_blocks = internal_blocks.drop("index", axis=1)

    return (blocks_by_server, internal_blocks)


def _merge_trace_into_disks(disks_df, trace_rows):
    """Fold the trace rows of one timestamp into the disk-level frame.

    ``trace_rows`` is reduced to one row per (storage_hostname, disk_id): the
    first value of every numeric column and the file names joined with ", ".
    Those values overwrite the matching rows of ``disks_df``; for the disks
    that appear in ``trace_rows`` the new file names are appended to the ones
    the disk already listed. Disks without a trace row are left untouched.

    Returns a new frame with the same columns as ``disks_df`` and a fresh
    default index.
    """
    disk_keys = ["storage_hostname", "disk_id"]

    per_disk = (
        trace_rows[
            [
                "storage_hostname",
                "disk_id",
                "disk_capacity",
                "disk_free_space",
                "percent_free",
                "disk_capacity_tb",
                "file_name",
            ]
        ]
        # Sorting fixes the order in which file names are joined per disk.
        .sort_values(disk_keys + ["file_name"])
        .groupby(disk_keys)
        .agg(
            {
                "disk_capacity": "first",
                "disk_free_space": "first",
                "percent_free": "first",
                "disk_capacity_tb": "first",
                "file_name": ", ".join,
            }
        )
    )

    merged = disks_df.set_index(keys=disk_keys)
    previous_files = merged["file_name"].copy(deep=True)
    merged.update(per_disk)

    # Only disks that received a trace get "<previous>, <new>". Concatenating
    # for every disk would double the string of untouched disks on each call.
    touched = merged.index.isin(per_disk.index)
    merged["file_name"] = merged["file_name"].mask(
        touched, previous_files + ", " + merged["file_name"]
    )
    return merged.reset_index()


def bkapp(doc):
    """Bokeh application: treemap of storage hosts and disks with a trace slider.

    Draws one rectangle per storage host (coloured by host name) and, on top,
    one rectangle per disk coloured by its percentage of free space. A slider
    selects a timestamp; moving it merges the traces of that timestamp into the
    module-level ``disks`` frame and pushes the result to the disk glyphs.

    Parameters
    ----------
    doc
        The Bokeh document to populate; the layout is added as a root and a
        theme is set on it.

    NOTE(review): the slider callback accumulates into ``disks`` in the order
    the slider values are visited, not in timestamp order - confirm that this
    is the intended behaviour when the slider is moved backwards.
    """
    # 100-step colour ramp. The list is reversed, so index 0 is red and the
    # last index is green: 0 % free maps to red, 100 % free to green.
    colors = ["#3aeb34", "#ffba26", "#ff2a26"][::-1]
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list("cmap_green_red", colors, N=100)
    hex_color_map = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]

    # Initial state: layout computed for the first timestamp.
    global disks
    servers, disks = compute_dfs(ts_traces, 0)
    servers_source = ColumnDataSource(servers)
    disks_source = ColumnDataSource(disks)

    p = figure(
        width=W,
        height=H,
        toolbar_location=None,
        x_axis_location=None,
        y_axis_location=None
    )
    p.x_range.range_padding = p.y_range.range_padding = 0
    p.grid.grid_line_color = None

    # Servers
    p.block('x', 'y', 'dx', 'dy', source=servers_source,
            line_width=2, line_color="black",
            fill_alpha=0.8, fill_color=factor_cmap("storage_hostname", viridis(NB_SERVERS), SERVER_NAMES))

    # Disks
    disk_blk = p.block('x', 'y', 'dx', 'dy', source=disks_source, line_width=1, line_color="black",
            fill_alpha=0.8, fill_color=linear_cmap('percent_free', hex_color_map, 0, 100))

    # Tooltips are attached to the disk rectangles only.
    hover = HoverTool(name='ytd_ave',tooltips=[
            ("Storage service", "@storage_hostname"),
            ("Disk", "@disk_id"),
            ("Free space (%)", "@percent_free"),
            ("Free space (Bytes)", "@disk_free_space"),
            ("File", "@file_name")
        ]
    )
    hover.renderers = [disk_blk]
    p.add_tools(hover)

    # Slider for trace id control
    trace_id = Slider(title="trace", value=0, start=0, end=(number_of_traces - 1), step=1)

    def update_data(attrname, old, new):
        """Slider callback: merge the selected timestamp's traces into ``disks``."""
        global disks

        # Positional lookup: unique_ts is sorted but keeps its original labels.
        position = int(trace_id.value)
        trace_rows = ts_traces[ts_traces["ts"] == unique_ts.iloc[position]]

        disks = _merge_trace_into_disks(disks, trace_rows)
        disks_source.data = disks

    trace_id.on_change('value', update_data)

    doc.add_root(column(p, trace_id))

    # The YAML below is a trusted literal, not external input.
    doc.theme = Theme(json=yaml.load("""
        attrs:
            figure:
                background_fill_color: "#DDDDDD"
                outline_line_color: white
                toolbar_location: above
                height: 700
                width: 1200
            Grid:
                grid_line_dash: [6, 4]
                grid_line_color: white
    """, Loader=yaml.FullLoader))

In [9]:
# Run the Bokeh application defined by bkapp() and display it in the notebook.
# Optional argument kept for reference: notebook_url="http://localhost:8888"
show(bkapp) # notebook_url="http://localhost:8888" 

In [10]:
# Recompute the server-level and disk-level layouts for the first timestamp
# (index 0) and display the disk-level frame. This rebinds the module-level
# `disks` that bkapp() and its slider callback also read and rewrite.
servers, disks = compute_dfs(ts_traces, 0)
disks

In [None]:
# Scratch: per-disk summary of the traces recorded at unique_ts[1] (label
# lookup on unique_ts). One row per (storage_hostname, disk_id): the first
# value of each numeric column and the file names joined with ", ".
ts_trace_1 = (
    ts_traces[ts_traces["ts"] == unique_ts[1]][
        [
            "storage_hostname",
            "disk_id",
            "disk_capacity",
            "disk_free_space",
            "percent_free",
            "disk_capacity_tb",
            "file_name",
        ]
    ]
    .sort_values(["storage_hostname", "disk_id"])
    .reset_index()
    .drop(columns="index")
    .sort_values(["storage_hostname", "disk_id"])
    .set_index(keys=["storage_hostname", "disk_id"])
    .groupby(["storage_hostname", "disk_id"])
    .agg(
        {
            "disk_capacity": "first",
            "disk_free_space": "first",
            "percent_free": "first",
            "disk_capacity_tb": "first",
            "file_name": ', '.join,
        }
    )
)
ts_trace_1

In [None]:
# Scratch: key the disk-level frame by (storage_hostname, disk_id) so it lines
# up with ts_trace_1. The previous index is moved into an "index" column by
# reset_index() and then discarded.
disks = (
    disks.reset_index()
    .drop(columns="index")
    .set_index(keys=["storage_hostname", "disk_id"])
)
disks

In [None]:
# Scratch: in-place update - values of ts_trace_1 overwrite the cells of
# `disks` that share the same index labels and column names.
disks.update(ts_trace_1)
disks

In [None]:
# Scratch: alternative merge - append the per-disk trace summary to `disks`
# and keep only the last row for every (storage_hostname, disk_id).
#
# drop_duplicates() only considers columns, so frames that carry the two key
# fields in their index (as `disks` and `ts_trace_1` do after the cells above)
# first get them moved back into columns; otherwise the call raises KeyError.
#
# Caveat: rows that come from ts_trace_1 replace the matching `disks` rows
# entirely, so columns that ts_trace_1 does not carry (x, y, dx, dy, ytop)
# are missing for those disks.
disks = (
    pd.concat(
        [
            frame.reset_index() if "disk_id" in frame.index.names else frame
            for frame in (disks, ts_trace_1)
        ]
    )
    .drop_duplicates(["storage_hostname", "disk_id"], keep="last")
    .reset_index(drop=True)
)
disks