# COVID-19 HPSS Usage

This notebook shows basic HPSS usage information from around the beginning of the COVID-19 shutdown.  During this period the archive was experiencing unusual load, partly caused by a few users transferring data through Globus DSI (which does not generate backend-friendly I/O patterns) and partly due to very high overall traffic.

In [None]:
%matplotlib inline

In [None]:
import datetime
import collections
import copy
import re
import os

In [None]:
import pandas
import matplotlib
import matplotlib.pyplot
import tokio

## Top users during the COVID-19 shutdown

In [None]:
# Notebook inputs: the HPSS system whose report sections are read, and the
# first/last dates handed to the report-file enumeration below
TARGET_SYSTEM = 'archive'
TARGET_DATE_START = datetime.datetime(year=2020, month=3, day=13)
TARGET_DATE_END = datetime.datetime(year=2020, month=3, day=25)

In [None]:
# Locate the daily HPSS report files that fall inside the target date window
report_filenames = tokio.tools.common.enumerate_dated_files(
    start=TARGET_DATE_START,
    end=TARGET_DATE_END,
    template=tokio.config.CONFIG.get('hpss_report_files'))

# Build {username: {counter: total}} by summing every user's counters across
# all daily reports in the window.
#
# Fix: the previous version initialized each counter to 0 and then *assigned*
# the day's value instead of adding it, so a user's entry only reflected the
# last daily report in which that user appeared.
# NOTE(review): this assumes every column other than 'user' and 'app' is an
# additive numeric counter (e.g. io_gb, read_gb, write_gb) -- confirm against
# the HpssDailyReport 'largest users' schema.
records = {}
for report_filename in report_filenames:
    hpss_report = tokio.connectors.hpss.HpssDailyReport(report_filename)
    for record in hpss_report[TARGET_SYSTEM]['largest users']:
        username = record['user']
        user_totals = records.setdefault(username, {})
        for key, val in record.items():
            # 'user' and 'app' are skipped rather than summed
            if key in ('user', 'app'):
                continue
            user_totals[key] = user_totals.get(key, 0) + val

In [None]:
# One row per user (the keys of ``records``), one column per counter name
dataframe = pandas.DataFrame.from_dict(data=records, orient='index')

In [None]:
# Display the per-user table ordered by ascending ``io_gb``
dataframe.sort_values(by='io_gb')

In [None]:
def plot_report_section(report_section, read_key, write_key, include_total=False, ax=None):
    """Plots a read-vs-write horizontal bar graph of a report section

    Each key of ``report_section`` becomes one row.  The write value is drawn
    as a bar extending right of zero and the read value as a bar extending
    left of zero, and the x axis is made symmetric around zero.

    Args:
        report_section (dict): One section of the HpssDailyReport[SYSTEM]
            dictionary (something like the value stored under
            ``io totals by hpss client gateway (ui) host``), or any mapping
            of row label to a dict containing ``read_key`` and ``write_key``.
        read_key (str): Column corresponding to a read value (``read_gb`` or ``r_ops``)
        write_key (str): Column corresponding to a write value (``write_gb`` or ``w_ops``)
        include_total (bool): If True, include the row labeled ``total`` as
            its own hbar; otherwise that row is skipped
        ax (matplotlib.axes.Axes): If provided, axes in which plot should be
            added.  The figure that owns a caller-provided axes is never
            resized; only a figure created here is.

    Returns:
        matplotlib.axes.Axes: Axes on which plot was added.  If no rows were
        plotted (empty section, or only a skipped ``total`` row) the axes are
        returned without further decoration.
    """
    # Only a figure created here is resized below.  Previously ``fig`` was
    # unbound whenever the caller supplied ``ax``, which raised a NameError.
    own_fig = None
    if ax is None:
        own_fig, ax = matplotlib.pyplot.subplots()

    yticklabels = []
    for idx, host in enumerate(report_section.keys()):
        if host == 'total' and not include_total:
            continue
        row_color = "C%d" % idx
        ax.barh(y=idx, width=report_section[host][write_key], color=row_color, edgecolor='black')
        ax.barh(y=idx, width=-report_section[host][read_key], color=row_color, edgecolor='black')
        yticklabels.append((idx, host))

    # this needs to immediately follow the barh since it alters tick widths
    if own_fig is not None:
        own_fig.set_size_inches(8.0, 2 + 6.0 / 35.0 * len(report_section))

    # Nothing was drawn: the labeling below needs at least one plotted row
    if not yticklabels:
        return ax

    # Index of the final row enumerated above (including a skipped 'total')
    last_idx = idx

    ax.grid() # or ax.xaxis.grid(True)
    ax.set_axisbelow(True)

    # Make the x axis symmetric so that zero sits in the middle
    max_x = max(abs(x) for x in ax.get_xlim())
    ax.set_xlim([-max_x, max_x])

    # Pin the tick positions before relabeling them with absolute values, so
    # that each label stays attached to the tick it was computed from.
    # Pinning ticks may widen the view limits, so the limits are re-applied.
    xticks = ax.get_xticks()
    ax.set_xticks(xticks)
    ax.set_xticklabels([int(abs(x)) for x in xticks], rotation=30)
    ax.set_xlim([-max_x, max_x])

    # x is in axes coordinates (0=left edge, 1=right edge), y in data coordinates
    trans = matplotlib.transforms.blended_transform_factory(
        ax.transAxes,
        ax.transData)
    ax.text(x=0.0, y=last_idx - 0.5, s="Read", transform=trans, ha="left", va="top")
    ax.text(x=1.0, y=last_idx - 0.5, s="Write", transform=trans, ha="right", va="top")

    yticks, yticklabels = zip(*yticklabels)
    ax.set_yticks(yticks)
    ax.set_yticklabels(yticklabels)
    # Setting the limits explicitly keeps the zero line below from rescaling y
    ax.set_ylim(ax.get_ylim())

    # Vertical divider at x=0 between the read and write halves
    ax.plot([0, 0], [-10, last_idx + 10], color='black', linewidth=1)

    return ax

In [None]:
# Usernames ordered by ascending ``io_gb``; the last ten entries are the
# ten largest, kept in that ascending order for plotting
sorted_keys = sorted(records.keys(), key=lambda user: records[user]['io_gb'])
sorted_records = collections.OrderedDict()
for user in sorted_keys[-10:]:
    sorted_records[user] = records[user]

In [None]:
# Read-vs-write bars for the users selected in ``sorted_records``
ax = plot_report_section(
    report_section=sorted_records,
    read_key='read_gb',
    write_key='write_gb')
ax.set_xlabel("GB (or GiB?) Moved")

## Show timeline of daily I/O volumes

In [None]:
# Date window for the timeline below; this rebinds the window defined
# earlier in the notebook
TARGET_DATE_START = datetime.datetime(year=2020, month=2, day=25)
TARGET_DATE_END = datetime.datetime(year=2020, month=3, day=25)

In [None]:
# Locate the daily HPSS report files that fall inside the current date window
report_filenames = tokio.tools.common.enumerate_dated_files(
    start=TARGET_DATE_START,
    end=TARGET_DATE_END,
    template=tokio.config.CONFIG.get('hpss_report_files'))

# One {'date', 'io_gb'} record per daily report
daily_totals = []
for report_filename in report_filenames:
    hpss_report = tokio.connectors.hpss.HpssDailyReport(report_filename)

    # The text after the final underscore of the path is parsed as YYYYMMDD
    datestamp = report_filename.split('_')[-1]
    date = datetime.datetime.strptime(datestamp, "%Y%m%d")

    application_totals = hpss_report[TARGET_SYSTEM]['io totals by client application']
    daily_totals.append({
        'date': date.date(),
        'io_gb': application_totals['total']['io_gb'],
    })

In [None]:
# Bar chart with one bar per daily record, indexed by date
totals_by_date = pandas.DataFrame.from_records(daily_totals, index='date')
ax = totals_by_date.plot.bar(legend=False)
ax.set_ylabel("GB (or GiB?) Moved")
ax.yaxis.grid()
ax.set_axisbelow(True)

# Keep the labels at odd positions and blank the rest (presumably to reduce
# crowding along the x axis)
thinned_labels = []
for position, label in enumerate(ax.get_xticklabels()):
    thinned_labels.append(label if position % 2 else None)
ax.set_xticklabels(thinned_labels, rotation=30, ha='right', fontsize='12')

# End on a statement rather than an expression so the cell has no text output
pass