# Fraction of peak I/O bandwidth utilized

Regenerate the "Users seldom achieve large fractions of peak I/O bandwidth" slide from the _2014 NERSC Workload Analysis_ slide deck.

In [None]:
# IPython magic: select matplotlib's inline backend so figures are rendered in the notebook output
%matplotlib inline

In [None]:
import datetime
import numpy

In [None]:
import tokio

In [None]:
import matplotlib
import matplotlib.pyplot

# The stock font size is hard to read on the figure, so raise it globally
matplotlib.rcParams.update({'font.size': 16})

In [None]:
# User-adjustable inputs: the time window to analyze and the file system to
# examine (file_system is used below as a key into peak_bandwidth)
date_start = datetime.datetime(year=2017, month=5, day=1)
date_end = date_start.replace(day=7)
file_system = 'scratch3'

In [None]:
# Peak bandwidth of each file system, keyed by file system name.
# NOTE(review): units are presumably the same as the plotted rates - confirm
peak_bandwidth = dict(
    cscratch=744,
    scratch1=24,
    scratch2=24,
    scratch3=36,
)

In [None]:
# Load data using pytokio. Beware: the wider the time range, the longer the
# load takes and the more memory it needs!

# Arguments common to both queries; only the dataset name differs
time_window_query = dict(
    fsname=file_system,
    datetime_start=date_start,
    datetime_end=date_end,
)

df_writes = tokio.tools.hdf5.get_dataframe_from_time_range(
    dataset_name='datatargets/writerates', **time_window_query)

df_reads = tokio.tools.hdf5.get_dataframe_from_time_range(
    dataset_name='datatargets/readrates', **time_window_query)

In [None]:
# For each of the two dataframes: scale bytes/sec down to GiB/sec, then total
# the GiB/sec over every OST column so there is one value per timestep
bytes_per_gib = 1 << 30
sum_writerates, sum_readrates = (
    (rates / bytes_per_gib).sum(axis=1) for rates in (df_writes, df_reads)
)

Use matplotlib's `hist` function to plot cumulative histograms of the read and write rates sampled.

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

peak = peak_bandwidth[file_system]

# One bin per unit of bandwidth from 0 up to and including the peak. range()
# excludes its stop value, so the +1 is needed for the last bin to end at the
# peak (the right-hand limit of the x axis) rather than one unit short of it.
# NOTE(review): samples beyond the last bin edge are presumably not counted by
# hist, so the curves are normalized over in-range samples only - confirm
# that this is the intended behavior.
bin_edges = range(0, peak + 1)

# Options shared by every curve: normalized, cumulative, drawn as an outline
cdf_options = dict(histtype='step', cumulative=True, density=True)

# Draw cumulative distribution functions. Each call hands back the per-bin
# values and the bin edges; store them under names that match the data that
# was actually binned (writes -> n_writes, reads -> n_reads).
n_writes, bins_writes, _ = ax.hist(sum_writerates,  # values to bin up
                                   bin_edges,       # bin extents
                                   label='writes',
                                   color='red',
                                   **cdf_options)
n_reads, bins_reads, _ = ax.hist(sum_readrates,  # values to bin up
                                 bin_edges,      # bin extents
                                 label='reads',
                                 color='blue',
                                 **cdf_options)
n_total, bins_total, _ = ax.hist(sum_writerates + sum_readrates,  # values to bin up
                                 bin_edges,                       # bin extents
                                 label='total',
                                 color='black',
                                 **cdf_options)

# Draw the 10% line
line = ax.axvline(peak / 10.0, label="10 % of peak", color='black', linestyle='--')

# Pretty up the plot
ax.set_xlim(0, peak)
ax.set_ylim(0, 1)

ax.legend()

# The rates plotted here were divided by 2**30, so the unit is GiB/s
ax.set_xlabel("Inferred LMT I/O rate (GiB / s)")
ax.set_ylabel("Fraction of time samples at or below rate")

Print out some of the values we calculated above: the histogram bin edges and the value computed for each bin.

In [None]:
# Display the bin edges returned by the ax.hist call that assigned bins_reads
bins_reads

In [None]:
# Display the per-bin values returned by the ax.hist call that assigned n_reads
n_reads