# HPSS Object Size Distribution

Generate histograms of object count and object mass (total bytes stored) by object size for all data objects in NERSC's 200 PB HPSS tape archive.

In [None]:
import glob
import math

In [None]:
import matplotlib
import matplotlib.pyplot

In [None]:
import dask
import dask.config
import dask.bag

In [None]:
import dask.diagnostics
# Register a dask ProgressBar so that later dask computations report progress.
dask.diagnostics.ProgressBar().register()

In [None]:
# Select dask's 'processes' scheduler for the computations in this notebook.
dask.config.set(scheduler='processes')

In [None]:
# Raw text input; each line is parsed as one integer object size by mapper().
INPUT_FILE = 'hpss_file_sizes_20201016.log'
# Glob pattern for the cached, already-cleaned sizes written by dask.
INPUT_CACHE = 'hpss_file_sizes_20201016-*.log.gz'
# Destination of the final per-bin histogram table.
OUTPUT_HIST_CSV = "hpss_file_size_hist_20201016.csv"

# Optional file system histograms (CSV with 'bin_size' and 'num_files' columns)
# used for the comparison plot; they are skipped if the files are missing.
CSCRATCH_DIST = 'cscratch_20190115_sizebytype_hist.csv'
PROJECT2_DIST = 'tlproject2_20181109_sizebytype_hist.csv'

In [None]:
def mapper(line):
    """Parse one line of the raw dump into an integer object size.

    Surrounding whitespace is ignored.  Lines that do not hold a valid
    integer yield None so the caller can filter them out.
    """
    text = line.strip()
    try:
        size = int(text)
    except ValueError:
        size = None
    return size

def binner(value):
    """Return the power-of-two bin number for an object size.

    Bin 0 holds only zero-byte objects.  Bin n (n >= 1) holds sizes in
    [2**(n-1), 2**n), so the start of each bin is inclusive.

    Args:
        value: Non-negative object size in bytes.  Non-integer values are
            truncated toward zero before binning.

    Returns:
        The integer bin number.

    Raises:
        ValueError: If value is negative.
    """
    if value < 0:
        raise ValueError("object size must be non-negative, got %r" % (value,))
    # bit_length() is exact integer arithmetic: floor(log2(n)) + 1 for n > 0
    # and 0 for n == 0.  A floating-point log misplaces sizes just below a
    # large power of two (e.g. 2**60 - 1) into the next bin up.
    return int(value).bit_length()

def humanscale(value):
    """Format a byte count using the largest binary unit that fits.

    Values of at least 1 KiB are shown as a whole number of the largest
    unit (KiB through PiB) not exceeding them, with the fraction dropped.
    Smaller values are shown as a plain integer with no unit.
    """
    # Ordered largest-first so the first match is the biggest fitting unit.
    units = (
        ("PiB", 2**50),
        ("TiB", 2**40),
        ("GiB", 2**30),
        ("MiB", 2**20),
        ("KiB", 2**10),
    )
    for suffix, factor in units:
        if value >= factor:
            return "%d %s" % (value / factor, suffix)
    return "%d" % int(value)

In [None]:
# Use the cached sizes when any cache file exists; otherwise parse the raw
# input, drop unparseable lines, and write the cache for next time.
cache_files = glob.glob(INPUT_CACHE)
if not cache_files:
    print("Loading %s from raw input" % INPUT_FILE)
    parsed = dask.bag.read_text(INPUT_FILE).map(mapper)
    db = parsed.filter(lambda size: size is not None)
    print("Writing out %s" % INPUT_CACHE)
    db.map(str).to_textfiles(INPUT_CACHE)
else:
    print("Loading %s from cache" % INPUT_CACHE)
    db = dask.bag.read_text(INPUT_CACHE).map(int)

# Turn the bag of sizes into a single-column dask DataFrame.
dd = db.to_dataframe(columns=["size (bytes)"])

## Generate Histograms

Bin all objects by size into power-of-two bins, then aggregate each bin's object count and total size.

In [None]:
# Tag every object with the bin number that binner() assigns to its size.
dd['bin num'] = dd['size (bytes)'].map(binner)

In [None]:
# Per bin, count the objects and sum their sizes; compute() materializes the
# dask result.
dataframe = dd.groupby('bin num').agg(['count', 'sum']).compute()

In [None]:
# Bin 0 starts at 0 bytes; bin n (n > 0) starts at 2**(n-1) bytes.
bin_starts = dataframe.index.map(lambda bin_num: 2**(bin_num - 1) if bin_num > 0 else 0)
dataframe['bin start'] = bin_starts
# Human-readable form of each bin start, used as tick labels in the plots.
dataframe['bin start (human)'] = dataframe['bin start'].map(humanscale)

In [None]:
# Friendlier names for the flattened aggregate columns.
REMAP_COLS = {
    'size (bytes) count': 'object count',
    'size (bytes) sum': 'object size sum (bytes)',
}

# Join each multi-level column label into one space-separated string, then
# rename the aggregate columns; names without a mapping are kept as they are.
flat_names = [' '.join(levels).strip() for levels in dataframe.columns.values]
dataframe.columns = [REMAP_COLS.get(name, name) for name in flat_names]

In [None]:
# Running totals across bins, in index order, for object count and total size.
for total_col, running_col in (
    ('object count', 'object count cumul sum'),
    ('object size sum (bytes)', 'object size cumul sum (bytes)'),
):
    dataframe[running_col] = dataframe[total_col].cumsum()

## Generate Plots

In [None]:
# Histogram of the number of objects in each size bin.
fig, ax = matplotlib.pyplot.subplots()

millions_of_objects = dataframe['object count'] / 1e6
ax.bar(dataframe.index, millions_of_objects, width=1, edgecolor='black')
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Millions of objects")
ax.set_xlabel("Object Size")
# Tick every eighth bin and label it with that bin's human-readable start.
every_eighth = slice(None, None, 8)
ax.set_xticks(dataframe.index[every_eighth])
ax.set_xticklabels(dataframe['bin start (human)'][every_eighth], rotation=30, ha='right')

In [None]:
# Histogram of the total bytes stored in each size bin.
fig, ax = matplotlib.pyplot.subplots()

# Dividing by 2**50 converts bytes to pebibytes (PiB), matching the binary
# units on the x-axis labels; the y-axis label states that unit.
ax.bar(dataframe.index, dataframe['object size sum (bytes)'] / 2**50, width=1, edgecolor='black')
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Pebibytes (PiB) of data")
ax.set_xlabel("Object Size")
# Tick every eighth bin and label it with that bin's human-readable start.
ax.set_xticks(dataframe.index[::8])
ax.set_xticklabels(dataframe['bin start (human)'][::8], rotation=30, ha='right')

In [None]:
# Cumulative distribution of object count over size bins.
fig, ax = matplotlib.pyplot.subplots()

running_count = dataframe['object count cumul sum']
# Divide by the final running total so the curve ends at 1.
ax.plot(dataframe.index, running_count / running_count.iloc[-1])
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Cumulative fraction of all objects")
ax.set_xlabel("Object Size")
# Tick every eighth bin and label it with that bin's human-readable start.
every_eighth = slice(None, None, 8)
ax.set_xticks(dataframe.index[every_eighth])
ax.set_xticklabels(dataframe['bin start (human)'][every_eighth], rotation=30, ha='right')

In [None]:
# Cumulative distribution of stored bytes over size bins.
fig, ax = matplotlib.pyplot.subplots()

running_bytes = dataframe['object size cumul sum (bytes)']
# Divide by the final running total so the curve ends at 1.
ax.plot(dataframe.index, running_bytes / running_bytes.iloc[-1])
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Cumulative fraction of data")
ax.set_xlabel("Object Size")
# Tick every eighth bin and label it with that bin's human-readable start.
every_eighth = slice(None, None, 8)
ax.set_xticks(dataframe.index[every_eighth])
ax.set_xticklabels(dataframe['bin start (human)'][every_eighth], rotation=30, ha='right')

## Display/save numeric histograms

In [None]:
# Write the full per-bin histogram table to CSV, then preview its first rows.
dataframe.to_csv(OUTPUT_HIST_CSV)
dataframe.head()

## Compare to file systems

In [None]:
import pandas

In [None]:
# Load the Lustre scratch histogram if it is available; the comparison plot
# treats None as "dataset not present".
try:
    # The first CSV row is skipped (presumably a bin with no HPSS counterpart;
    # confirm against the dataset).  .copy() gives an independent frame so the
    # column assignments below do not act on a slice of the parsed CSV.
    cscratch_hist = pandas.read_csv(CSCRATCH_DIST)[['bin_size', 'num_files']][1:].copy()
except FileNotFoundError:
    cscratch_hist = None

if cscratch_hist is not None:
    cscratch_hist.columns = ['bin start', 'object count']
    cscratch_hist.index.name = "bin num"
    # The fs datasets label bins by their inclusive ends, not inclusive starts,
    # so halve each label to get the bin start.  Series.map returns a new
    # Series, so the result has to be assigned back to take effect.
    cscratch_hist['bin start'] = cscratch_hist['bin start'].map(lambda x: int(x/2.0))
    cscratch_hist['object count cumul sum'] = cscratch_hist['object count'].cumsum()
    cscratch_hist['bin start (human)'] = cscratch_hist['bin start'].map(humanscale)
    # NOTE(review): inside an `if` block this value is not auto-displayed by
    # the notebook; wrap it in print()/display() if a preview is wanted.
    cscratch_hist.head()

In [None]:
# Load the GPFS project histogram if it is available; the comparison plot
# treats None as "dataset not present".
try:
    # The first CSV row is skipped (presumably a bin with no HPSS counterpart;
    # confirm against the dataset).  .copy() gives an independent frame so the
    # column assignments below do not act on a slice of the parsed CSV.
    project2_hist = pandas.read_csv(PROJECT2_DIST)[['bin_size', 'num_files']][1:].copy()
except FileNotFoundError:
    project2_hist = None

if project2_hist is not None:
    project2_hist.columns = ['bin start', 'object count']
    project2_hist.index.name = "bin num"
    # The fs datasets label bins by their inclusive ends, not inclusive starts,
    # so halve each label to get the bin start.  Series.map returns a new
    # Series, so the result has to be assigned back to take effect.
    project2_hist['bin start'] = project2_hist['bin start'].map(lambda x: int(x/2.0))
    project2_hist['object count cumul sum'] = project2_hist['object count'].cumsum()
    project2_hist['bin start (human)'] = project2_hist['bin start'].map(humanscale)
    # NOTE(review): inside an `if` block this value is not auto-displayed by
    # the notebook; wrap it in print()/display() if a preview is wanted.
    project2_hist.head()

In [None]:
# Overlay the cumulative object-count distributions of the tape archive and
# whichever file system datasets were loaded.
fig, ax = matplotlib.pyplot.subplots(figsize=(8,4))

archive_running = dataframe['object count cumul sum']
ax.plot(dataframe.index, archive_running / archive_running.iloc[-1], label="Tape Archive (Oct 2020)")

# The file system histograms are optional; None means the CSV was not found.
optional_series = (
    (cscratch_hist, "Lustre Scratch (Jan 2019)"),
    (project2_hist, "GPFS Project (Nov 2018)"),
)
for fs_hist, fs_label in optional_series:
    if fs_hist is None:
        continue
    fs_running = fs_hist['object count cumul sum']
    ax.plot(fs_hist.index, fs_running / fs_running.iloc[-1], label=fs_label)

ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Cumulative fraction of all objects")
ax.set_xlabel("Object Size")
# Tick every eighth bin and label it with that bin's human-readable start.
every_eighth = slice(None, None, 8)
ax.set_xticks(dataframe.index[every_eighth])
ax.set_xticklabels(dataframe['bin start (human)'][every_eighth], rotation=30, ha='right')
ax.legend()
ax.set_xlim(0, 41)