# Defining the size of big jobs at NERSC

This notebook demonstrates how to quantify what fraction of a compute platform is typically used by "big" jobs.  We define "big" jobs as those that use a significant share of the system's resources, where that share could be measured as either:

1. total fraction of available compute nodes
2. total fraction of available CPU hours delivered in a given allocation year

In this work we chose definition #2 so that we do not bias ourselves towards very large but very short debug jobs.  We select the jobs that consumed the most cycles, then examine the job size distribution within this collection of big jobs.

In [None]:
%matplotlib inline

In [None]:
import pandas
import numpy
import matplotlib.pyplot

In [None]:
# Per-system node counts.  They serve two purposes in this notebook:
#   * thresholds when splitting Cori jobs into 'cori-knl' and 'cori-hsw', and
#   * denominators in define_bigjobs(), which divides each job's node count
#     by the matching value to express job size as a fraction of the system.
# NOTE(review): presumably the total number of compute nodes in each system
# during 2017 -- confirm against the system configuration of record.
MAX_EDISON = 5586.0
MAX_CORI_KNL = 9688.0
MAX_CORI_HSW = 2388.0

In [None]:
# Load the gzip-compressed CSV of job records into a single DataFrame.
# Later cells read the columns 'hostname', 'superclass', 'numnodes' and
# 'wallclock' from it.
# NOTE(review): presumably one row per job on Cori/Edison in 2017 -- confirm.
df = pandas.read_csv('cori-edison-jobs-2017.csv.gz')

In [None]:
# Node-hours per job: node count multiplied by wallclock, divided by 3600.
# NOTE(review): assumes 'wallclock' is recorded in seconds -- confirm against
# the data source.
df['nodehrs'] = df['numnodes'] * df['wallclock'] / 3600.0

## Distribution of cycles by job class

In [None]:
# Total node-hours for every (hostname, superclass) pair, shown in millions.
# print() is called with one parenthesised argument so the cell runs
# unchanged under both Python 2 and Python 3.
print("Millions of cycles delivered per superclass")
grouped = df.groupby(by=['hostname', 'superclass'])
grouped['nodehrs'].sum() / 1e6

In [None]:
# Express each superclass's node-hours as a percentage of its own hostname's
# total: the inner groupby(level=0) regroups the per-pair sums by hostname.
# print() is called with one parenthesised argument so the cell runs
# unchanged under both Python 2 and Python 3.
print("Percent of delivered cycles per superclass")
grouped['nodehrs'].sum().groupby(level=0).transform(lambda x: 100.0 * x / x.sum())

In [None]:
# Split the job table into one DataFrame per system.
#
# the below cori-hsw definition is WRONG (original author's note, kept as-is).
# NOTE(review): one visible problem is that the two Cori selections are not
# mutually exclusive -- a Cori job whose superclass is neither knl*, system,
# benchmark nor special, and which used more than MAX_CORI_HSW nodes, matches
# both masks.  Whether that is the issue the author meant is unconfirmed, so
# the selection logic is deliberately left unchanged here.
on_cori = df['hostname'] == 'cori'
knl_class = df['superclass'].str.startswith('knl')

# Too many nodes to fit on the Haswell partition, but within the KNL partition
knl_sized = (df['numnodes'] > MAX_CORI_HSW) & (df['numnodes'] <= MAX_CORI_KNL)

# Everything that is not KNL, system or benchmark ...
hsw_class = (~knl_class
             & (df['superclass'] != 'system')
             & (df['superclass'] != 'benchmark'))
# ... where 'special' jobs are admitted only if they fit on the Haswell nodes
hsw_sized = (df['superclass'] != 'special') | (df['numnodes'] <= MAX_CORI_HSW)

df_by_system = {
    'cori-knl': df[on_cori & (knl_class | knl_sized)].copy(),
    'cori-hsw': df[on_cori & (hsw_class & hsw_sized)].copy(),
    'edison': df[df['hostname'] == 'edison'].copy(),
}

# Collect garbage now that the dataframe has been split out; the helper masks
# are dropped as well so the cell leaves no extra names behind.
del df, on_cori, knl_class, knl_sized, hsw_class, hsw_sized

In [None]:
# Add a 'fraction_total' column to every per-system frame: each job's share
# of the node-hours delivered on that system.
for sys, system_jobs in df_by_system.items():
    system_jobs['fraction_total'] = system_jobs['nodehrs'] / system_jobs['nodehrs'].sum()

## Job Count Distribution

In [None]:
def plot_hist(df, key, title=""):
    """Draw a log-count histogram of one column and return its axes.

    Args:
        df: DataFrame containing the column `key`.
        key: name of the column to histogram; also used as the x-axis label.
        title: text placed above the figure.

    Returns:
        The matplotlib axes the histogram was drawn on.
    """
    figure, axes = matplotlib.pyplot.subplots(figsize=(12, 4))
    figure.suptitle(title)

    df[key].hist(bins=512, ax=axes)

    axes.set_xlabel(key)
    axes.set_ylabel("Job Count")
    # Job counts are shown on a log scale; the x range is fixed at 1..10000
    axes.set_yscale("log")
    axes.set_xlim((1, 10000))
    return axes

In [None]:
# For every system (in name order) print summary statistics of job size and
# node-hours, then draw the node-hour histogram.
# print() is called with one parenthesised argument so the cell runs
# unchanged under both Python 2 and Python 3.
for sys in sorted(df_by_system.keys()):
    df = df_by_system[sys]
    print("%s summary" % sys)
    print(df[['numnodes', 'nodehrs']].describe())
    plot_hist(df, 'nodehrs', sys.title())

## Cumulative Distribution of Cycles

### Helper functions to calculate and analyze CDFs

In [None]:
def calculate_csum(df, sys, by, norm=True):
    """Calculate the cumulative sum of one column, largest values first.

    Args:
        df: DataFrame containing the column `by`.
        sys: unused; kept so that existing callers keep working.
        by: name of the column to sort (descending) and accumulate.
        norm: when True, divide by the grand total so the result is a
            cumulative fraction whose last element is 1.0.

    Returns:
        When `norm` is True, a numpy array of floats in which element i is
        the share of the total held by the i+1 largest rows (an empty array
        for an empty `df`).  When `norm` is False, a plain list of the raw
        running totals, as before.
    """
    df_working = df.sort_values(by=by, ascending=False)

    # Accumulate with numpy rather than dividing a Python list: the old
    # `list /= total` only worked when the total happened to be a numpy
    # scalar, and raised TypeError otherwise (including for empty input).
    csum_y = numpy.cumsum(numpy.asarray(df_working[by], dtype=float))

    if not norm:
        return csum_y.tolist()

    # Nothing to normalize by when there are no rows
    if csum_y.size == 0:
        return csum_y

    # Normalize cumulative sum so we get a fraction
    return csum_y / csum_y[-1]

In [None]:
def plot_csum(csum_y):
    """Plot the cumulative sum function against the number of jobs.

    Args:
        csum_y: sequence of cumulative values (e.g. from calculate_csum());
            element i is treated as the total for the i+1 largest jobs.
    """
    fig, ax = matplotlib.pyplot.subplots(figsize=(12, 4))

    # Plot against 1..N so the x value is a true job count.  Plotting against
    # the implicit 0-based index put the largest job at x=0, which a log axis
    # cannot display, and shifted every other point by one job.
    jobcounts = numpy.arange(1, len(csum_y) + 1)
    ax.plot(jobcounts, csum_y)

    ax.set_xscale("log")
    ax.set_ylim((0.01, 1.0))
    ax.set_yticks(numpy.arange(0, 1, 0.1))
    ax.set_xlabel("Number of Jobs")
    ax.set_ylabel("Fraction of 2017 cycles consumed")
    ax.grid()

In [None]:
def define_bigjobs(df, sys, x):
    """Define what a 'big job' is and summarize how large such jobs are.

    The big jobs are the N largest jobs by node-hours, where N is the
    smallest count whose combined node-hours reach the fraction `x` of all
    node-hours in `df`.

    Args:
        df: DataFrame with 'nodehrs' and 'numnodes' columns.
        sys: one of 'edison', 'cori-knl' or 'cori-hsw'; selects the node
            count used as the denominator.
        x: target fraction of total node-hours (e.g. 0.5).

    Returns:
        The describe() summary of the big jobs' node counts, each expressed
        as a fraction of the system's node count.

    Raises:
        KeyError: if `sys` is not one of the known systems.
    """
    tot_nodes = {
        'edison': MAX_EDISON,
        'cori-knl': MAX_CORI_KNL,
        'cori-hsw': MAX_CORI_HSW,
    }
    csum_y = numpy.asarray(calculate_csum(df, sys, by='nodehrs', norm=True))

    if x <= 0 or len(csum_y) == 0:
        jobcount = 0
    else:
        # csum_y[i] is the fraction consumed by the i+1 largest jobs, so the
        # number of jobs needed is (first index with csum_y[i] >= x) + 1.
        # Truncating an interpolated index, as before, always stopped short
        # of x and selected no jobs at all when the largest job alone
        # exceeded x.
        jobcount = int(numpy.searchsorted(csum_y, x, side='left')) + 1
        # x above the final cumulative value cannot ask for more jobs than exist
        jobcount = min(jobcount, len(csum_y))

    df_sorted_by_nodehrs = df.sort_values(by='nodehrs', ascending=False)
    summary = (df_sorted_by_nodehrs.iloc[0:jobcount]['numnodes'] / tot_nodes[sys]).describe()

    return summary

In [None]:
def summarize_summary(summary, sys):
    """Print a readable digest of a define_bigjobs() summary.

    Args:
        summary: mapping (such as the Series returned by describe()) with
            the keys 'count', 'mean', 'min', '25%', '50%', '75%' and 'max';
            every value except 'count' is a fraction of the system's nodes.
        sys: system name inserted into the printed text.
    """
    # Ordered pairs rather than a dict, so the lines always print in this
    # order regardless of the interpreter's dict ordering.
    col_descriptions = (
        ("mean", "average"),
        ("min", "smallest"),
        ("25%", "25th percentile"),
        ("50%", "median"),
        ("75%", "75th percentile"),
        ("max", "biggest"),
    )
    # print() with one parenthesised argument behaves the same on Python 2 and 3
    print('N is %d jobs' % summary['count'])
    for stat, descr in col_descriptions:
        str_begin = '%s "big job" used' % descr
        print("%30s %5.1f%% of %s's nodes" % (str_begin, 100.0 * summary[stat], sys))

### Run some actual analysis

In [None]:
# Fraction of a system's total node-hours that the set of "big" jobs must
# account for; passed to define_bigjobs() in the final cell.
x = 0.50
# System whose cumulative distribution is plotted in the next cell; must be a
# key of df_by_system.
sys = 'cori-knl'

In [None]:
# Plot the normalized cumulative node-hours, largest jobs first, for the
# system chosen above.
plot_csum(calculate_csum(df_by_system[sys], sys, by='nodehrs', norm=True))

Let's look at all available systems now

In [None]:
# Report the big-job size statistics for every system.
# * round() before %d: %d truncates, so e.g. x = 0.29 (100.0 * x ==
#   28.999999999999996) would otherwise be reported as 28%.
# * sorted() gives the same system order as the summary loop earlier in the
#   notebook instead of an arbitrary dict order.
# * print() with one parenthesised argument runs on both Python 2 and 3.
print('Assuming "big" jobs are the N largest jobs who, in total, used up %d%% of the total cycles delivered:' % round(100.0 * x))
for sys in sorted(df_by_system.keys()):
    print("\n===== %s =====" % sys)
    summary = define_bigjobs(df_by_system[sys], sys, x)
    summarize_summary(summary, sys)