In [None]:
import json
from datetime import datetime, timedelta

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Input: JSON job dump to analyse.
# Alternative inputs (swap the active line to analyse a different dump):
#fname = 'bigpanda.hc_20324124.json'
#fname = 'bigpanda.hc20323822.tid1313.json'
fname = 'bigpanda.hc20324124.tid1337.json'
# Context manager closes the file handle as soon as parsing is done
with open(fname) as fh:
    jd = json.load(fh)

In [None]:
# Show the top-level keys of the loaded JSON document
jd.keys()


In [None]:
# Number of job records under the 'jobs' key
len(jd['jobs'])

In [None]:
# Inspect the first job record to see which fields are available
jd['jobs'][0]

In [None]:

# One DataFrame row per job record; display the row count as a sanity check
df = pd.DataFrame(jd['jobs'])
len(df)

In [None]:
# Clean up the dataset and derive additional per-job quantities.
# Keep only finished jobs. The explicit .copy() yields an independent frame,
# so the column assignments below never write into a slice of the unfiltered
# frame (avoids pandas' view-vs-copy ambiguity / SettingWithCopyWarning).
df = df[df.jobstatus=='finished'].copy()
print(len(df))

# fraction of the input file size that was read; the *1024 puts totrchar on
# the same scale as inputfilebytes (presumably totrchar is in kB — TODO confirm)
df['readfrac'] = df.totrchar*1024/df.inputfilebytes
# calculate input rate in MB/s (averaged over the full job duration)
df['readrate'] = df.totrchar/1024/df.durationsec
# events processed per second of job duration
df['evtrate'] = df.nevents/df.durationsec
# convert start/end time to date
df['starttime'] = pd.to_datetime(df['starttime'])
df['endtime'] = pd.to_datetime(df['endtime'])
# short CPU label: characters 2..15 of the cpuconsumptionunit string
df['cputype'] = df.cpuconsumptionunit.str[2:16]
# work load run-time: third '|'-separated field of the pilottiming string
df['wlruntime'] = df.pilottiming.str.split('|').str[2].astype(int)
# same quantities as above, but normalised to the workload run-time
df['wlreadrate'] = df.totrchar/1024/df.wlruntime
df['wlcpueff'] = df.cpuconsumptiontime/df.wlruntime



In [None]:
# Compare the workload-normalised read rate with the reported 'raterchar' column
df.plot.scatter(x='wlreadrate', y='raterchar');

In [None]:
# Input file size versus amount read, one point per job
df.plot.scatter(x='inputfilebytes', y='totrchar');

In [None]:
# Total volume read vs. total input file size, both in GB (1e9 bytes).
# totrchar is converted to bytes with the same *1024 factor that the readfrac
# column uses, so the two totals are directly comparable.
# NOTE(review): assumes totrchar is in kB of 1024 bytes — confirm against the data source.
print(f'total GB read {df.totrchar.sum()*1024/1e9:.3f}\ntotal GB filesize {df.inputfilebytes.sum()/1e9:.3f}')

In [None]:
# Basic distributions: one 30-bin histogram per derived/reported job quantity
pcols = ['readrate','cpuefficiency','wlruntime','readfrac','evtrate']

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 8), constrained_layout=True)
panels = axes.flatten()

for ax, col in zip(panels, pcols):
    ax.hist(df[col], bins=30)
    ax.set_xlabel(col)
    ax.set_ylabel('#-jobs')

# The 3x2 grid has more panels than plotted columns; hide the unused ones
# so no empty frame is drawn.
for ax in panels[len(pcols):]:
    ax.set_visible(False)



In [None]:
# Distributions split by CPU type, overlaid per panel with seaborn's histplot
hue_cols = ['readrate', 'cpuefficiency', 'wlreadrate', 'wlcpueff']

# Create subplots (one panel per quantity)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6), constrained_layout=True)
axes = axes.flatten()

for ax, col in zip(axes, hue_cols):
    # hue='cputype' draws one overlaid histogram per CPU label
    sns.histplot(data=df, ax=ax, x=col, hue='cputype', bins=30)


In [None]:
# Timeline of jobs and IO rate: determine the time window covered by the jobs.
# Earliest start is rounded down and latest end rounded up to a whole minute.
st = df['starttime'].min().floor('min')
et = df['endtime'].max().ceil('min')
# Length of the window in minutes
minutes_diff = (et - st).total_seconds() / 60
(st, et, minutes_diff)

In [None]:
# Sample the job population once per minute, starting at st:
#   counts[i] - number of jobs running at minute i (with a non-null readrate)
#   trate[i]  - sum of the per-job readrate of those jobs
nbins = int(minutes_diff+1)
bins = np.arange(nbins+1)   # bin edges in minutes since st
counts = np.zeros(nbins)
trate = np.zeros(nbins)
ct = st
for i in range(nbins):
    # A job counts as running at ct if it started strictly before and ended
    # strictly after ct. Select once and reuse for both aggregates.
    running = df.loc[(df.starttime < ct) & (df.endtime > ct), 'readrate']
    counts[i] = running.count()
    trate[i] = running.sum()
    ct += timedelta(minutes=1)

In [None]:
# Timeline figure: running jobs (left) and summed read rate (right) per minute
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
panels = [
    (axes[0], counts, '#-jobs'),
    (axes[1], trate, 'total rate (MB/s)'),
]
for ax, weights, ylabel in panels:
    # One entry per minute bin; the weights carry the precomputed values
    ax.hist(bins[:-1], bins, weights=weights)
    ax.set_xlabel('time (mins)')
    ax.set_ylabel(ylabel);
fig.suptitle('HC stress test transfers from panda job par');
#fig.savefig('hc_stress_es_jobpar.png')