In [1]:
import sys
import numpy as np
import pandas as pd
import xarray as xr

from datetime import timedelta
import json
from ifremer_utilities import load_json, save_json

  _pyproj_global_context_initialize()


In [2]:
def load_db(db_file):
    with open(db_file, 'r') as stream:
        db = json.load(stream)
    return db

DB = load_db("../data/simulations_db.json")
# DB = load_db("839031.datarmor0.json")

In [3]:
DB['simulations']['run-template']

{'comment': None,
 'runs-on': 'datarmor',
 'deployment-plan': {'src': '${repo}/data/2008-2018-Deployment-Plan.json',
  'start': '01/01/2008',
  'end': '31/12/2018',
  'float-number': 1648},
 'velocity-field': {'name': 'GLORYS-NATL'},
 'argo-configuration': {'name': 'default', 'file': None},
 'execute': {'duration': 'days=30',
  'step': 'minutes=5',
  'record': 'seconds=3600/2',
  'dask_hpcconfig': {'name': 'datarmor-local', 'client': None}},
 'pbs': {'jobid': None,
  'log': {'debug': None},
  'submit': {'file': '${repo}/local_work/simu04.pbs',
   'queue': 'mpi',
   'walltime': '48:00:00',
   'mem': '250g'},
  'resources_used': {'cpupercent': None,
   'cput': None,
   'mem': None,
   'ncpus': None,
   'vmem': None,
   'walltime': None}},
 'output': {'file': None,
  'size-on-disk': None,
  'array-size': {'traj': None, 'obs': None},
  'index': None}}

In [4]:
run_list = DB['simulations']
run_list.keys()

dict_keys(['run-template', 'run-1010801', 'run-1009216', 'run-1009214', 'run-1009213', 'run-0923846', 'run-0913494', 'run-0912543', 'run-0908795', 'run-0885949', 'run-0851194', 'run-0711242', 'run-0691822', 'run-0653526', 'run-0614790', 'run-0613257', 'run-0542230', 'run-0538001'])

In [39]:
def sort_dict_by_value(d, sort=None):
    if sort is None:
        return d
    else:
        reverse = False if sort == 'ascent' else True
        return dict(sorted(d.items(), key=lambda x: x[1], reverse=reverse))


def read_a_file_size(raw_yaml_value, unit=None):
    if raw_yaml_value is None:
        val, txt = None, ''
    else:
        val = raw_yaml_value.lower()
        # Convert all values to kb:
        if 'k' in val:
            val = val.replace('k', '').replace('b', '')
            val = float(val)
        elif 'm' in val:
            val = val.replace('m', '').replace('b', '')
            val = float(val) * 1024
        elif 'g' in val:
            val = val.replace('g', '').replace('b', '').replace('i', '')
            val = float(val) * 1024 * 1024
        txt = "%0.2f Kb" % val

        # Then possibly change unit:
        if unit is not None:
            if unit.lower() == 'mb':
                val = val / 1024
                txt = "%0.2f Mb" % val
            elif unit.lower() == 'gb':
                val = val / 1024 / 1024
                txt = "%0.2f Gb" % val

    return val, txt

def readthiskey(run, keys):
    val = None
    if len(keys) == 1:
        if keys[0] in run:
            val = run[keys[0]]
        else:
            raise ValueError("%s not found" % keys[0])
    if len(keys) == 2:
        if keys[1] in run[keys[0]]:
            val = run[keys[0]][keys[1]]
        else:
            raise ValueError("%s not found" % keys[1])
    if len(keys) == 3:
        if keys[2] in run[keys[0]][keys[1]]:
            val = run[keys[0]][keys[1]][keys[2]]
        else:
            raise ValueError("%s not found" % keys[2])
    if len(keys) == 4:
        if keys[3] in run[keys[0]][keys[1]][keys[2]]:
            val = run[keys[0]][keys[1]][keys[2]][keys[3]]
        else:
            raise ValueError("%s not found" % keys[3])
    return val

def convertthiskeyvalue(val, key, unit=None):
    if key in ['walltime', 'cput']:
        val, txt = read_a_timestamp(val, unit=unit)

    if key in ['mem', 'vmem', 'size-on-disk']:
        val, txt = read_a_file_size(val, unit=unit)

    return val

def listvalues(runs, keys, unit=None, sort=None):
    results = {}
    for run in runs:
        if "template" not in run:
            value = readthiskey(runs[run], keys)
            if value is not None:
                value = convertthiskeyvalue(value, keys[-1], unit=unit)
                results.update({run: value})
    return sort_dict_by_value(results, sort=sort)


In [41]:
grep = 'GSE-ext'

class some_args():
    sort = 'ascent'
    sortby = None
    sortby = 'sim.size'
    # sortby = 'pbs.vmem'

    reverse = False
    reverse = True
args = some_args()

# args = {'reverse': False, 'sortby': None}

if args.sortby is not None:
    if args.sortby in ['pbs.mem', 'mem']:
        keys = ['pbs', 'resources_used', 'mem']
    elif args.sortby in ['pbs.vmem', 'vmem']:
        keys = ['pbs', 'resources_used', 'vmem']
    elif args.sortby in ['pbs.cpupercent', 'cpupercent']:
        keys = ['pbs', 'resources_used', 'cpupercent']
    elif args.sortby in ['pbs.cput', 'cput']:
        keys = ['pbs', 'resources_used', 'cput']
    elif args.sortby in ['pbs.ncpus', 'ncpus']:
        keys = ['pbs', 'resources_used', 'ncpus']
    elif args.sortby in ['pbs.walltime', 'walltime']:
        keys = ['pbs', 'resources_used', 'walltime']
    elif args.sortby in ['sim.traj', 'traj']:
        keys = ['output', 'array-size', 'traj']
    elif args.sortby in ['sim.obs', 'obs']:
        keys = ['output', 'array-size', 'obs']
    elif args.sortby in ['sim.size', 'size']:
        keys = ['output', 'size-on-disk']
    listedvalues = listvalues(run_list, keys, sort=args.sort)
    run_order = listedvalues.keys()
    run_order = run_order if not args.reverse else reversed(run_order)
else:
    listedvalues = None
    run_order = run_list.keys() if not args.reverse else reversed(run_list.keys())

print(run_order)        
        
keys = []
for run in run_order:
    if 'template' not in run and grep in run_list[run]['comment']:
        keys.append(run)
keys        

<dict_reversekeyiterator object at 0x165a01b80>


['run-1009214', 'run-1009216', 'run-1010801']

In [10]:
run_list.keys()

dict_keys(['run-template', 'run-1010801', 'run-1009216', 'run-1009214', 'run-1009213', 'run-0923846', 'run-0913494', 'run-0912543', 'run-0908795', 'run-0885949', 'run-0851194', 'run-0711242', 'run-0691822', 'run-0653526', 'run-0614790', 'run-0613257', 'run-0542230', 'run-0538001'])

In [16]:
import collections
od = collections.OrderedDict(sorted(DB['simulations'].items(), reverse=True))
DB['simulations'] = od
DB['simulations'].keys()

odict_keys(['run-template', 'run-923846', 'run-913494', 'run-912543', 'run-908795', 'run-885949', 'run-851194', 'run-711242', 'run-691822', 'run-653526', 'run-614790', 'run-613257', 'run-542230', 'run-538001', 'run-1010801', 'run-1009216', 'run-1009214', 'run-1009213'])

In [32]:
# sorted([run.split("run-")[-1] for run in DB['simulations']])
RUNS = {}
RUNS["run-template"] = DB['simulations']["run-template"]
for run in DB['simulations']:
    if 'template' not in run:
        run_id = int(run.split("run-")[-1])
        # print("%0.7d" % run_id)
        RUNS["run-%0.7d" % run_id] = DB['simulations'][run]
# RUNS.keys()   
od = collections.OrderedDict(sorted(RUNS.items(), reverse=True))
DB['simulations'] = od
DB['simulations'].keys()

In [35]:
# save_json(DB, '../data/simulations_db.json')

{'thermodynamic': {'GLORYS': {'src': '/home/ref-ocean-reanalysis/global-reanalysis-phy-001-030-daily',
   'description': 'https://doi.org/10.48670/moi-00021',
   'comment': None,
   'size-on-disk': '11.8Tb',
   'array-size': {'x': 4320, 'y': 2041, 'z': 50},
   'start': '01/01/1993',
   'end': '31/12/2018',
   'step': 'daily'},
  'GLORYS-NATL-init': {'src': '/home/datawork-lops-bluecloud/natl-reanalysis-phy-001-030-daily',
   'description': 'https://doi.org/10.48670/moi-00021',
   'comment': 'Global product subsetted to the North-Atlantic. Deprecated, replaced by GLORYS-NATL',
   'size-on-disk': '220Gb',
   'array-size': {'x': 961, 'y': 721, 'z': 41},
   'domain': {'lon_min': -90,
    'lon_max': -10,
    'lat_min': 10,
    'lat_max': 70,
    'dpt_min': 0.494025,
    'dpt_max': 2225.078},
   'start': '01/01/2008',
   'end': '31/12/2018',
   'step': 'daily'},
  'GLORYS-NATL': {'src': '/home/datawork-lops-bluecloud/natl-reanalysis-phy-001-030-daily',
   'description': 'https://doi.org/10.4

2022-07-11 14:37:05,202 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [46]:
run_list['run-711242']['pbs']['resources_used']['walltime']

147593

In [47]:
def sexagesimal2timedelta(sex):
    # https://stackoverflow.com/questions/52732222/in-the-yaml-format-file-as-test-yaml-some-mac-addresses-will-be-judged-as-numbe
    a, SS = np.divmod(sex, 60)
    a, MM = np.divmod(a, 60)
    a, HH = np.divmod(a, 60)
    return pd.Timedelta(SS+MM*60+HH*3600, unit='sec')

def sexagesimal2str(sex):
    val = sexagesimal2timedelta(sex)
    HH = np.fix(val.total_seconds() / 60 / 60)
    MM = np.fix((val.total_seconds() - HH*60*60) / 60)
    SS = val.total_seconds() - HH*60*60 - MM*60
    return "%0.2d:%0.2d:%0.2d" % (HH,MM,SS)

raw_yaml_value = run_list['run-711242']['pbs']['resources_used']['walltime']

# val = sexagesimal2timedelta(raw_yaml_value)
val = sexagesimal2str(raw_yaml_value)
val

'40:59:53'

In [48]:
for run in DB['simulations']:
    raw_yaml_value = DB['simulations'][run]['pbs']['resources_used']['walltime']
    if raw_yaml_value is not None:
        val = sexagesimal2str(raw_yaml_value)
        print(run, raw_yaml_value, val)
        DB['simulations'][run]['pbs']['resources_used']['walltime'] = val
        
    raw_yaml_value = DB['simulations'][run]['pbs']['resources_used']['cput']
    if raw_yaml_value is not None:
        val = sexagesimal2str(raw_yaml_value)
        print(run, raw_yaml_value, val)
        DB['simulations'][run]['pbs']['resources_used']['cput'] = val
        

run-777823 172974 48:02:54
run-777823 190207 52:50:07
run-711242 147593 40:59:53
run-711242 158203 43:56:43
run-691822 158857 44:07:37
run-691822 170974 47:29:34
run-653526 151472 42:04:32
run-653526 164528 45:42:08
run-614790 160133 44:28:53
run-614790 259913 12:11:53
run-613257 152870 42:27:50
run-613257 247308 08:41:48
run-542230 138360 38:26:00
run-542230 149174 41:26:14
run-538001 147214 40:53:34
run-538001 157998 43:53:18


In [32]:
run_list['run-711242']['pbs']['resources_used']['walltime']

'40:59:53'

In [71]:
with open("../data/simulations_db.json3", 'w') as stream:
    json.dump(DB, stream, indent=2)

In [51]:
def str2timedelta(txt):
    """Convert HH:MM:SS string to Timedelta"""
    HH = txt.split(":")[0]
    MM = txt.split(":")[1]
    SS = txt.split(":")[2]
    return pd.Timedelta(SS+MM*60+HH*3600, unit='sec')
str2timedelta("38:26:00")

ValueError: unit must not be specified if the value is a str

In [52]:
pd.Timedelta("38:26:00")

Timedelta('1 days 14:26:00')

In [55]:
import os
os.getenv('PBS_JOBID') if os.getenv('PBS_JOBID') else ""

''

In [76]:
from datetime import timedelta
duration = "days=365*10"
eval("timedelta(%s)" % duration)

datetime.timedelta(days=3650)

In [43]:
resources_used = {
          "cpupercent": 0,
          "cput": "",
          "mem": "",
          "ncpus": 0,
          "vmem": "",
          "walltime": ""
        }

s = """
PBS Job Id: 1132757.datarmor0
Job Name:   Virt_Fleet
Execution terminated
Exit_status=0
resources_used.cpupercent=108
resources_used.cput=45:39:36
resources_used.mem=35449048kb
resources_used.ncpus=28
resources_used.vmem=63648244kb
resources_used.walltime=41:57:17
"""

for line in s.split("\n"):
    if '.cpupercent' in line:
        resources_used['cpupercent'] = int(line.split("=")[-1])
    if '.cput' in line:
        resources_used['cput'] = line.split("=")[-1]
    if '.mem' in line:
        resources_used['mem'] = line.split("=")[-1]
    if '.ncpus' in line:
        resources_used['ncpus'] = line.split("=")[-1]
    if '.vmem' in line:
        resources_used['vmem'] = line.split("=")[-1]
    if '.walltime' in line:
        resources_used['walltime'] = line.split("=")[-1]
        
print(json.dumps(resources_used)        )

{"cpupercent": 108, "cput": "45:39:36", "mem": "35449048kb", "ncpus": "28", "vmem": "63648244kb", "walltime": "41:57:17"}


In [6]:
json.dumps(resources_used)

'{"cpupercent": 165, "cput": "79:49:07", "mem": "18367108kb", "ncpus": "28", "vmem": "265668188kb", "walltime": "48:05:09"}'

In [92]:
last_line = " 34% (894600.0 of 2592000.0) |###        | Elapsed Time: 0:04:08 ETA:   0:10:02"
last_line.split("%")[0]

' 34'

In [6]:
pd.to_datetime('07/08/2022 06:52:54') - pd.to_datetime('07/06/2022 02:19:03')

Timedelta('2 days 04:33:51')