# Datasets for archival

In [107]:
import glob

import cf_xarray
import dask
import distributed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

import pump

In [3]:
import ncar_jobqueue

# Create a dask cluster through ncar_jobqueue, charging the given account and
# exposing the scheduler dashboard on port 9797, then request 12 workers.
cluster = ncar_jobqueue.NCARCluster(
    account="ncgd0048",
    scheduler_options={"dashboard_address": ":9797"},
)
cluster.scale(12)

In [105]:
# Connect a distributed client to the cluster created earlier in the notebook
# and register it as dask's default scheduler for subsequent computations.
client = distributed.Client(cluster)
dask.config.set(scheduler=client)
# Last expression of the cell: display the client summary.
client

0,1
Client  Scheduler: tcp://10.12.205.30:45283  Dashboard: https://jupyterhub.ucar.edu/dav/user/dcherian/proxy/9797/status,Cluster  Workers: 12  Cores: 12  Memory: 300.00 GB


## Attributes

In [133]:
# Global (file-level) attributes applied to every archived dataset.
dataset_attrs = dict(
    Conventions="CF-1.6",
    institution="National Center for Atmospheric Research",
    source="MIT General Circulation Model (MITgcm) checkpoint 64v",
    references=(
        "Cherian et al. (2021) Off-equatorial deep-cycle turbulence forced by "
        "Tropical Instability Waves in the equatorial Pacific. "
        "Journal of Physical Oceanography."
    ),
)

In [138]:
# Per-variable attributes, keyed by variable name, merged into each variable's
# ``attrs`` before the archival files are written.
variable_attrs = {
    # Coordinates
    "latitude": {"standard_name": "latitude", "units": "degrees_north", "axis": "Y"},
    "longitude": {"standard_name": "longitude", "units": "degrees_east", "axis": "X"},
    "time": {"standard_name": "time"},
    "depth": {"units": "m", "positive": "up"},
    # State variables
    "ETAN": {"standard_name": "sea_surface_height", "units": "m"},
    "theta": {"standard_name": "sea_water_potential_temperature", "units": "degC"},
    "salt": {"standard_name": "sea_water_practical_salinity", "units": "psu"},
    "u": {"standard_name": "eastward_sea_water_velocity", "units": "m/s"},
    "v": {"standard_name": "northward_sea_water_velocity", "units": "m/s"},
    "w": {"standard_name": "upward_sea_water_velocity", "units": "m/s"},
    # Surface fluxes
    "oceQnet": {
        "long_name": "net surface heat flux into the ocean",
        # Fixed: the units string previously carried a trailing space.
        "units": "W/m^2",
        "description": "(+=down), >0 increases theta",
    },
    "oceQsw": {
        "long_name": "net short wave radiation into the ocean",
        "units": "W/m^2",
        "description": "(+=down), >0 increases theta",
    },
    # Mixing diagnostics
    "DFrI_TH": {
        "units": "degC m^3/s",
        "long_name": "Vertical Diffusive Flux of Pot.Temperature (Implicit part)",
    },
    "KPP_diffusivity": {
        "long_name": "KPP diffusivity for temperature",
        # Fixed: a diffusivity has units of m^2/s (as for KPPdiffT below),
        # not m^2/s^2.
        "units": "m^2/s",
    },
    "KPPhbl": {"long_name": "KPP boundary layer depth", "units": "m"},
    "KPPRi": {"long_name": "KPP bulk Richardson number", "units": "1"},
    "KPPbo": {"long_name": "Surface turbulence buoyancy forcing", "units": "m^2/s^3"},
    "KPPviscA": {
        "long_name": "KPP vertical eddy viscosity coefficient",
        "units": "m^2/s",
    },
    "KPPdiffT": {
        "long_name": "KPP Vertical diffusion coefficient for heat",
        "units": "m^2/s",
    },
    "KPPg_TH": {
        "long_name": "KPP non-local flux of potential temperature",
        "units": "degC m^3/s",
    },
    # Momentum diagnostics
    "VISrI_Um": {
        "long_name": "Vertical   Viscous Flux of U momentum (Implicit part)",
        "units": "m^4/s^2",
    },
    "VISrI_Vm": {
        # Fixed: this entry was copy-pasted from VISrI_Um and said "U momentum".
        "long_name": "Vertical   Viscous Flux of V momentum (Implicit part)",
        "units": "m^4/s^2",
    },
    "Um_Diss": {"long_name": "U momentum tendency from Dissipation", "units": "m/s^2"},
    "Vm_Diss": {"long_name": "V momentum tendency from Dissipation", "units": "m/s^2"},
}

## Full domain TIW season

In [196]:
# Directory the full-domain input files are globbed and opened from.
dirname = "/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC"
# Directory the processed full-domain files (and metrics.nc) are written to.
outdir = "/glade/campaign/cgd/oce/people/dcherian/Cherian-et-al-2021-TIW/full_domain"

In [5]:
# Sorted list of every netCDF-like file (``*.nc*``) in the input directory.
files = sorted(glob.glob(f"{dirname}/*.nc*"))
# Preview the first few paths.
files[:10]

['/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0001.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0001_hb.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0001_sf.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0002.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0002_hb.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0002_sf.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0003.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0003_hb.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0003_sf.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/full_domain/HOLD_NC/Day_0004.nc']

In [7]:
# Take every third path (the listing above shows three files per prefix), keep
# the first 8 characters of its file name (e.g. "Day_0001"), and de-duplicate.
patterns = np.unique([path.rsplit("/", 1)[-1][:8] for path in files[::3]])
patterns[:10]

array(['Day_0001', 'Day_0002', 'Day_0003', 'Day_0004', 'Day_0005',
       'Day_0006', 'Day_0007', 'Day_0008', 'Day_0009', 'Day_0010'],
      dtype='<U8')

In [118]:
def process(patterns):
    """Average a group of full-domain output files and write one archival file.

    For every pattern the three files ``{pattern}.nc``, ``{pattern}_hb.nc`` and
    ``{pattern}_sf.nc`` in the global ``dirname`` are opened, merged, and
    subset with ``depth=slice(-305)``. When more than one pattern is given, the
    merged datasets are concatenated and averaged along ``time``; the result is
    stamped with the first dataset's time plus 8 hours. ``TFLUX`` is dropped,
    compression settings and the attributes from ``variable_attrs`` /
    ``dataset_attrs`` are applied, and latitudes -10..10 are written to
    ``{outdir}/{last pattern}.nc``.

    Parameters
    ----------
    patterns : str or sequence of str
        File-name prefix(es), e.g. ``"Day_0002"``. A single string is treated
        as a one-element group. Must not be empty.

    Returns
    -------
    sequence of str
        The patterns that were processed (a list if a single string was given).

    Notes
    -----
    Reads the module-level names ``dirname``, ``outdir``, ``variable_attrs``
    and ``dataset_attrs``.
    """
    # Imported for its side effect: the ``.cf`` accessor used below.
    # Presumably repeated here so the accessor is registered wherever the
    # function runs (e.g. on a dask worker) -- TODO confirm.
    import cf_xarray  # noqa: F401

    cmpr = dict(zlib=True, complevel=4, shuffle=True)

    # Currently unused: retained for the disabled ``least_significant_digit``
    # option in the loop below.
    decimal_places = {
        "u": 4,
        "v": 4,
        "w": 9,
        "salt": 4,
        "theta": 4,
        "DFrI_TH": 4,
        "oceQsw": 3,
        "oceQnet": 3,
    }

    if isinstance(patterns, str):
        patterns = [patterns]

    # The output file is named after the last pattern of the group. This was
    # previously implicit, via the loop variable leaking out of the loop below.
    out_name = patterns[-1]

    # Track every opened file so it can be closed once the output is written.
    opened = []
    try:
        datasets = []
        for pattern in patterns:
            members = []
            for suffix in ["", "_hb", "_sf"]:
                member = xr.open_dataset(f"{dirname}/{pattern}{suffix}.nc", chunks=-1)
                opened.append(member)
                members.append(member)
            datasets.append(xr.merge(members).sel(depth=slice(-305)))

        if len(datasets) > 1:
            with xr.set_options(keep_attrs=True):
                dset = xr.concat(datasets, "time").mean("time")
                # The mean removes ``time``; restore it as the first dataset's
                # time stamp shifted by 8 hours.
                dset["time"] = (
                    "time",
                    datasets[0].time.data + pd.Timedelta("8h"),
                    datasets[0].time.attrs,
                )
        else:
            dset = datasets[0]

        dset = dset.cf.guess_coord_axis()
        dset = dset.drop_vars("TFLUX")
        for var in dset.variables:
            # Start from the on-disk encoding of the first input, then switch
            # on compression. "contiguous" is dropped before compression is
            # enabled; ``pop`` tolerates variables that never had the key.
            encoding = dict(datasets[0][var].encoding)
            encoding.pop("contiguous", None)
            encoding.update(cmpr)
            dset[var].encoding = encoding

            # Fixed: this test previously used the undefined name ``attrs``.
            if var in variable_attrs:
                dset[var].attrs.update(variable_attrs[var])
            # if var in decimal_places:
            #    dset[var].encoding["least_significant_digit"] = decimal_places[var]

        # Remove the inherited long_name if there is one.
        dset["time"].attrs.pop("long_name", None)

        # Build a fresh dict so the shared ``dataset_attrs`` is never modified.
        dset.attrs = {
            **dataset_attrs,
            "title": "Daily averaged fields from tropical Pacific cold tongue regional model simulation",
        }

        # Load and write with the single-threaded scheduler; presumably to
        # avoid nested parallelism inside a worker task -- TODO confirm.
        with dask.config.set(scheduler="single-threaded"):
            dset.sel(latitude=slice(-10, 10)).load().to_netcdf(f"{outdir}/{out_name}.nc")
    finally:
        for member in opened:
            member.close()

    return patterns


# dset = process(patterns[0])
# dset.cf.describe()

In [101]:
# Skip the first pattern and arrange the next 1458 patterns into 243 rows of 6.
# Each row is later passed to `process` as one group to be averaged together.
reshaped = np.array(patterns[1:1459]).reshape(1458//6, 6)

In [119]:
# Build one lazy `process` task per group; only the first 24 groups are
# scheduled here.
tasks = [dask.delayed(process)(row) for row in reshaped[:24]]

In [120]:
# Execute the delayed tasks; the result is a tuple holding each task's return
# value (the group of patterns it processed).
dask.compute(*tasks)

(array(['Day_0002', 'Day_0003', 'Day_0004', 'Day_0005', 'Day_0006',
        'Day_0007'], dtype='<U8'),
 array(['Day_0008', 'Day_0009', 'Day_0010', 'Day_0011', 'Day_0012',
        'Day_0013'], dtype='<U8'),
 array(['Day_0014', 'Day_0015', 'Day_0016', 'Day_0017', 'Day_0018',
        'Day_0019'], dtype='<U8'),
 array(['Day_0020', 'Day_0021', 'Day_0022', 'Day_0023', 'Day_0024',
        'Day_0025'], dtype='<U8'),
 array(['Day_0026', 'Day_0027', 'Day_0028', 'Day_0029', 'Day_0030',
        'Day_0031'], dtype='<U8'),
 array(['Day_0032', 'Day_0033', 'Day_0034', 'Day_0035', 'Day_0036',
        'Day_0037'], dtype='<U8'),
 array(['Day_0038', 'Day_0039', 'Day_0040', 'Day_0041', 'Day_0042',
        'Day_0043'], dtype='<U8'),
 array(['Day_0044', 'Day_0045', 'Day_0046', 'Day_0047', 'Day_0048',
        'Day_0049'], dtype='<U8'),
 array(['Day_0050', 'Day_0051', 'Day_0052', 'Day_0053', 'Day_0054',
        'Day_0055'], dtype='<U8'),
 array(['Day_0056', 'Day_0057', 'Day_0058', 'Day_0059', 'Day_0060',
     

In [197]:
# Read the model metrics from the simulation directory and write the same
# depth/latitude subset as the processed files next to them as metrics.nc.
metrics = pump.model.read_metrics(
    "/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/"
)
# Shift longitude by a constant offset.
# NOTE(review): presumably converts model-relative longitude to degrees east
# so it matches the processed files -- confirm.
metrics["longitude"] = metrics.longitude - 169.025
# NOTE(review): depth_left is cut by position (first 270 entries), presumably
# to match the depth=slice(-305) label selection -- confirm.
(
    metrics
    .sel(depth=slice(-305), latitude=slice(-10, 10))
    .isel(depth_left=slice(270))
    .to_netcdf(f"{outdir}/metrics.nc")
)

## TIW hourly

In [198]:
# Input and output directories for the hourly TIW files.
# NOTE: this rebinds the same global `dirname` / `outdir` names used by the
# full-domain section, so any function reading them sees these values from
# here on.
dirname = "/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/"
outdir = "/glade/campaign/cgd/oce/people/dcherian/Cherian-et-al-2021-TIW/tiw_hourly/"

In [123]:
# Sorted list of the files in the input directory whose name contains "deepak".
files = sorted(glob.glob(f"{dirname}/*deepak*.nc*"))
# Preview the first few paths.
files[:10]

['/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0001_deepak_KPP2D.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0001_deepak_KPP3D.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0001_deepak_buoy.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0001_deepak_diss.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0001_deepak_etan.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0002_deepak_KPP2D.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0002_deepak_KPP3D.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0002_deepak_buoy.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0002_deepak_diss.nc',
 '/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/tiw/File_0002_deepak_etan.nc']

In [124]:
# Take every fifth path (the listing above shows five files per prefix), keep
# the first 9 characters of its file name (e.g. "File_0001"), and de-duplicate.
patterns = np.unique([path.rsplit("/", 1)[-1][:9] for path in files[::5]])
patterns[:10]

array(['File_0001', 'File_0002', 'File_0003', 'File_0004', 'File_0005',
       'File_0006', 'File_0007', 'File_0008', 'File_0009', 'File_0010'],
      dtype='<U9')

In [143]:
def process(pattern):
    """Merge the hourly files for one pattern and write a three-column subset.

    Opens the five ``{pattern}_deepak_{suffix}.nc`` files (KPP2D, KPP3D, buoy,
    diss, etan) in the global ``dirname``, merges them, and subsets with
    ``depth=slice(-305)``. Only the longitude nearest -110 and its two
    neighbours are kept. Compression settings and the attributes from
    ``variable_attrs`` / ``dataset_attrs`` are applied, and the result is
    written to ``{outdir}/{pattern}.nc``.

    Note that this definition replaces the earlier ``process`` defined in the
    full-domain section of the notebook.

    Parameters
    ----------
    pattern : str
        File-name prefix, e.g. ``"File_0001"``.

    Returns
    -------
    str
        The pattern that was processed.
    """
    # Track every opened file so it can be closed once the output is written.
    opened = []
    try:
        for suffix in ["KPP2D", "KPP3D", "buoy", "diss", "etan"]:
            opened.append(xr.open_dataset(f"{dirname}/{pattern}_deepak_{suffix}.nc"))

        dset = xr.merge(opened).sel(depth=slice(-305))

        # Position of the longitude closest to -110.
        # ``Index.get_loc(..., method="nearest")`` was deprecated in pandas 1.4
        # and removed in 2.0; ``get_indexer`` is the documented replacement.
        idx = int(dset.indexes["longitude"].get_indexer([-110], method="nearest")[0])
        dset = dset.isel(longitude=[idx - 1, idx, idx + 1])

        cmpr = dict(zlib=True, complevel=4, shuffle=True)

        for var in dset.variables:
            encoding = dset[var].encoding
            encoding.update(cmpr)
            # "contiguous" is dropped now that compression is enabled; ``pop``
            # tolerates variables that never had the key.
            encoding.pop("contiguous", None)
            if var in variable_attrs:
                dset[var].attrs.update(variable_attrs[var])

        # Remove the inherited long_name if there is one.
        dset["time"].attrs.pop("long_name", None)

        # Build a fresh dict so the shared ``dataset_attrs`` is never modified.
        dset.attrs = {
            **dataset_attrs,
            "title": "Hourly averaged fields from tropical Pacific cold tongue regional model simulation",
        }

        dset.load().to_netcdf(f"{outdir}/{pattern}.nc")
    finally:
        for member in opened:
            member.close()

    return pattern


# process(patterns[0])

In [144]:
# Build one lazy `process` task per hourly pattern.
tasks = [dask.delayed(process)(pattern) for pattern in patterns]

In [148]:
# Execute the delayed tasks on the distributed client and keep their return
# values (the processed patterns).
computed = dask.compute(*tasks, scheduler=client)

In [199]:
# Read the model metrics and write the same subset as the hourly files (three
# longitudes around -110, depth=slice(-305), latitudes -10..10) as metrics.nc.
metrics = pump.model.read_metrics(
    "/glade/campaign/cgd/oce/people/dcherian/TPOS_MITgcm_1_hb/"
)
# Shift longitude by a constant offset.
# NOTE(review): presumably converts model-relative longitude to degrees east
# so it matches the processed files -- confirm.
metrics["longitude"] = metrics.longitude - 169.025
# Position of the longitude closest to -110.
# ``Index.get_loc(..., method="nearest")`` was deprecated in pandas 1.4 and
# removed in 2.0; ``get_indexer`` is the documented replacement.
idx = int(metrics.indexes["longitude"].get_indexer([-110], method="nearest")[0])
(
    metrics.isel(longitude=[idx - 1, idx, idx + 1])
    .sel(depth=slice(-305), latitude=slice(-10, 10))
    .isel(depth_left=slice(270))
    .to_netcdf(f"{outdir}/metrics.nc")
)

In [156]:
# Re-open every netCDF file in the output directory as one dataset to check
# the result. NOTE: the glob also matches the metrics.nc written to the same
# directory.
tiw = xr.open_mfdataset(f"{outdir}/*.nc", parallel=True)
tiw