# Failed `zarr` conversion on KIT server

This notebook documents our failed attempts to convert the TRACMIP datasets to `zarr` on our local machine. Most likely because of the limited resources on this machine (which is intended only for uploading and downloading data), every attempt to run this script ended in `MemoryError`s, dead workers, or both.

In [None]:
import shutil
import subprocess
from glob import glob

import xarray as xr

In [None]:
# Time-frequency table names.
times = [
    "Aday",
    "Amon",
    "A3hr",
]

# Model names.
models = [
    "GISS-ModelE2",
    "MetUM-CTL",
    "CAM5Nor",
    "CAM3",
    "CNRM-AM5",
    "AM21",
    "ECHAM61",
    "MetUM-ENT",
    "MPAS",
    "LMDZ5A",
    "ECHAM63",
    "CALTECH",
    "MIROC5",
    "CAM4",
]

# Experiment names.
experiments = [
    "aquaControl",
    "aqua4xCO2",
    "aquaAbs20",
    "aquaAbs07",
    "land4xCO2",
    "landAbs20",
    "landAbs15",
    "landOrbit",
    "aquaAbs15",
    "landControl",
    "landAbs07",
]

In [None]:
# Start a dask.distributed client for the chunked (dask-backed) work done when
# the datasets are converted below.
from dask.distributed import Client

# Called with no arguments, so the client uses dask's default settings.
client = Client()
# Bare expression: as the last statement of a notebook cell it displays the
# client's summary as the cell output.
client

In [None]:
# Convert every (time, experiment, model, version) combination that is not yet
# listed in uploaded_models.txt to a zarr store and copy it to the bucket.
# Requires `shutil` and `subprocess` in addition to `xr` and `glob`.

# Names already handled, one "<time>_<exp>_<mod>_<version>" entry per line.
with open("uploaded_models.txt", "r") as f:
    uploaded = {line.rstrip("\n") for line in f}

for time in times:
    for mod in models:
        for exp in experiments:

            # The path component for the table is its name without leading "A"s.
            paths = glob("/lsdf/kit/imk-tro/projects/MOD/Gruppe_Voigt/TRACMIP_ESGFCOPY/*/%s/%s*/%s/*/*/*/*/*/*"
                         % (mod, exp, time.lstrip("A")))

            # Group files by the exact second-to-last path component (the
            # version). A substring test against the whole path could pick up
            # files whose path merely contains the version string elsewhere.
            by_version = {}
            for path in paths:
                by_version.setdefault(path.split("/")[-2], []).append(path)

            # Sorted so the processing order is reproducible between runs.
            for version, ver_paths in sorted(by_version.items()):

                # check that models not yet uploaded
                name = "%s_%s_%s_%s" % (time, exp, mod, version)
                if name in uploaded:
                    continue

                print("attempting to upload %s" % name)

                # Seed the merged dataset with the first file; chunks={} asks
                # xarray to open it backed by dask arrays.
                merged = xr.open_dataset(ver_paths[0], chunks={})

                for path in ver_paths:

                    print("    opening %s dataset" % path.split("/")[-1].split("_")[0])

                    ds = xr.open_dataset(path)
                    # First data variable that is not a bounds variable; raises
                    # IndexError if the file holds only "bnds" variables.
                    var = [i for i in ds.data_vars if "bnds" not in i][0]
                    # flatten height coordinate: keep its value and units as an
                    # attribute of the variable instead of as a coordinate
                    if "height" in ds.coords:
                        ds[var].attrs["height"] = "%s %s" % (ds.height.values, ds.height.units)
                    merged[var] = ds[var].chunk({"time": "auto"})

                # remember to drop height from overall dataset!
                if "height" in merged.coords:
                    merged = merged.drop("height")

                destination = "gs://pangeo-data/tracmip/%s/%s/%s/%s" % (time, exp, mod, version)
                try:
                    print("    converting to zarr")
                    merged.to_zarr("temp", mode="w", consolidated=True)

                    # rsync copies the whole directory tree, including the
                    # root-level dot-files (.zgroup, .zattrs, .zmetadata) that a
                    # "temp/*" wildcard does not match. check=True raises
                    # CalledProcessError on failure so a failed upload is never
                    # recorded as done.
                    subprocess.run(
                        ["gsutil", "-m", "rsync", "-r", "temp", destination],
                        check=True,
                    )
                finally:
                    # Always remove the local store, even when conversion or
                    # upload failed part-way.
                    shutil.rmtree("temp", ignore_errors=True)

                # mark as uploaded
                with open("uploaded_models.txt", "a") as f:
                    f.write("%s\n" % name)
                uploaded.add(name)
                        