In [1]:
import os
import shutil
import glob

import netCDF4

import xarray as xr

%load_ext memory_profiler

In [2]:
# Run from the project directory: the sample file and its copies are referenced by relative paths below.
%cd /g/data/tm70/ds0092/projects/um_output_memory

/g/data/tm70/ds0092/projects/um_output_memory


In [3]:
# Sample netCDF file that is opened directly and duplicated for the experiments below.
file = "./cj877a.pm000101_mon.1x1.nc4" # File is 1.5 MB
# Name of the single variable pulled out of the file(s) in every experiment.
variable = "fld_s03i807"

# Opening a single 1.5 MB file uses ~20 MB of memory

In [4]:
%%time
%%memit

f = netCDF4.Dataset(file)
var = f[variable]
f.close()

peak memory: 142.83 MiB, increment: 19.13 MiB
CPU times: user 154 ms, sys: 37.5 ms, total: 192 ms
Wall time: 303 ms


# Memory grows when opening/concatenating many files with xarray

E.g. here we duplicate the file 500 times (750 MB of data in total)

In [4]:
# Number of duplicates of the sample file to create.
N_COPIES = 500
# The copies are written next to the sample file.
copy_dir = os.path.dirname(file)

copies = []
for idx in range(N_COPIES):
    copy = os.path.join(copy_dir, f"copy.{idx:03d}.nc4")
    shutil.copyfile(file, copy)
    copies.append(copy)

# glob returns matches in arbitrary order, so sort them to make `files` (and
# any slice taken from it, e.g. `files[:100]`) identical on every run. The
# pattern is built from the same directory the copies were written to.
files = sorted(glob.glob(os.path.join(copy_dir, "copy.*.nc4")))

### Open 100 files

In [5]:
%%time
%%memit

ds = xr.open_mfdataset(
    files[:100], 
    engine="netcdf4",
    combine="nested",
    concat_dim="time",
    decode_cf=False,
    decode_coords=False,
    decode_times=False,
    preprocess=lambda ds: ds[variable]
)

peak memory: 2358.50 MiB, increment: 2231.26 MiB
CPU times: user 27.7 s, sys: 3.02 s, total: 30.7 s
Wall time: 36.9 s


### Open all 500 files

In [5]:
%%time
%%memit

ds = xr.open_mfdataset(
    files, 
    engine="netcdf4",
    combine="nested",
    concat_dim="time",
    decode_cf=False,
    decode_coords=False,
    decode_times=False,
    preprocess=lambda ds: ds[variable]
)

peak memory: 3793.55 MiB, increment: 3666.23 MiB
CPU times: user 2min 4s, sys: 4.74 s, total: 2min 8s
Wall time: 2min 11s


### Opening with `h5netcdf` uses less memory but takes a ridiculously long time

In [5]:
%%time
%%memit

ds = xr.open_mfdataset(
    files[:100], 
    engine="h5netcdf",
    combine="nested",
    concat_dim="time",
    decode_cf=False,
    decode_coords=False,
    decode_times=False,
    preprocess=lambda ds: ds[variable]
)

peak memory: 872.97 MiB, increment: 745.65 MiB
CPU times: user 37min 26s, sys: 9.93 s, total: 37min 36s
Wall time: 38min 33s


### Extract variable array, dimensions and attrs explicitly from netCDF4 Dataset

Explicitly closing each file after extracting what we want obviously reduces memory use. However, we usually want to operate lazily with xarray, which presumably means the files remain open.

In [5]:
%%time
%%memit

def _open_variable(file, variable):
    """Pull out variable/dimensions/attributes and pack into xarray DataArray"""
    f = netCDF4.Dataset(file)
    data = f[variable][:].data
    coords = {}
    for dim in f[variable].dimensions:
        coords[dim] = f[dim][:].data
    attrs = {}
    for attr in f[variable].ncattrs():
        attrs[attr] = f[variable].getncattr(attr)
    f.close()
    return xr.DataArray(data, coords, attrs=attrs)

ds = []
for file in files:
    ds.append(_open_variable(file, variable))
    
ds = xr.concat(ds, dim="time")

peak memory: 240.80 MiB, increment: 113.45 MiB
CPU times: user 44.4 s, sys: 1.04 s, total: 45.4 s
Wall time: 46.2 s


# What about netCDF4 MFDataset?

In [5]:
%%time
%%memit

f = netCDF4.MFDataset(files)
var = f[variable]
f.close()

peak memory: 9067.20 MiB, increment: 8939.89 MiB
CPU times: user 53.3 s, sys: 4.85 s, total: 58.2 s
Wall time: 58.8 s


In [6]:
# Delete the temporary copies created for the experiments.
for copy in copies:
    try:
        os.remove(copy)
    except FileNotFoundError:
        # Already removed (e.g. the cell was run twice); nothing left to do.
        pass

# What have I tried?

 - convert to NETCDF3 - this fixes the issue. However, both NETCDF4 and NETCDF4_CLASSIC show the same behaviour
 - these files have `filling off`. I've recreated the data with `filling on` - no effect
 - played with chunking of variables in NETCDF4 files - no effect
 
# Things to note

 - the memory footprint is essentially the same for the reduced-size files here as for the full-size files. The reduced-size files include only one spatial grid point, whereas the full-size files include 27,648. That is, it looks as though the metadata, rather than the data, is responsible for the large memory footprint.
 - This file contains 250 variables. I've never worked with NetCDF files containing this many variables - perhaps the problem could be related to this??