# CESM-LE analysis
 - trouble-shooting memory management
 - reading in a relatively large dataset (~5 GB according to xarray)
 - calculations with 36 dask workers from casper tend to spill to disk
 - any suggestions?
 - daniel kennedy, djk2120@ucar.edu

In [13]:
import numpy as np
import xarray as xr
import glob
import matplotlib
import matplotlib.pyplot as plt
import cftime
import dask
import os
from scipy import stats
%matplotlib inline

In [2]:
# Record the xarray version used for this analysis.
xr.__version__

'0.16.2'

In [3]:
# Record the dask version used for this analysis.
dask.__version__

'2020.12.0'

In [4]:
# For use on Casper: ONLY RUN THIS CELL ONCE.
#   Note: you may need to change the project number.
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

ncores = 36
# Memory request scales with the core count: 375 GB for 36 cores.
nmem = f'{int(375 * ncores / 36)}GB'

# cores == processes, so each job is split into `ncores` worker processes.
cluster = SLURMCluster(
    cores=ncores,
    processes=ncores,
    memory=nmem,
    project='P93300641',
    walltime='3:00:00',
)
cluster.scale(ncores)
client = Client(cluster)

In [5]:
# Note: the requested workers can get stuck in the queue.
#   Rerun this cell until the summary below reports Workers > 0.
client

0,1
Client  Scheduler: tcp://10.12.205.200:37174  Dashboard: http://10.12.205.200:8787/status,Cluster  Workers: 36  Cores: 36  Memory: 375.12 GB


In [7]:
##client.close()

### Find the files on /glade
 - note that for CESM2 the data are split by ensemble member and across time
     - the 35 years are split into 10/10/10/5
 - as such my open_mfdataset call is on a nested list of files

In [60]:
# Directory holding the per-member, per-time-slice NBP files.
thedir = '/glade/scratch/djk2120/mem_debug/'
# Nested list of paths: the outer level is the 50 ensemble members, the
# inner level is the 4 time slices of each member. The paths are built
# from `thedir` (the original referenced an undefined name, `newdir`).
files = [
    [f'{thedir}nbp_ens{ee:02d}_yy{yy}.nc' for yy in range(4)]
    for ee in range(50)
]

### Reading the data in:

In [76]:
# Coordinate for the new ensemble axis: one integer label per entry of `files`.
ensdim = xr.DataArray(
    np.arange(len(files)),
    dims='ens',
    name='ens',
    attrs={'long_name': 'ensemble number'},
)
# The outer level of `files` is concatenated along `ens`, the inner level
# along `time`.
ds = xr.open_mfdataset(
    files,
    combine='nested',
    concat_dim=[ensdim, 'time'],
    parallel=True,
)

In [78]:
# Inspect the lazily loaded NBP variable (size, shape, chunking, task count).
ds.NBP

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,26.54 MB
Shape,"(50, 420, 192, 288)","(1, 120, 192, 288)"
Count,1000 Tasks,200 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.64 GB 26.54 MB Shape (50, 420, 192, 288) (1, 120, 192, 288) Count 1000 Tasks 200 Chunks Type float32 numpy.ndarray",50  1  288  192  420,

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,26.54 MB
Shape,"(50, 420, 192, 288)","(1, 120, 192, 288)"
Count,1000 Tasks,200 Chunks
Type,float32,numpy.ndarray


In [79]:
# The default chunking gives 200 chunks; reduce over the first three axes
# and pull the result into memory.
x = ds.NBP.mean(skipna=True, axis=[0, 1, 2]).values

In [80]:
# List any core-* files in the working directory (long format, human-readable
# sizes, oldest first). The pipe is opened in a `with` block so that it is
# closed as soon as its output has been read.
with os.popen('ls -lthr core-*') as stream:
    output = stream.read()
print(output)

-rw------- 1 djk2120 ncar 474M Jan 21 09:00 core-casper11-24189-252757-11
-rw------- 1 djk2120 ncar 495M Jan 21 09:00 core-casper11-24189-252728-6
-rw------- 1 djk2120 ncar 530M Jan 21 09:00 core-casper11-24189-252791-11
-rw------- 1 djk2120 ncar 588M Jan 21 09:00 core-casper11-24189-246729-11



In [81]:
# Count the core-* files produced so far; later cells subtract this value
# to report only the files created by their own computation.
# The pipe is opened in a `with` block so that it is closed after reading.
with os.popen('ls core-* | wc -l') as stream:
    nfiles0 = int(stream.read())
print(f'{nfiles0} new files')

4 new files


### try with larger chunks
 - open_mfdataset won't give me the chunks I wanted with my nested file structure
 - time dimension is split across files, and I want all time in single chunk

In [82]:
# Open each ensemble member on its own so that its time axis, which is split
# across several files, can be rechunked into a single chunk; then stack the
# members along the ensemble dimension.
# Iterating over `files` (rather than a hard-coded range(50)) keeps the
# number of members consistent with `ensdim`, which is sized from len(files).
dsets = []
for member_files in files:
    ds = xr.open_mfdataset(
        member_files, combine='by_coords', parallel=True
    )
    # chunk size -1 puts the whole time axis in one chunk
    dsets.append(ds.NBP.chunk({'time': -1}))
nbp = xr.concat(dsets, dim=ensdim)

In [83]:
# Inspect the rechunked array: one chunk per ensemble member, all time steps.
nbp

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,92.90 MB
Shape,"(50, 420, 192, 288)","(1, 420, 192, 288)"
Count,750 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.64 GB 92.90 MB Shape (50, 420, 192, 288) (1, 420, 192, 288) Count 750 Tasks 50 Chunks Type float32 numpy.ndarray",50  1  288  192  420,

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,92.90 MB
Shape,"(50, 420, 192, 288)","(1, 420, 192, 288)"
Count,750 Tasks,50 Chunks
Type,float32,numpy.ndarray


In [84]:
# Same reduction on the 50-chunk array (one chunk per ensemble member).
x = nbp.mean(skipna=True, axis=[0, 1, 2]).values

In [85]:
# Files created by the 50-chunk computation: current total minus the
# baseline count in `nfiles0`.
# The pipe is opened in a `with` block so that it is closed after reading.
with os.popen('ls core-* | wc -l') as stream:
    nfiles1 = int(stream.read()) - nfiles0
print(f'{nfiles1} new files')

24 new files


### smaller chunks?

In [86]:
# Re-open the nested file list, this time requesting 60-step chunks in time.
ds = xr.open_mfdataset(
    files,
    combine='nested',
    concat_dim=[ensdim, 'time'],
    parallel=True,
    chunks={'time': 60},
)

In [87]:
# Inspect the chunking that results from requesting 60-step time chunks.
ds.NBP

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,13.27 MB
Shape,"(50, 420, 192, 288)","(1, 60, 192, 288)"
Count,1600 Tasks,350 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.64 GB 13.27 MB Shape (50, 420, 192, 288) (1, 60, 192, 288) Count 1600 Tasks 350 Chunks Type float32 numpy.ndarray",50  1  288  192  420,

Unnamed: 0,Array,Chunk
Bytes,4.64 GB,13.27 MB
Shape,"(50, 420, 192, 288)","(1, 60, 192, 288)"
Count,1600 Tasks,350 Chunks
Type,float32,numpy.ndarray


In [88]:
# Same reduction on the 350-chunk array (60 time steps per chunk).
x = ds.NBP.mean(skipna=True, axis=[0, 1, 2]).values

In [89]:
# Files created by the 350-chunk computation: current total minus the
# counts already attributed to the earlier runs (`nfiles0`, `nfiles1`).
# The pipe is opened in a `with` block so that it is closed after reading.
with os.popen('ls core-* | wc -l') as stream:
    nfiles2 = int(stream.read()) - nfiles0 - nfiles1
print(f'{nfiles2} new files')

2 new files


### How many ensemble members can be processed before new core-* files appear?

In [96]:
# 1 ensemble member
# Count core-* files before and after the reduction so that only files
# created by this computation are reported as new; leftovers from earlier
# cells would otherwise be included in the total.
with os.popen('ls core-* | wc -l') as stream:
    nbefore = int(stream.read())
# NOTE(review): sel(ens=0) drops the ens axis, so axes 0-2 here are not the
# same axes that are reduced in the multi-member cells - confirm this is
# acceptable for the comparison.
x = ds.NBP.sel(ens=0).mean(skipna=True, axis=[0, 1, 2]).values
with os.popen('ls core-* | wc -l') as stream:
    nfiles = int(stream.read()) - nbefore
print(f'{nfiles} new files')

0 new files


In [98]:
# 2 ensemble members
# Count core-* files before and after the reduction so that only files
# created by this computation are reported as new; leftovers from earlier
# cells would otherwise be included in the total.
with os.popen('ls core-* | wc -l') as stream:
    nbefore = int(stream.read())
ix = ds.ens < 2  # boolean mask selecting the first two members
x = ds.NBP.isel(ens=ix).mean(skipna=True, axis=[0, 1, 2]).values
with os.popen('ls core-* | wc -l') as stream:
    nfiles = int(stream.read()) - nbefore
print(f'{nfiles} new files')

0 new files


In [99]:
# 5 ensemble members
# Count core-* files before and after the reduction so that only files
# created by this computation are reported as new; leftovers from earlier
# cells would otherwise be included in the total.
with os.popen('ls core-* | wc -l') as stream:
    nbefore = int(stream.read())
ix = ds.ens < 5  # boolean mask selecting the first five members
x = ds.NBP.isel(ens=ix).mean(skipna=True, axis=[0, 1, 2]).values
with os.popen('ls core-* | wc -l') as stream:
    nfiles = int(stream.read()) - nbefore
print(f'{nfiles} new files')

0 new files


In [101]:
# 10 ensemble members
# Count core-* files before and after the reduction so that only files
# created by this computation are reported as new; leftovers from earlier
# cells would otherwise be included in the total.
with os.popen('ls core-* | wc -l') as stream:
    nbefore = int(stream.read())
ix = ds.ens < 10  # boolean mask selecting the first ten members
x = ds.NBP.isel(ens=ix).mean(skipna=True, axis=[0, 1, 2]).values
with os.popen('ls core-* | wc -l') as stream:
    nfiles = int(stream.read()) - nbefore
print(f'{nfiles} new files')

1 new files
