# ML Dataset stats

I want to run some sanity checks on the ML dataset I generated.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dask.distributed
import logging
import pathlib
import xarray as xr

from crims2s.dask import create_dask_cluster

In [3]:
# Directory containing the ML-ready example files that the cells below list and open.
# NOTE(review): hard-coded location; adjust it when running against another dataset build.
ML_DATASET_DIR = '***BASEDIR***/mlready/2021-07-25-test/'

## Boot dask cluster

In [4]:
# Build the dask cluster with the project helper, requesting cores=6.
cluster = create_dask_cluster(cores=6)

In [5]:
# Display the cluster widget.
cluster

VBox(children=(HTML(value='<h2>SLURMCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [6]:
# Ask the cluster to scale to 2 jobs.
cluster.scale(jobs=2)

In [7]:
# Connect a dask.distributed client to the cluster created above.
client = dask.distributed.Client(cluster)

In [8]:
# Display the client summary (scheduler address, dashboard link, worker count).
client

0,1
Client  Scheduler: tcp://10.20.40.95:45141  Dashboard: http://10.20.40.95:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


## Load dataset

In [9]:
# Collect, in sorted order, every entry of the dataset directory whose file
# name contains '0102.nc'.
dataset_files = sorted(
    filter(
        lambda entry: '0102.nc' in entry.name,
        pathlib.Path(ML_DATASET_DIR).iterdir(),
    )
)

In [10]:
# Peek at the first ten matching files.
dataset_files[:10]

[PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20000102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20010102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20020102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20030102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20040102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20050102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20060102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20070102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20080102.nc'),
 PosixPath('***BASEDIR***/mlready/2021-07-25-test/train_example_20090102.nc')]

In [11]:
def preprocess_one_example(dataset):
    """Give a single example a 'forecast_time' dimension.

    Used as the ``preprocess`` hook of ``xr.open_mfdataset`` so that each
    file can be concatenated with the others along 'forecast_time'.

    Parameters
    ----------
    dataset
        Object exposing ``expand_dims`` (an xarray dataset when called by
        ``open_mfdataset``).

    Returns
    -------
    Whatever ``dataset.expand_dims('forecast_time')`` returns.
    """
    with_forecast_axis = dataset.expand_dims('forecast_time')
    return with_forecast_axis

In [12]:
# Open the '/x' group of the first three files and stack them along
# 'forecast_time'. `combine='nested'` is what makes `concat_dim` take effect:
# recent xarray releases reject `concat_dim` when it is combined with the
# default `combine='by_coords'`.
features = xr.open_mfdataset(
    dataset_files[:3],
    group='/x',
    concat_dim='forecast_time',
    combine='nested',
    preprocess=preprocess_one_example,
)

In [13]:
# Open the '/model' group of the first three files and stack them along
# 'forecast_time'. `combine='nested'` is what makes `concat_dim` take effect:
# recent xarray releases reject `concat_dim` when it is combined with the
# default `combine='by_coords'`.
model = xr.open_mfdataset(
    dataset_files[:3],
    group='/model',
    concat_dim='forecast_time',
    combine='nested',
    preprocess=preprocess_one_example,
)

In [14]:
# Open the '/obs' group of the first three files and stack them along
# 'forecast_time'. `combine='nested'` is what makes `concat_dim` take effect:
# recent xarray releases reject `concat_dim` when it is combined with the
# default `combine='by_coords'`.
obs = xr.open_mfdataset(
    dataset_files[:3],
    group='/obs',
    concat_dim='forecast_time',
    combine='nested',
    preprocess=preprocess_one_example,
)

In [15]:
# Display the '/model' group (dask-backed, one chunk per file).
model

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 1.08 kiB 368 B Shape (3, 46) (1, 46) Count 8 Tasks 3 Chunks Type datetime64[ns] numpy.ndarray",46  3,

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,168.16 MiB,56.05 MiB
Shape,"(3, 46, 121, 240, 11)","(1, 46, 121, 240, 11)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 168.16 MiB 56.05 MiB Shape (3, 46, 121, 240, 11) (1, 46, 121, 240, 11) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",46  3  11  240  121,

Unnamed: 0,Array,Chunk
Bytes,168.16 MiB,56.05 MiB
Shape,"(3, 46, 121, 240, 11)","(1, 46, 121, 240, 11)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,168.16 MiB,56.05 MiB
Shape,"(3, 46, 121, 240, 11)","(1, 46, 121, 240, 11)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 168.16 MiB 56.05 MiB Shape (3, 46, 121, 240, 11) (1, 46, 121, 240, 11) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",46  3  11  240  121,

Unnamed: 0,Array,Chunk
Bytes,168.16 MiB,56.05 MiB
Shape,"(3, 46, 121, 240, 11)","(1, 46, 121, 240, 11)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray


In [21]:
# Count the missing values in `obs`, summed over every dimension;
# `.compute()` triggers the actual dask evaluation.
obs.isnull().sum().compute()

In [17]:
# Display the '/x' group (dask-backed, one chunk per file).
features

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 1.08 kiB 368 B Shape (3, 46) (1, 46) Count 8 Tasks 3 Chunks Type datetime64[ns] numpy.ndarray",46  3,

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.13 GiB,728.70 MiB
Shape,"(3, 46, 121, 240, 11, 13)","(1, 46, 121, 240, 11, 13)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.13 GiB 728.70 MiB Shape (3, 46, 121, 240, 11, 13) (1, 46, 121, 240, 11, 13) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",121  46  3  13  11  240,

Unnamed: 0,Array,Chunk
Bytes,2.13 GiB,728.70 MiB
Shape,"(3, 46, 121, 240, 11, 13)","(1, 46, 121, 240, 11, 13)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray


In [18]:
# Display the '/obs' group (dask-backed, one chunk per file).
obs

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 1.08 kiB 368 B Shape (3, 46) (1, 46) Count 8 Tasks 3 Chunks Type datetime64[ns] numpy.ndarray",46  3,

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 1.08 kiB 368 B Shape (3, 46) (1, 46) Count 8 Tasks 3 Chunks Type datetime64[ns] numpy.ndarray",46  3,

Unnamed: 0,Array,Chunk
Bytes,1.08 kiB,368 B
Shape,"(3, 46)","(1, 46)"
Count,8 Tasks,3 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.29 MiB,5.10 MiB
Shape,"(3, 46, 121, 240)","(1, 46, 121, 240)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.29 MiB 5.10 MiB Shape (3, 46, 121, 240) (1, 46, 121, 240) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",3  1  240  121  46,

Unnamed: 0,Array,Chunk
Bytes,15.29 MiB,5.10 MiB
Shape,"(3, 46, 121, 240)","(1, 46, 121, 240)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.29 MiB,5.10 MiB
Shape,"(3, 46, 121, 240)","(1, 46, 121, 240)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.29 MiB 5.10 MiB Shape (3, 46, 121, 240) (1, 46, 121, 240) Count 12 Tasks 3 Chunks Type float32 numpy.ndarray",3  1  240  121  46,

Unnamed: 0,Array,Chunk
Bytes,15.29 MiB,5.10 MiB
Shape,"(3, 46, 121, 240)","(1, 46, 121, 240)"
Count,12 Tasks,3 Chunks
Type,float32,numpy.ndarray


In [None]:
# Number of lead times in `obs`.
len(obs.lead_time)

In [None]:
# 46 matches the lead_time length seen in the array shapes above;
# presumably dividing by 7 converts daily lead times to weeks — TODO confirm.
46 / 7

In [None]:
# NOTE(review): `dataset` was not assigned in any earlier cell, so this cell
# raised NameError on a fresh kernel. It is rebuilt here as the '/x' group of
# every selected file — presumably what the original session held, since the
# cells below read `.x` and index `variable` / `realization`; confirm.
dataset = xr.open_mfdataset(
    dataset_files,
    group='/x',
    concat_dim='forecast_time',
    combine='nested',
    preprocess=preprocess_one_example,
)
dataset

In [None]:
# Keep at most the first 20 positions along forecast_time.
sample = dataset.isel(forecast_time=slice(20))

In [None]:
# Display the sample.
sample

In [None]:
# Persist the sample so the cells below reuse the loaded data instead of
# re-reading it for each computation.
sample = sample.persist()

In [None]:
# Count missing values, collapsing the latitude, longitude and realization dims.
some_computed = sample.isnull().sum(dim=['latitude', 'longitude', 'realization'])

In [None]:
# Sum the missing counts over variable and forecast_time, then plot the `x`
# variable of the result.
some_computed.sum(dim=['variable', 'forecast_time']).compute().x.plot()

In [None]:
# Missing counts for the first two lead times, summed over variable.
some_computed.isel(lead_time=[0, 1]).sum(dim='variable').compute()

In [None]:
# For the 'sst' variable, skip the first lead time, sum the values (not the
# null counts) over forecast_time, lead_time and realization, and plot `x`.
sample.sel(variable='sst').isel(lead_time=slice(1, None)).sum(dim=['forecast_time', 'lead_time', 'realization']).compute().x.plot()

## Check y

In [None]:
# Open the '/y' group of every selected file and stack them along
# 'forecast_time'. `combine='nested'` is what makes `concat_dim` take effect:
# recent xarray releases reject `concat_dim` when it is combined with the
# default `combine='by_coords'`.
target = xr.open_mfdataset(
    dataset_files,
    group='/y',
    concat_dim='forecast_time',
    combine='nested',
    preprocess=preprocess_one_example,
)

In [None]:
# Load the whole target dataset into memory.
target = target.compute()

In [None]:
# Count missing target values over category, forecast_time, lead_time and
# variable, then plot the `y` variable of the result.
target.isnull().sum(dim=['category', 'forecast_time', 'lead_time', 'variable']).y.plot()