In [1]:
import dask
import xarray as xr
from dask.array import image
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [None]:
def add_coord(dataset, value, name):
    """Attach ``value`` to ``dataset`` as a coordinate named ``name`` along ``chip_id``.

    Parameters
    ----------
    dataset
        xarray object exposing a ``chip_id`` coordinate.
    value
        Data for the new coordinate, laid out along the ``chip_id`` dimension.
    name
        Name given to the new coordinate.

    Returns
    -------
    The object returned by ``dataset.assign_coords`` with the extra coordinate.
    """
    per_chip = xr.DataArray(
        data=value,
        coords=[dataset.chip_id],
        dims=['chip_id'],
        name=name,
    )
    new_coords = {name: per_chip}
    return dataset.assign_coords(new_coords)


def get_dataset_pc(n=None, data_dir="/driven-data/cloud-cover"):
    """Build a dask-backed xarray Dataset of training chips read from TIFF files.

    For each row of ``train_metadata.csv`` every ``*.tif`` under
    ``train_features/<chip_id>/`` is read, followed by
    ``train_labels/<chip_id>.tif``; the two reads are concatenated along the
    first axis and wrapped in a DataArray tagged with the row's ``chip_id``,
    ``location`` and ``datetime``. The per-chip arrays are then concatenated
    along ``chip_id``.

    Note: a later cell in this notebook defines another function with the
    same name, which shadows this one once that cell has been run.

    Parameters
    ----------
    n : int or None
        Number of metadata rows to use (``.iloc[:n]``); ``None`` keeps all rows.
    data_dir : str
        Directory containing ``train_metadata.csv``, ``train_features/`` and
        ``train_labels/``. Defaults to the location previously hard-coded here.

    Returns
    -------
    xarray.Dataset
        Dataset holding a single variable named ``images``.
    """
    df = pd.read_csv(f"{data_dir}/train_metadata.csv").iloc[:n]
    xy = np.arange(512)
    band = ['B02', 'B03', 'B04', 'B08', 'cloud_mask']
    xs = []

    # total= gives tqdm a length so it can show a real progress bar and ETA;
    # a bare row generator has no len(). itertuples also avoids building a
    # Series per row.
    for row in tqdm(df.itertuples(index=False), total=len(df)):
        # NOTE(review): the band labels assume the glob yields the feature
        # files in B02, B03, B04, B08 order — confirm against imread's ordering.
        x = dask.array.concatenate([
            image.imread(f'{data_dir}/train_features/{row.chip_id}/*.tif'),
            image.imread(f'{data_dir}/train_labels/{row.chip_id}.tif')
        ])
        x = xr.DataArray(x, dims=['band', 'x', 'y'], coords=[band, xy, xy], name='images')
        # Turn the scalar chip_id coordinate into a length-1 dimension so the
        # per-chip arrays can be concatenated along it below.
        x = x.assign_coords({'chip_id': row.chip_id})
        x = x.expand_dims('chip_id')
        x = add_coord(x, row.location, 'location')
        x = add_coord(x, row.datetime, 'datetime')
        xs.append(x)

    ds = xr.concat(xs, dim='chip_id').to_dataset()
    return ds

In [None]:
%%time
ds = get_dataset_pc(100)

In [None]:
ds

In [None]:
# Linear extrapolation of the build time, in minutes: 14.2 s per 100 chips
# (presumably the %%time result of the 100-chip build above — output not kept)
# scaled to 12000 chips (presumably the full training set size — TODO confirm).
print(f"expected to take {14.2*12000/100 / 60} minutes for full dataset")

Could pickle the result though

In [None]:
# Try persisting the dataset object itself with pickle, so it could be
# reloaded later instead of being rebuilt from the TIFF files.
with open('test.pkl', 'wb') as f:
    pickle.dump(ds, f)

In [None]:
ls -lh test.pkl

In [None]:
# Read the pickle back into a separate name (ds2) to inspect the round trip.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('test.pkl', 'rb') as f:
    ds2 = pickle.load(f)

In [None]:
ds2

In [None]:
%%time
# Time per-chip access: compute the first 100 chips one at a time with dask's
# single-threaded scheduler. `x` is overwritten on every iteration, so only
# the elapsed time reported by %%time matters.
for n in range(100):
    x = ds.isel(chip_id=n).compute(scheduler='single-threaded')

This is 12 times slower than the zarr store on stream.

## Let's do it anyway

In [None]:
ds = get_dataset_pc()

In [None]:
# Persist the current `ds` (built by get_dataset_pc() with no row limit in the
# previous cell) so later cells can reload it from disk.
with open('xarray_train_from_tiff.pkl', 'wb') as f:
    pickle.dump(ds, f)

In [None]:
ls -lh xarray_train_from_tiff.pkl

In [None]:
# Merge all bands into a single chunk (band=-1 means "one chunk spanning the
# whole dimension").
ds = ds.chunk(dict(band=-1))

# Persist the rechunked dataset: a later cell loads
# 'xarray_train_from_tiff_rechunk.pkl' for timing and no other visible cell
# creates it, so without this write the notebook fails on a fresh kernel.
with open('xarray_train_from_tiff_rechunk.pkl', 'wb') as f:
    pickle.dump(ds, f)

In [None]:
ds

In [None]:
# Reload the dataset pickled to 'xarray_train_from_tiff.pkl' (written before
# the rechunk cell) so the timing loop below runs on the unpickled object.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('xarray_train_from_tiff.pkl', 'rb') as f:
    ds2 = pickle.load(f)

In [None]:
%%time
# Same per-chip timing loop, here against ds2 as loaded from
# 'xarray_train_from_tiff.pkl': first 100 chips, single-threaded scheduler,
# result discarded.
for n in range(100):
    x = ds2.isel(chip_id=n).compute(scheduler='single-threaded')

In [None]:
# Replace ds2 with the rechunked variant for the next timing run.
# NOTE(review): this expects 'xarray_train_from_tiff_rechunk.pkl' to already
# exist on disk — confirm it is produced before this cell runs.
with open('xarray_train_from_tiff_rechunk.pkl', 'rb') as f:
    ds2 = pickle.load(f)

In [None]:
%%time
# Same per-chip timing loop, here against ds2 as loaded from
# 'xarray_train_from_tiff_rechunk.pkl': first 100 chips, single-threaded
# scheduler, result discarded.
for n in range(100):
    x = ds2.isel(chip_id=n).compute(scheduler='single-threaded')

In [None]:
ds2 = xr.open_zarr('cloudmask/data/train_zarr_remade')

In [None]:
%%time
# Same per-chip timing loop, here against ds2 as opened from the zarr store
# 'cloudmask/data/train_zarr_remade': first 100 chips, single-threaded
# scheduler, result discarded.
for n in range(100):
    x = ds2.isel(chip_id=n).compute(scheduler='single-threaded')

## Try a different way

In [2]:
from PIL import Image
from dask import delayed

In [3]:
band = ['B02', 'B03', 'B04', 'B08']

In [4]:
def get_array(chip_id):
    """Load one chip's four band images and its label image into one array.

    Reads ``B02.tif``, ``B03.tif``, ``B04.tif`` and ``B08.tif`` from
    ``train_features/<chip_id>/`` and then ``train_labels/<chip_id>.tif``.

    Parameters
    ----------
    chip_id
        Identifier interpolated into the feature directory and label file names.

    Returns
    -------
    numpy.ndarray
        The five images stacked along a new first axis (bands in the order
        above, label last), cast to ``int16``.
    """
    band = ['B02', 'B03', 'B04', 'B08']
    paths = [f"/driven-data/cloud-cover/train_features/{chip_id}/{b}.tif" for b in band]
    paths.append(f"/driven-data/cloud-cover/train_labels/{chip_id}.tif")

    xs = []
    for path in paths:
        # The context manager releases the file handle as soon as the pixels
        # have been copied into a numpy array, rather than leaving the close
        # to garbage collection.
        with Image.open(path) as img:
            xs.append(np.array(img))
    # NOTE(review): int16 cannot represent values above 32767 — confirm the
    # source TIFFs stay within that range.
    return np.array(xs).astype(np.int16)

In [6]:
%timeit -n 10 -r 1 get_array('agpw')

127 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [8]:
# 127 ms per get_array call (the %timeit result above) times 400, in ms.
# NOTE(review): what the 400 stands for is not stated here — confirm.
127*400

50800

In [None]:
def add_coord(dataset, value, name):
    """Return ``dataset`` with ``value`` added as a ``chip_id``-aligned coordinate.

    Note: this repeats the ``add_coord`` definition from an earlier cell of
    this notebook.

    Parameters
    ----------
    dataset
        xarray object exposing a ``chip_id`` coordinate.
    value
        Data for the new coordinate, laid out along the ``chip_id`` dimension.
    name
        Name under which the coordinate is stored.

    Returns
    -------
    The object returned by ``dataset.assign_coords``.
    """
    chip_axis = dataset.chip_id
    coord = xr.DataArray(data=value, coords=[chip_axis], dims=['chip_id'], name=name)
    return dataset.assign_coords({name: coord})


def get_dataset_pc(n=None):
    """Build a lazy xarray Dataset in which each chip is one delayed ``get_array`` call.

    Every ``chip_id`` in ``train_metadata.csv`` becomes a dask array created
    from ``delayed(get_array)(chip_id)``; the per-chip arrays are stacked along
    a new leading ``chip_id`` axis. ``location`` and ``datetime`` from the
    metadata are attached as coordinates along ``chip_id``.

    Note: this redefines the ``get_dataset_pc`` from an earlier cell of this
    notebook.

    Parameters
    ----------
    n : int or None
        Number of metadata rows to use (``.iloc[:n]``); ``None`` keeps all rows.

    Returns
    -------
    xarray.Dataset
        Dataset with one variable, ``images``, with dims
        ``(chip_id, band, x, y)``.
    """
    df = pd.read_csv("/driven-data/cloud-cover/train_metadata.csv").iloc[:n]
    xy = np.arange(512)
    band = ['B02', 'B03', 'B04', 'B08', 'cloud_mask']
    # Shape declared to dask for each delayed chip; derived from the
    # coordinates so the two cannot drift apart (evaluates to (5, 512, 512)).
    chip_shape = (len(band), len(xy), len(xy))

    # Only chip_id is needed per row, so iterate that column directly;
    # total= gives tqdm a length for a real progress bar and ETA.
    chips = [
        dask.array.from_delayed(delayed(get_array)(chip_id), shape=chip_shape, dtype=np.int16)
        for chip_id in tqdm(df.chip_id, total=len(df))
    ]
    images = dask.array.stack(chips)

    da = xr.DataArray(
        images,
        dims=['chip_id', 'band', 'x', 'y'],
        coords=[df.chip_id.values, band, xy, xy],
        name='images',
    )
    ds = da.to_dataset()
    ds = add_coord(ds, df.location.values, 'location')
    ds = add_coord(ds, df.datetime.values, 'datetime')
    return ds

In [None]:
ds3 = get_dataset_pc()

In [None]:
ds3

In [None]:
# Persist the delayed-based dataset (ds3) so it can be reloaded from disk.
with open('xarray_train_from_tiff_all.pkl', 'wb') as f:
    pickle.dump(ds3, f)

In [None]:
# Reload ds3 from the pickle so the timing below runs on the unpickled object.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('xarray_train_from_tiff_all.pkl', 'rb') as f:
    ds3 = pickle.load(f)

In [None]:
%%time
for n in range(100): get_array('agpw')

In [None]:
%%time
# Per-chip timing against ds3 (the delayed-based dataset): only the first 10
# chips here, single-threaded scheduler, result discarded.
for n in range(10):
    x = ds3.isel(chip_id=n).compute(scheduler='single-threaded')

In [None]:
ds3