# System Verification

This notebook verifies the capabilities of the ESDL JupyterHub as stated in the Statement of Work.

### Datacube

In [None]:
import os
import pandas as pd
import time

# Directory whose entries are scanned for data cubes; each cube is a sub-directory of it.
ROOT_CUBE_DIR = '/home/jovyan/work/datacube'
CUBE_NAME = 'esdc-8d-0.25deg-1x720x1440-1.0.2_1'  # the newest cube is selected as the default cube name

def show_all_datacubes():
    """Summarise every data cube found under ``ROOT_CUBE_DIR``.

    A cube is any entry whose name starts with ``'esdc'``, does not contain
    ``'zarr'`` and has a ``data`` sub-directory. The cube name is split on
    ``'-'`` to obtain the temporal resolution, spatial resolution, chunk size
    and version name; missing name parts are reported as empty strings.

    Returns:
        pandas.DataFrame: one row per cube, sorted by ``version_name`` and
        ``var_nums`` in descending order, followed by a final ``'TOTAL'`` row
        whose ``size_gb`` is the combined size of all listed cubes.
    """
    columns = ['cube_name', 'temporal_resolution', 'spatial_resolution', 'chunk_size',
               'version_name', 'var_nums', 'size_gb', 'last_modified_time']
    records = []
    total_size = 0.0
    for filename in sorted(os.listdir(ROOT_CUBE_DIR)):
        if not filename.startswith('esdc') or 'zarr' in filename:
            continue
        var_dir = os.path.join(ROOT_CUBE_DIR, filename)
        data_dir = os.path.join(var_dir, 'data')
        if not os.path.isdir(data_dir):
            # Stray file or incomplete cube: nothing to count, so leave it out of the table.
            continue
        # Pad so that a name with fewer than five '-' separated parts does not raise IndexError.
        parts = (filename.split('-') + [''] * 5)[:5]
        size = get_dir_size(var_dir)
        # Accumulate the unrounded size; rounding happens only for display.
        total_size += size
        modified = time.strftime('%Y%m%d_%H%M%S', time.localtime(os.path.getmtime(var_dir)))
        records.append({
            'cube_name': filename,
            'temporal_resolution': parts[1],
            'spatial_resolution': parts[2],
            'chunk_size': parts[3],
            'version_name': parts[4],
            'var_nums': len(os.listdir(data_dir)),
            'size_gb': round(size, 2),
            'last_modified_time': modified,
        })

    # Build the frame once instead of appending row by row (DataFrame.append no longer exists in pandas 2.x).
    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values(by=['version_name', 'var_nums'], ascending=False)
    total_row = pd.DataFrame([{
        'cube_name': 'TOTAL',
        'temporal_resolution': '',
        'spatial_resolution': '',
        'chunk_size': '',
        'version_name': '',
        'var_nums': '',
        'size_gb': round(total_size, 2),
        'last_modified_time': '',
    }], columns=columns)
    # ignore_index renumbers the rows after sorting, matching the previous output layout.
    return pd.concat([df, total_row], ignore_index=True)

def get_dir_size(root_path):
    """Return the combined size of all files below ``root_path``.

    The directory tree is walked recursively and the byte sizes of all files
    are added up; the result is the byte count divided by 1024**3.
    """
    bytes_per_unit = 1024 ** 3
    byte_count = sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _subdirs, names in os.walk(root_path)
        for name in names
    )
    return byte_count / bytes_per_unit



#### Here is a list of available datacubes

_Done by browsing through the datacubes in the file system_

In [None]:
# The returned DataFrame is the last expression of the cell, so it is rendered as the cell output.
show_all_datacubes()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import interact
import xarray as xr
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the imported libraries.
warnings.filterwarnings('ignore')

def show_available_variables(cube_name=CUBE_NAME):
    """Plot the temporal coverage of every variable in a data cube.

    Each sub-directory of ``<ROOT_CUBE_DIR>/<cube_name>/data`` is treated as
    one variable. All of its ``*.nc`` files are opened as a single dataset and
    the years of the first and last entries of its ``time`` coordinate are
    recorded. The resulting table is passed to ``plot_var_temporal_range``.

    Args:
        cube_name: name of the cube directory below ``ROOT_CUBE_DIR``.
    """
    root_var_dir = os.path.join(ROOT_CUBE_DIR, cube_name, 'data')
    records = []
    for variable_name in sorted(os.listdir(root_var_dir)):
        var_dir = os.path.join(root_var_dir, variable_name)
        # The context manager closes the files even if reading the time axis fails.
        with xr.open_mfdataset(os.path.join(var_dir, '*.nc')) as ds:
            start_year = pd.DatetimeIndex([ds.time[0].values])[0].year
            end_year = pd.DatetimeIndex([ds.time[-1].values])[0].year
        records.append({'var_name': variable_name, 'start_year': start_year, 'end_year': end_year})
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    df = pd.DataFrame(records, columns=['var_name', 'start_year', 'end_year'])
    plot_var_temporal_range(df, cube_name)

def plot_var_temporal_range(df, cube_name):
    """Draw one horizontal bar per variable from its start year to its end year.

    Args:
        df: table with the columns ``var_name``, ``start_year`` and ``end_year``;
            the row index is used as the vertical position of each bar.
        cube_name: cube name shown in the figure title.
    """
    plt.figure(figsize=(16,14))
    # The bar positions are passed positionally: the keyword was renamed from
    # ``bottom`` to ``y`` in matplotlib, so naming it breaks on one side or the other.
    plt.barh(df.index.values, df.end_year - df.start_year, left=df.start_year, tick_label=df.var_name, color='orange')
    plt.xlabel('Year')
    plt.title(f'Temporal range for {cube_name}')

In [None]:
# Cube names offered in the dropdown: entries of ROOT_CUBE_DIR that start
# with 'esdc' and do not contain 'zarr' in their name.
cube_names = [
    name for name in sorted(os.listdir(ROOT_CUBE_DIR))
    if name.startswith('esdc') and 'zarr' not in name
]

#### Available variables on each datacube

On the latest datacube, QA4ECV data (black_sky_albedo_avhrr, white_sky_albedo_avhrr, fapar_tip, and leaf_area_index) have been added and the temporal range has been extended. Use the dropdown list below to see the temporal range and variable availability for each datacube.

In [None]:
# Open a single file of the Colombia cube (variable LSTday, file 2001_LSTday.nc) to inspect its contents.
# The path is built from ROOT_CUBE_DIR so the cube location is configured in one place only.
xr.open_dataset(os.path.join(ROOT_CUBE_DIR, 'esdc-8d-0.0083deg-46x60x60-1.0.2_colombia', 'data', 'LSTday', '2001_LSTday.nc'))

In [None]:
# Passing the list of cube names lets the user pick the cube whose variables are plotted.
interact(show_available_variables, cube_name=cube_names)

#### A quick way to verify that the data and its attributes in each dataset are valid

This is done by plotting a single time step of a single variable using xarray's built-in `plot()` function, which picks up some information from the dataset and displays it as supporting information on the plot. To select a different variable, use the dropdown list created by `interact`. Selecting a different time step unfortunately has to be done manually in the function: change `0` in `ds[var_name][0].plot(figsize=(18,9), aspect='auto')` to any index within the valid time range of the selected variable.

In [None]:
# Variable names of the default cube: the sorted entries of its 'data' directory.
cube_dir = os.path.join(ROOT_CUBE_DIR, CUBE_NAME, 'data')
var_names = sorted(os.listdir(cube_dir))

In [None]:
def plot_single_var_single_time(var_name='air_temperature_2m'):
    """Plot one time step of one variable of the default cube (``CUBE_NAME``).

    All ``*.nc`` files in the variable's directory are opened as a single
    dataset and the element at index 0 of the variable is plotted with
    xarray's built-in ``plot()``.

    Args:
        var_name: name of the variable, which is also the name of its
            directory below ``<ROOT_CUBE_DIR>/<CUBE_NAME>/data``.
    """
    # Plot inside the ``with`` block so the data is read before the files are closed again.
    with xr.open_mfdataset(os.path.join(ROOT_CUBE_DIR, CUBE_NAME, 'data', var_name, '*.nc')) as ds:
        # Change the index 0 by hand to plot a different time step.
        ds[var_name][0].plot(figsize=(18,9), aspect='auto')

In [None]:
# Passing the list of variable names lets the user pick the variable to plot.
interact(plot_single_var_single_time, var_name=var_names)