# S2S Competition - Data Processing

## Notebook configuration

In [13]:
%load_ext autoreload
%autoreload 2

In [33]:
# When True, the "Boot Dask Cluster" cell starts a SLURM-backed Dask cluster.
BOOT_DASK_CLUSTER: bool = False

# Directory scanned for the NetCDF training inputs.
INPUT_DATA: str = '***BASEDIR***training-input/0.3.0/netcdf'
# Directory receiving the exploration CSV files.
OUTPUT_DIR: str = '***HOME***Projets/S2S-Competition/outputs'
# File-name prefix used to select one centre's files from INPUT_DATA.
FILTER_FILE: str = "eccc"

# CF variable names used to filter the per-variable summary (candidate predictors).
variables_experimentales: list = ["t", "gh", "u", "v", "sst", "lsm"]
# CF variable names of the forecast targets.
variables_to_predict: list = ["t2m", "tp"]
# NOTE(review): same value as FILTER_FILE and not referenced in the visible cells - confirm it is still needed.
CENTER = 'eccc'

## Import packages

In [15]:
import dask
import dask.array as da
import dask.distributed
import dask_jobqueue

In [16]:
import pathlib
import xarray as xr

import sys
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

In [17]:
from crims2s.util import fix_dataset_dims

## Boot Dask Cluster

In [18]:
if BOOT_DASK_CLUSTER:
    # Describe the SLURM jobs that will host the Dask workers.
    cluster = dask_jobqueue.SLURMCluster(
        cores=12,
        processes=6,
        memory='128G',
        # Shell lines handed to dask_jobqueue so each job activates the `s2s` conda environment.
        env_extra=['source ***HOME***.bash_profile','conda activate s2s'],
        name='agri-dask',
        local_directory='***CACHE***', # PUT YOUR CRIM LOGIN HERE
        walltime='3:00:00'
    )

    cluster.scale(jobs=2)  # Scale to two working nodes as configured.
    client = dask.distributed.Client(cluster)

    # A bare `client` expression nested inside the `if` body is never rendered
    # by the notebook (only a top-level trailing expression is), so display it
    # explicitly. `display` is the builtin provided by the IPython kernel.
    display(client)

# Extract information from data

In [19]:
def extract_informations_from_data(files: list) -> pd.DataFrame:
    """Summarise the first data variable of each dataset file in a table.

    Each file is opened with ``xr.open_dataset``; the GRIB attributes of its
    first data variable are copied into one row of the result.

    Parameters
    ----------
    files : list
        Paths (``str`` or path-like) of the dataset files to inspect.

    Returns
    -------
    pd.DataFrame
        One row per file, with the columns listed in ``columns`` below.
        ``"Level"`` holds the values of the variable's ``realization``
        coordinate. ``"Dimensions"`` is the comma-separated, sorted symmetric
        difference between the variable's coordinate names and the six
        expected ones (so it lists extra coordinates as well as any expected
        coordinate that is absent). An empty ``files`` list yields an empty
        frame that still has all the columns.

    Raises
    ------
    IndexError
        If a file contains no data variable.
    AttributeError
        If the first data variable lacks one of the attributes read here.
    """
    columns = ["File", "Variable_shortname", "Variable_longname", "Variable_cfname", "Unit",
               "Step_type", "Level_type", "Level", "Number_of_points", "Missing_values", "Dimensions",
               "filepath"]
    expected_coords = {"forecast_time", "latitude", "lead_time", "longitude", "realization", "valid_time"}

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    records = []

    for file in tqdm(files, desc="Extract informations from dataset files"):
        # The context manager closes the file handle before moving on; with
        # hundreds of files, leaving them open can exhaust file descriptors.
        with xr.open_dataset(file) as ds:
            variable = ds[list(ds.data_vars.keys())[0]]

            # Sorted so the joined string is deterministic (set order is not).
            other_coords = sorted(expected_coords.symmetric_difference(variable.coords.keys()))

            records.append({"File": os.path.splitext(os.path.basename(str(file)))[0],
                            "Variable_shortname": variable.GRIB_shortName,
                            "Variable_longname": variable.long_name,
                            "Variable_cfname": variable.GRIB_cfVarName,
                            "Unit": variable.GRIB_units,
                            "Step_type": variable.GRIB_stepType,
                            "Level_type": variable.GRIB_typeOfLevel,
                            # Read inside the `with` block so the values are in memory before closing.
                            "Level": variable.realization.values,
                            "Number_of_points": variable.GRIB_numberOfPoints,
                            "Missing_values": variable.GRIB_missingValue,
                            "Dimensions": ", ".join(other_coords),
                            "filepath": str(file)})

    return pd.DataFrame(records, columns=columns)

In [20]:
# INPUT_DATA as a Path object, so the directory can be iterated in the cells below.
input_path = pathlib.Path(INPUT_DATA)

In [21]:
# A centre name is the part of each entry's base name (extension removed)
# that precedes the first "-".
all_center = {
    os.path.splitext(os.path.basename(entry))[0].split("-")[0]
    for entry in input_path.iterdir()
}
print("Availble center are:", all_center)

Availble center are: {'ecmwf', 'eccc', 'ncep'}


### Extract eccc files

In [22]:
# Sorted paths of the entries in input_path whose stem starts with FILTER_FILE.
eccc_files = sorted(
    entry for entry in input_path.iterdir() if entry.stem.startswith(FILTER_FILE)
)

In [23]:
# One summary row per ECCC file (opens every file in eccc_files).
eccc_files_df: pd.DataFrame = extract_informations_from_data(eccc_files)

Extract informations from dataset files:   0%|          | 0/901 [00:00<?, ?it/s]

### Extract ecmwf files

In [24]:
# Sorted paths of the entries in input_path whose stem starts with "ecmwf".
ecmwf_files = sorted(
    entry for entry in input_path.iterdir() if entry.stem.startswith("ecmwf")
)

In [25]:
# One summary row per ECMWF file.
# NOTE(review): this rebinds `ecmwf_files` from the list of paths to the summary
# DataFrame, so re-running this cell alone would pass a DataFrame to the function.
# Consider a separate name (as done with `eccc_files_df`).
ecmwf_files: pd.DataFrame = extract_informations_from_data(ecmwf_files)

Extract informations from dataset files:   0%|          | 0/1113 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Extract ncep files

In [None]:
# Sorted paths of the entries in input_path whose stem starts with "ncep".
ncep_files = sorted(
    entry for entry in input_path.iterdir() if entry.stem.startswith("ncep")
)

In [None]:
# One summary row per NCEP file.
# NOTE(review): this rebinds `ncep_files` from the list of paths to the summary
# DataFrame, so re-running this cell alone would pass a DataFrame to the function.
ncep_files: pd.DataFrame = extract_informations_from_data(ncep_files)

## Check available variables from ECCC files

In [26]:
# Keep the first row seen for each GRIB short name and index the table by it.
sub_eccc_files_df = (
    eccc_files_df
    .drop_duplicates(subset=['Variable_shortname'])
    .set_index("Variable_shortname")
)
sub_eccc_files_df

Unnamed: 0_level_0,File,Variable_longname,Variable_cfname,Unit,Step_type,Level_type,Level,Number_of_points,Missing_values,Dimensions,filepath
Variable_shortname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ci,eccc-hindcast-ci-20200102,Sea ice area fraction,siconc,(0 - 1),avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
gh,eccc-hindcast-gh-20200102,Geopotential Height,gh,gpm,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
lsm,eccc-hindcast-lsm-20200102,Land-sea mask,lsm,(0 - 1),instant,surface,[0],29040,9999,,***BASEDIR***training-inp...
msl,eccc-hindcast-msl-20200102,Mean sea level pressure,msl,Pa,instant,meanSea,"[0, 1, 2, 3]",29040,9999,meanSea,***BASEDIR***training-inp...
q,eccc-hindcast-q-20200102,Specific humidity,q,kg kg**-1,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
rsn,eccc-hindcast-rsn-20200102,Snow density,rsn,kg m**-3,avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
sp,eccc-hindcast-sp-20200102,Surface pressure,sp,Pa,instant,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
sst,eccc-hindcast-sst-20200102,Sea surface temperature,sst,K,avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
t,eccc-hindcast-t-20200102,Temperature,t,K,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
2t,eccc-hindcast-t2m-20200102,2 metre temperature,t2m,K,avg,heightAboveGround,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...


In [30]:
# Keep the first row seen for each GRIB short name and index the table by it.
# NOTE(review): the result overwrites its own input, so this cell cannot be
# re-run as-is: "Variable_shortname" is no longer a column afterwards.
ecmwf_files = (
    ecmwf_files
    .drop_duplicates(subset=['Variable_shortname'])
    .set_index("Variable_shortname")
)
ecmwf_files

Unnamed: 0_level_0,File,Variable_longname,Variable_cfname,Unit,Step_type,Level_type,Level,Number_of_points,Missing_values,Dimensions,filepath
Variable_shortname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ci,ecmwf-hindcast-ci-20200102,Sea ice area fraction,siconc,(0 - 1),avg,surface,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,,***BASEDIR***training-inp...
gh,ecmwf-hindcast-gh-20200102,Geopotential Height,gh,gpm,instant,isobaricInhPa,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,plev,***BASEDIR***training-inp...
lsm,ecmwf-hindcast-lsm-20200102,Land-sea mask,lsm,(0 - 1),instant,surface,[0],29040,9999,,***BASEDIR***training-inp...
msl,ecmwf-hindcast-msl-20200102,Mean sea level pressure,msl,Pa,instant,meanSea,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,meanSea,***BASEDIR***training-inp...
q,ecmwf-hindcast-q-20200102,Specific humidity,q,kg kg**-1,instant,isobaricInhPa,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,plev,***BASEDIR***training-inp...
rsn,ecmwf-hindcast-rsn-20200102,Snow density,rsn,kg m**-3,avg,surface,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,,***BASEDIR***training-inp...
sm100,ecmwf-hindcast-sm100-20200102,Soil moisture top 100 cm,sm100,kg m**-3,avg,depthBelowLandLayer,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,depth_below_and_layer,***BASEDIR***training-inp...
sm20,ecmwf-hindcast-sm20-20200102,Soil moisture top 20 cm,sm20,kg m**-3,avg,depthBelowLandLayer,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,depth_below_and_layer,***BASEDIR***training-inp...
sp,ecmwf-hindcast-sp-20200102,Surface pressure,sp,Pa,instant,surface,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,,***BASEDIR***training-inp...
sst,ecmwf-hindcast-sst-20200102,Sea surface temperature,sst,K,avg,surface,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",29040,9999,,***BASEDIR***training-inp...


In [31]:
# Keep the first row seen for each GRIB short name and index the table by it.
# NOTE(review): the result overwrites its own input, so this cell cannot be
# re-run as-is: "Variable_shortname" is no longer a column afterwards.
ncep_files = (
    ncep_files
    .drop_duplicates(subset=['Variable_shortname'])
    .set_index("Variable_shortname")
)
ncep_files

Unnamed: 0_level_0,File,Variable_longname,Variable_cfname,Unit,Step_type,Level_type,Level,Number_of_points,Missing_values,Dimensions,filepath
Variable_shortname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ci,ncep-hindcast-ci-20100107,Sea ice area fraction,siconc,(0 - 1),avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
gh,ncep-hindcast-gh-20100107,Geopotential Height,gh,gpm,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
lsm,ncep-hindcast-lsm-20100107,Land-sea mask,lsm,(0 - 1),instant,surface,[0],29040,9999,,***BASEDIR***training-inp...
msl,ncep-hindcast-msl-20100107,Mean sea level pressure,msl,Pa,instant,meanSea,"[0, 1, 2, 3]",29040,9999,meanSea,***BASEDIR***training-inp...
q,ncep-hindcast-q-20100107,Specific humidity,q,kg kg**-1,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
sm100,ncep-hindcast-sm100-20100107,Soil moisture top 100 cm,sm100,kg m**-3,avg,depthBelowLandLayer,"[0, 1, 2, 3]",29040,9999,depth_below_and_layer,***BASEDIR***training-inp...
sm20,ncep-hindcast-sm20-20100107,Soil moisture top 20 cm,sm20,kg m**-3,avg,depthBelowLandLayer,"[0, 1, 2, 3]",29040,9999,depth_below_and_layer,***BASEDIR***training-inp...
sp,ncep-hindcast-sp-20100107,Surface pressure,sp,Pa,instant,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
sst,ncep-hindcast-sst-20100107,Sea surface temperature,sst,K,avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
st100,ncep-hindcast-st100-20100107,Soil temperature top 100 cm,st100,K,avg,depthBelowLandLayer,"[0, 1, 2, 3]",29040,9999,depth_below_and_layer,***BASEDIR***training-inp...


In [17]:
# Write the per-variable ECCC summary to OUTPUT_DIR (the directory must already exist).
sub_eccc_files_df.to_csv(os.path.join(OUTPUT_DIR, 'eccc_exploration.csv'))

In [32]:
# Write the per-variable ECMWF summary to OUTPUT_DIR (the directory must already exist).
ecmwf_files.to_csv(os.path.join(OUTPUT_DIR, 'ecmwf_exploration.csv'))

In [33]:
# Write the per-variable NCEP summary to OUTPUT_DIR (the directory must already exist).
ncep_files.to_csv(os.path.join(OUTPUT_DIR, 'ncep_exploration.csv'))

## Check experimental variables in ECCC files

In [27]:
# Rows of the ECCC summary whose CF name is one of `variables_experimentales`.
sub_eccc_files_df[sub_eccc_files_df.Variable_cfname.isin(variables_experimentales)]

Unnamed: 0_level_0,File,Variable_longname,Variable_cfname,Unit,Step_type,Level_type,Level,Number_of_points,Missing_values,Dimensions,filepath
Variable_shortname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gh,eccc-hindcast-gh-20200102,Geopotential Height,gh,gpm,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
lsm,eccc-hindcast-lsm-20200102,Land-sea mask,lsm,(0 - 1),instant,surface,[0],29040,9999,,***BASEDIR***training-inp...
sst,eccc-hindcast-sst-20200102,Sea surface temperature,sst,K,avg,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
t,eccc-hindcast-t-20200102,Temperature,t,K,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
u,eccc-hindcast-u-20200102,U component of wind,u,m s**-1,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...
v,eccc-hindcast-v-20200102,V component of wind,v,m s**-1,instant,isobaricInhPa,"[0, 1, 2, 3]",29040,9999,plev,***BASEDIR***training-inp...


In [28]:
# Rows of the ECCC summary whose CF name is one of `variables_to_predict`.
sub_eccc_files_df[sub_eccc_files_df.Variable_cfname.isin(variables_to_predict)]

Unnamed: 0_level_0,File,Variable_longname,Variable_cfname,Unit,Step_type,Level_type,Level,Number_of_points,Missing_values,Dimensions,filepath
Variable_shortname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2t,eccc-hindcast-t2m-20200102,2 metre temperature,t2m,K,avg,heightAboveGround,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...
tp,eccc-hindcast-tp-20200102,Total Precipitation,tp,kg m**-2,accum,surface,"[0, 1, 2, 3]",29040,9999,,***BASEDIR***training-inp...


## Process data

In [76]:
def extract_data_for_plev(center_dataframe: "pd.DataFrame", variables_with_plev: list, variables_without_plev: list, plev: int = 1000) -> "xr.Dataset":
    """Merge the requested variables into one dataset, restricted to one ``plev`` value.

    Parameters
    ----------
    center_dataframe : pd.DataFrame
        Table with at least the columns ``Variable_cfname`` and ``filepath``
        (as produced by ``extract_informations_from_data``).
    variables_with_plev : list
        CF names of the variables that carry a ``plev`` coordinate. Their
        files are opened together and then selected with ``plev=[plev]``.
    variables_without_plev : list
        CF names of the variables opened as-is, without any selection.
    plev : int, default 1000
        Value of the ``plev`` coordinate to keep. It is passed as a
        one-element list, so the ``plev`` dimension is kept with length 1.

    Returns
    -------
    xr.Dataset
        ``xr.merge`` of the datasets that were opened. A group with no
        matching file is skipped rather than making the call fail.

    Raises
    ------
    FileNotFoundError
        If neither group matches any file. This is an ``OSError`` subclass,
        the error type ``xr.open_mfdataset`` raises for an empty file list.
    """
    cf_names = center_dataframe.Variable_cfname

    def _filepaths_for(variables: list) -> list:
        # Paths of every file whose CF variable name is in `variables`.
        return center_dataframe[cf_names.isin(variables)].filepath.tolist()

    files_without_plev = _filepaths_for(variables_without_plev)
    files_with_plev = _filepaths_for(variables_with_plev)

    if not files_without_plev and not files_with_plev:
        raise FileNotFoundError(
            "no files to open: none of the requested variables is present in center_dataframe"
        )

    # open_mfdataset rejects an empty file list, so only open the groups that
    # actually have files. The merge order is unchanged: no-plev data first.
    parts = []
    if files_without_plev:
        parts.append(xr.open_mfdataset(files_without_plev, preprocess=fix_dataset_dims))
    if files_with_plev:
        data_with_plev = xr.open_mfdataset(files_with_plev, preprocess=fix_dataset_dims)
        parts.append(data_with_plev.sel(plev=[plev]))

    return xr.merge(parts)

In [29]:
# Experimental variables whose "Dimensions" entry is empty, i.e. with no
# coordinate beyond the expected ones.
variables_without_plev = (
    eccc_files_df.loc[
        eccc_files_df.Dimensions.eq("")
        & eccc_files_df.Variable_cfname.isin(variables_experimentales),
        "Variable_cfname",
    ]
    .unique()
    .tolist()
)
print("Variables without plev dimensions are", variables_without_plev)

Variables without plev dimensions are ['lsm', 'sst']


In [30]:
# Experimental variables whose only additional coordinate is "plev".
variables_with_plev = (
    eccc_files_df.loc[
        eccc_files_df.Dimensions.eq("plev")
        & eccc_files_df.Variable_cfname.isin(variables_experimentales),
        "Variable_cfname",
    ]
    .unique()
    .tolist()
)
print("Variables with plev dimensions are", variables_with_plev)

Variables with plev dimensions are ['gh', 't', 'u', 'v']


In [78]:
# Merge every ECCC experimental variable into one dataset, keeping only plev=1000
# for the variables that have a plev coordinate. All files listed in eccc_files_df are used.
flat_data = extract_data_for_plev(eccc_files_df, variables_with_plev, variables_without_plev, plev=1000)
flat_data

Unnamed: 0,Array,Chunk
Bytes,265.00 kiB,5.00 kiB
Shape,"(53, 20, 32)","(1, 20, 32)"
Count,1258 Tasks,53 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 265.00 kiB 5.00 kiB Shape (53, 20, 32) (1, 20, 32) Count 1258 Tasks 53 Chunks Type datetime64[ns] numpy.ndarray",32  20  53,

Unnamed: 0,Array,Chunk
Bytes,265.00 kiB,5.00 kiB
Shape,"(53, 20, 32)","(1, 20, 32)"
Count,1258 Tasks,53 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,70.90 MiB
Shape,"(20, 53, 4, 32, 121, 240)","(20, 1, 1, 32, 121, 240)"
Count,1924 Tasks,212 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 70.90 MiB Shape (20, 53, 4, 32, 121, 240) (20, 1, 1, 32, 121, 240) Count 1924 Tasks 212 Chunks Type float32 numpy.ndarray",4  53  20  240  121  32,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,70.90 MiB
Shape,"(20, 53, 4, 32, 121, 240)","(20, 1, 1, 32, 121, 240)"
Count,1924 Tasks,212 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 121, 240)","(20, 1, 4, 32, 121, 240)"
Count,265 Tasks,53 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 283.59 MiB Shape (20, 53, 4, 32, 121, 240) (20, 1, 4, 32, 121, 240) Count 265 Tasks 53 Chunks Type float32 numpy.ndarray",4  53  20  240  121  32,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 121, 240)","(20, 1, 4, 32, 121, 240)"
Count,265 Tasks,53 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 283.59 MiB Shape (20, 53, 4, 32, 1, 121, 240) (20, 1, 4, 32, 1, 121, 240) Count 318 Tasks 53 Chunks Type float32 numpy.ndarray",20  1  32  4  53  240  121  1,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 283.59 MiB Shape (20, 53, 4, 32, 1, 121, 240) (20, 1, 4, 32, 1, 121, 240) Count 318 Tasks 53 Chunks Type float32 numpy.ndarray",20  1  32  4  53  240  121  1,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 283.59 MiB Shape (20, 53, 4, 32, 1, 121, 240) (20, 1, 4, 32, 1, 121, 240) Count 318 Tasks 53 Chunks Type float32 numpy.ndarray",20  1  32  4  53  240  121  1,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 14.68 GiB 283.59 MiB Shape (20, 53, 4, 32, 1, 121, 240) (20, 1, 4, 32, 1, 121, 240) Count 318 Tasks 53 Chunks Type float32 numpy.ndarray",20  1  32  4  53  240  121  1,

Unnamed: 0,Array,Chunk
Bytes,14.68 GiB,283.59 MiB
Shape,"(20, 53, 4, 32, 1, 121, 240)","(20, 1, 4, 32, 1, 121, 240)"
Count,318 Tasks,53 Chunks
Type,float32,numpy.ndarray
