# S2S Competition - Data Processing

## Notebook configuration

In [None]:
# Set to True to start the SLURM-backed Dask cluster further down.
BOOT_DASK_CLUSTER: bool = False

# Directory scanned for input files.
INPUT_DATA: str = "***BASEDIR***training-input/0.3.0/netcdf"
# Only files whose stem starts with this prefix are processed.
FILTER_FILE: str = "eccc"

## Import packages

In [None]:
import dask
import dask.array as da
import dask_jobqueue
import dask_jobqueue

In [None]:
import pathlib
import xarray as xr

import os
import pandas as pd
from tqdm.notebook import tqdm
import time

## Boot Dask Cluster

In [None]:
# Start a Dask cluster through SLURM only when the notebook is configured
# to do so (see BOOT_DASK_CLUSTER).
# NOTE(review): no Client is attached and no workers are requested in this
# cell — confirm that happens elsewhere.
if BOOT_DASK_CLUSTER:
    cluster = dask_jobqueue.SLURMCluster(
        name='agri-dask',
        # Resources requested for each job.
        cores=12,
        processes=6,
        memory='128G',
        walltime='3:00:00',
        # Shell commands passed along to set up the conda environment.
        env_extra=['source ***HOME***.bash_profile', 'conda activate s2s'],
        # PUT YOUR CRIM LOGIN HERE
        local_directory='***CACHE***',
    )

# Extract information from the data

In [None]:
# Wrap the configured input directory in a Path object.
input_path = pathlib.Path(INPUT_DATA)

In [None]:
# Sorted list of the directory entries whose stem begins with FILTER_FILE.
eccc_files = sorted(
    entry
    for entry in input_path.iterdir()
    if entry.stem.startswith(FILTER_FILE)
)

In [None]:
# Distinct variable tokens found in the file names: the stem is split on "-"
# and the second-to-last piece is kept.
variables: set = {file.stem.split("-")[-2] for file in eccc_files}

In [None]:
# Build one summary row per input file from the attributes of the first
# data variable found in it.

# Column layout of the summary table (also used when no file matched).
summary_columns = [
    "File",
    "Variable_shortname",
    "Variable_longname",
    "Variable_cfname",
    "Unit",
    "Step_type",
    "Level_type",
    "Level",
    "Number_of_points",
    "Missing_values",
    "Dimensions",
]

# Reference coordinate names; the "Dimensions" column stores the symmetric
# difference between this set and the coordinates actually present, i.e.
# both unexpected extras and expected-but-missing names.
expected_coords = {
    "forecast_time",
    "latitude",
    "lead_time",
    "longitude",
    "realization",
    "valid_time",
}

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copied the whole frame on every call and no longer
# exists in pandas >= 2.0.
summary_rows = []

for file in tqdm(eccc_files, desc="Extract informations from dataset files"):
    # The context manager closes the dataset as soon as its metadata is read,
    # so file handles do not accumulate over the loop.
    with xr.open_dataset(file) as ds:
        var_name = list(ds.data_vars.keys())
        variable = ds[var_name[0]]  # only the first data variable is described

        summary_rows.append({
            "File": file.stem,
            "Variable_shortname": variable.GRIB_shortName,
            "Variable_longname": variable.long_name,
            "Variable_cfname": variable.GRIB_cfVarName,
            "Unit": variable.GRIB_units,
            "Step_type": variable.GRIB_stepType,
            "Level_type": variable.GRIB_typeOfLevel,
            # NOTE(review): "Level" is filled from the `realization`
            # coordinate values — confirm this is the intended source.
            "Level": variable.realization.data,
            "Number_of_points": variable.GRIB_numberOfPoints,
            "Missing_values": variable.GRIB_missingValue,
            "Dimensions": expected_coords.symmetric_difference(variable.coords),
        })

eccc_files_df: pd.DataFrame = pd.DataFrame(summary_rows, columns=summary_columns)

## Check available variables with some information

In [None]:
# Keep the first row seen for each variable short name and use that short
# name as the index of the resulting table.
sub_eccc_files_df = (
    eccc_files_df
    .drop_duplicates(subset=["Variable_shortname"])
    .set_index("Variable_shortname")
)
# Last expression of the cell: shown as the cell output.
sub_eccc_files_df

In [None]:
# Open the first file of the sorted selection and display it for inspection.
# This rebinds `ds`, which the extraction loop above also uses.
ds = xr.open_dataset(eccc_files[0])
ds