# S2S Competition - Data Processing

## Notebook configuration

In [None]:
# Notebook-wide settings. Edit these before running the cells below.

# When True, the "Boot Dask Cluster" cell creates a SLURM-backed Dask cluster.
BOOT_DASK_CLUSTER: bool = False

# Directory whose entries are listed to find the input files.
INPUT_DATA: str = "***BASEDIR***training-input/0.3.0/netcdf"
OUTPUT_DIR: str = "***HOME***Projets/S2S-Competition/outputs"

# File-stem prefix used to select the files listed from INPUT_DATA.
FILTER_FILE: str = "eccc"
# NOTE(review): same literal as FILTER_FILE; presumably the forecasting
# center name -- confirm whether the two are meant to stay in sync.
CENTER = "eccc"

variables_experimentales: list = ["t", "gh", "u", "v", "sst", "lsm"]
variables_to_predict: list = ["t2m", "tp"]

## Import packages

In [None]:
import dask
import dask.array as da
import dask_jobqueue
import dask_jobqueue

In [None]:
import pathlib
import xarray as xr

import sys
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time

In [17]:
from crims2s.util import fix_dataset_dims

## Boot Dask Cluster

In [None]:
if BOOT_DASK_CLUSTER:
    # `import dask` alone does not guarantee that the `dask.distributed`
    # submodule is loaded, so import it explicitly before using
    # `dask.distributed.Client` below.
    import dask.distributed

    # Describe a single SLURM job; `cluster.scale` below decides how many of
    # these jobs are requested.
    # NOTE(review): recent dask-jobqueue releases rename `env_extra` to
    # `job_script_prologue` -- confirm against the installed version.
    cluster = dask_jobqueue.SLURMCluster(
        cores=12,
        processes=6,
        memory='128G',
        env_extra=['source ***HOME***.bash_profile', 'conda activate s2s'],
        name='agri-dask',
        local_directory='***CACHE***',  # PUT YOUR CRIM LOGIN HERE
        walltime='3:00:00',
    )

    cluster.scale(jobs=2)  # Request two SLURM jobs with the settings above.
    client = dask.distributed.Client(cluster)

    # A bare `client` expression nested inside the `if` is not echoed by the
    # notebook (only a cell's last top-level expression is), so print it.
    print(client)

# Extract information from the data

In [None]:
# Wrap the configured INPUT_DATA string in a Path so it can be iterated below.
input_path = pathlib.Path(INPUT_DATA)

In [None]:
# Sorted entries of `input_path` whose stem begins with the FILTER_FILE prefix.
eccc_files = sorted(
    path for path in input_path.iterdir() if path.stem.startswith(FILTER_FILE)
)

In [None]:
### Extract ecmwf files

In [None]:
# List the ECMWF files first, mirroring the NCEP section: no visible cell
# assigns `ecmwf_files` before it is read here. Rebuilding the list in this
# cell also keeps it safe to re-run after the name has been rebound.
# NOTE(review): the "ecmwf" prefix is assumed from the section title and the
# "ncep" pattern -- confirm against the actual file names.
ecmwf_files = sorted(
    path for path in input_path.iterdir() if path.stem.startswith("ecmwf")
)
# NOTE(review): `extract_informations_from_data` is not defined in the visible
# cells; the annotation says it yields a DataFrame -- confirm.
ecmwf_files: pd.DataFrame = extract_informations_from_data(ecmwf_files)

### Extract ncep files

In [None]:
# Sorted entries of `input_path` whose stem begins with "ncep".
ncep_files = sorted(
    path for path in input_path.iterdir() if path.stem.startswith("ncep")
)

In [None]:
# Rebind `ncep_files` from the sorted list of paths to the value returned by
# `extract_informations_from_data` (annotated as a DataFrame).
# NOTE(review): `extract_informations_from_data` is not defined in the visible
# cells -- confirm where it comes from. Re-running this cell on its own passes
# the already-converted value back into the helper.
ncep_files: pd.DataFrame = extract_informations_from_data(ncep_files)

## Check available variables from ECCC files

In [None]:
# NOTE(review): `eccc_files_df` is not assigned in any visible cell -- confirm
# where it is defined before running this cell.
# Keep the first row for each distinct `Variable_shortname`, then use that
# column as the index.
sub_eccc_files_df = (
    eccc_files_df
    .drop_duplicates(subset=['Variable_shortname'])
    .set_index("Variable_shortname")
)
sub_eccc_files_df

In [None]:
# Call `extract_data_for_plev` on the ECCC table with plev=1000 and show the result.
# NOTE(review): `extract_data_for_plev`, `variables_with_plev`,
# `variables_without_plev` and `eccc_files_df` are not defined in the visible
# cells -- confirm they exist before this cell runs.
flat_data = extract_data_for_plev(eccc_files_df, variables_with_plev, variables_without_plev, plev=1000)
flat_data