# Load data

1. Set path:
    - dir_obs: path to train and test file containing precipitation observation
    - dir_predictors_train: path to predictors containing training period (2000 - 2017)  
    - dir_predictors_test: path to predictors containing test data (2018 and 2019)

In [3]:
import xarray as xr
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt
 
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import cartopy.crs as ccrs

# Data:

1. Target variable: Precipitation amount
2. Data is divided into train and test
3. Grid of data: 19 x 61 
4. Training days: 6209
5. Test days: 730

In [4]:

dir_obs = '/home/gregor/datasets/precipitation/observation/'
dir_predictors_train = '/home/gregor/datasets/precipitation/predictors/train/'
dir_predictors_test = '/home/gregor/datasets/precipitation/predictors/test/' 

# Load observation

In [5]:
# load data
obs_train = xr.open_dataset(dir_obs + 'obs_precip_train.nc') 
obs_test = xr.open_dataset(dir_obs + 'obs_precip_test.nc')

In [6]:
obs_train.precipitationCal.values.shape

(6209, 19, 61)

In [7]:
obs_test.precipitationCal.values.shape

(730, 19, 61)

# Create splits

In [14]:
cv_splits = TimeSeriesSplit(n_splits=7, test_size=365)

In [15]:
split = cv_splits.split(obs_train.precipitationCal.values)

In [16]:
for i, (train_idx, val_idx) in enumerate(cv_splits.split(obs_train.precipitationCal.values)):
    print(f'Fold {i}:')
    print(obs_train.precipitationCal.values[train_idx].shape, obs_train.precipitationCal.values[val_idx].shape)
    print(f'Train start: {train_idx[0]}, train end: {train_idx[-1]}')
    print(f'Val start: {val_idx[0]}, val end: {val_idx[-1]} \n')

Fold 0:
(3654, 19, 61) (365, 19, 61)
Train start: 0, train end: 3653
Val start: 3654, val end: 4018 

Fold 1:
(4019, 19, 61) (365, 19, 61)
Train start: 0, train end: 4018
Val start: 4019, val end: 4383 

Fold 2:
(4384, 19, 61) (365, 19, 61)
Train start: 0, train end: 4383
Val start: 4384, val end: 4748 

Fold 3:
(4749, 19, 61) (365, 19, 61)
Train start: 0, train end: 4748
Val start: 4749, val end: 5113 

Fold 4:
(5114, 19, 61) (365, 19, 61)
Train start: 0, train end: 5113
Val start: 5114, val end: 5478 

Fold 5:
(5479, 19, 61) (365, 19, 61)
Train start: 0, train end: 5478
Val start: 5479, val end: 5843 

Fold 6:
(5844, 19, 61) (365, 19, 61)
Train start: 0, train end: 5843
Val start: 5844, val end: 6208 



# Define subset of data

In [None]:
# to be done (e.g. in data module)



# Start with Subset of data?

# --> Use the following: 
# kindx_train_np
# corr1_train_np
# pw_train_np
# cape_train_np
# rh8_train_np
# d2m_train_np
# geodiff_train_np
# sp_train_np

# Test dataset merge

In [28]:
d2m_train = xr.open_dataset(dir_predictors_train + 'd2m_2000_2017.nc') 

In [29]:
d2m_train

In [30]:
g7_train = xr.open_dataset(dir_predictors_train + 'geo700_2000_2017.nc')

In [41]:
type(g7_train.z.values)

numpy.ndarray

In [42]:
combined = xr.merge([g7_train, d2m_train])

In [46]:
combined.values

<bound method Mapping.values of <xarray.Dataset>
Dimensions:    (longitude: 61, latitude: 19, time: 6209)
Coordinates:
  * longitude  (longitude) float32 -25.0 -24.0 -23.0 -22.0 ... 33.0 34.0 35.0
  * latitude   (latitude) float32 0.0 1.0 2.0 3.0 4.0 ... 15.0 16.0 17.0 18.0
  * time       (time) datetime64[ns] 2000-12-01 2000-12-02 ... 2017-11-30
Data variables:
    z          (time, latitude, longitude) float32 3.094e+04 ... 3.107e+04
    d2m        (time, latitude, longitude) float32 ...
Attributes:
    Conventions:  CF-1.6
    history:      2022-03-30 15:07:04 GMT by grib_to_netcdf-2.24.3: /opt/ecmw...>