# ERA5 dataset conversion

See `data/era5/README.md` for a detailed description of the data.

In short, the dataset consists of physical variables sampled on a grid representing the Earth; each variable at each time step is a 2D field that can be treated as an image.

In [6]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import h5py

# Add the parent directory to the module search path (presumably so that
# project-local packages can be imported from this notebook — none are
# imported in this cell).
import sys; sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Root directory that holds the ERA5 HDF5 files used in this notebook.
data_dir = Path('/datasets/openai_hackathon/')

# Open the 2018 files read-only. The handles are not closed here because the
# cells below read slices directly from them.
fields = h5py.File(data_dir.joinpath('data/2018.h5'), mode='r')
precipitations = h5py.File(data_dir.joinpath('precipitation_data/2018.h5'), mode='r')

The variable names below, listed in channel order, are taken from the FourCastNet data-processing script: https://github.com/NVlabs/FourCastNet/blob/master/data_process/parallel_copy_small_set.py

In [8]:
# Channel names, in the order used to index axis 1 of the `fields` array when
# the data is exported (position in this list == channel index).
variables = [
    'u10', 'v10', 't2m', 'sp', 'mslp',
    't850', 'u1000', 'v1000', 'z1000',
    'u850', 'v850', 'z850',
    'u500', 'v500', 'z500', 't500',
    'z50', 'r500', 'r850', 'tcwv', 'sst',
]

# Human-readable description (with units) for every entry of `variables`.
# These strings are stored as the 'description' attribute of each exported
# HDF5 dataset.
#
# The numeric suffix of the u/v/z/t/r names is a pressure level in hPa, not a
# height in metres; only u10, v10 and t2m refer to a height above the surface.
variables_description = {
    # Surface / near-surface variables
    'u10': '10 metre U wind component [m/s]',
    'v10': '10 metre V wind component [m/s]',
    't2m': '2 metre temperature [K]',
    'sp': 'Surface pressure [Pa]',
    'mslp': 'Mean sea level pressure [Pa]',
    # Pressure-level variables
    't850': 'Temperature at 850 hPa [K]',
    'u1000': 'U wind component at 1000 hPa [m/s]',
    'v1000': 'V wind component at 1000 hPa [m/s]',
    'z1000': 'Geopotential at 1000 hPa [m^2/s^2]',
    'u850': 'U wind component at 850 hPa [m/s]',
    'v850': 'V wind component at 850 hPa [m/s]',
    'z850': 'Geopotential at 850 hPa [m^2/s^2]',
    'u500': 'U wind component at 500 hPa [m/s]',
    'v500': 'V wind component at 500 hPa [m/s]',
    'z500': 'Geopotential at 500 hPa [m^2/s^2]',
    't500': 'Temperature at 500 hPa [K]',
    'z50': 'Geopotential at 50 hPa [m^2/s^2]',
    'r500': 'Relative humidity at 500 hPa [%]',
    'r850': 'Relative humidity at 850 hPa [%]',
    # Column-integrated and sea-surface variables
    'tcwv': 'Total column water vapour [kg/m^2]',
    'sst': 'Sea surface temperature [K]',
}

## One week

In [9]:
# Select a 7-day window along the time axis (axis 0).
# The original note places `start` "around Nov 13".
# NOTE(review): assuming index 0 is Jan 1 and 4 time steps per day,
# 1072 // 4 = 268 days into the year, i.e. late September — confirm the
# intended date.
start = 1072
steps_per_day = 4
end = start + 7 * steps_per_day  # 7 days * 4 time steps per day = 28 steps
week = slice(start, end)
data_sliced = fields['fields'][week, :, :, :]
precip_sliced = precipitations['tp'][week, :, :]

In [10]:
# Write the sliced data to one HDF5 file: one dataset per variable name, each
# tagged with its description, plus a 'tp' dataset for the precipitation.
with h5py.File('single_week.h5', mode='w') as out:
    for channel, name in enumerate(variables):
        # Channel `channel` of axis 1 is stored under the matching variable name.
        dset = out.create_dataset(name, data=data_sliced[:, channel, :, :])
        dset.attrs['description'] = variables_description[name]
    # NOTE(review): ERA5 reports total precipitation in metres natively —
    # confirm that this file really holds millimetres.
    tp_dset = out.create_dataset('tp', data=precip_sliced)
    tp_dset.attrs['description'] = 'Total precipitation [mm]'

In [12]:
# Single sample: export exactly one time step to its own file.

# Slice of length 1 along the time axis, so the leading axis is kept.
# NOTE(review): the original note places `start` "around Nov 13"; assuming
# index 0 is Jan 1 and 4 time steps per day, index 1072 is 268 days into the
# year (late September) — confirm the intended date.
start = 1072
end = start + 1  # one time step
single_step = slice(start, end)
data_sliced = fields['fields'][single_step, :, :, :]
precip_sliced = precipitations['tp'][single_step, :, :]


# Write the sample to one HDF5 file: one dataset per variable name, each
# tagged with its description, plus a 'tp' dataset for the precipitation.
with h5py.File('sample.h5', mode='w') as out:
    for channel, name in enumerate(variables):
        dset = out.create_dataset(name, data=data_sliced[:, channel, :, :])
        dset.attrs['description'] = variables_description[name]
    # NOTE(review): ERA5 reports total precipitation in metres natively —
    # confirm that this file really holds millimetres.
    tp_dset = out.create_dataset('tp', data=precip_sliced)
    tp_dset.attrs['description'] = 'Total precipitation [mm]'