In [1]:
import urllib.request
import xarray as xr
import utils
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from urllib.error import HTTPError
from scipy import interpolate
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Zero-based index of this notebook among the parallel copies; it selects which
# contiguous slice of the dataframe this notebook processes.
file_num = 0
# Number of notebooks run in parallel (each one uses a different file_num).
n_notebooks = 15

MODE = 'train'
VARNAME = 't_rh'
PATH = f'E:/ERA5_{VARNAME}_aggregated_3h'

# Single MSWEP time slice; it is passed to utils.interp_era5_to_mswep in the loop
# below (presumably as the target grid -- confirm in utils).
sample_mswep = xr.open_dataset('D:/sample_mswep.nc').isel(time=0)
# NOTE(review): presumably a 500 km radius expressed in 0.1-degree grid cells -- confirm in utils.
box_radius = utils.convert_box_radius(500, res=0.1)

# Only do a fraction of the df per notebook, as several notebooks are run in
# parallel to accelerate the process.
df = pd.read_csv(f'../data/{MODE}_2001_all/ibtracs_2001_{MODE}.csv')
step = len(df) // n_notebooks  # rows per notebook (rounded down)
start = step * file_num
# The last notebook also takes the len(df) % n_notebooks leftover rows; with a
# plain `step * (file_num + 1)` those rows would never be processed by anyone.
end = len(df) if file_num == n_notebooks - 1 else step * (file_num + 1)
df = df[start:end]


# For every row of `df`, open the monthly ERA5 file, extract the temperature ('t')
# and relative humidity ('r') fields for the row's timestep, interpolate them with
# utils.interp_era5_to_mswep and crop a box around the TC centre.
#
# NOTE(review): the recorded run of this cell stopped with
# `TypeError: Index.get_loc() got an unexpected keyword argument 'method'`.
# The `method` argument of pandas' Index.get_loc was removed in pandas 2.0, so one
# of the `utils` helpers called below presumably still uses it and needs updating
# (e.g. to Index.get_indexer([...], method=...)) -- confirm in utils.
new_df_as_list = []
done = 0
nans_found = 0  # num of rows whose cropped t or rh field contained NaNs (filled with 0)
timesteps_not_found = 0 # num of files not found because the timestep was not a multiple of 3h
had_inf_bias = 0 # num of files that had infinite bias between ERA5 and MSWEP, likely because the TC was misplaced/absent in ERA5
wrong_shapes = 0 # num of files that had the wrong shape, likely because they were too close to lon=180
was_empty_era5 = 0 # num of files that had no rainfall
was_empty_mswep = 0 # num of files that had no rainfall
for _row_label, row in tqdm(df.iterrows(), total=df.shape[0]):
    TC_lat = row.LAT
    TC_lon = row.LON
    # wrap longitudes at/after the antimeridian back into the negative range
    if TC_lon > 179.9:
        TC_lon -= 360
    TC_wind = row.WMO_WIND
    # identifier used for this sample (also used in the output file names below)
    idx = row.IDX_TRUE
    # shift the IBTrACS time by 1h30 before converting it to a 3-hourly timestamp
    iso_time = utils.add_to_iso_time(row.ISO_TIME, hours_delta=1, minutes_delta=30)
    year, month, day, hour = utils.convert_iso_time_3h(iso_time)

    with xr.open_dataset(f'{PATH}/ERA5_{VARNAME}_{year}_{month}.nc') as era5_ds:
        # select current timestep in ERA5
        try:
            timestep_slice_t = utils.slice_ds_timestep(era5_ds, iso_time, 't')
            timestep_slice_rh = utils.slice_ds_timestep(era5_ds, iso_time, 'r')
        except IndexError:
            # timestep not present in the ERA5 file: count it and skip this row
            timesteps_not_found += 1
            continue
        era5_t_interp_mswep = utils.interp_era5_to_mswep(timestep_slice_t, sample_mswep)
        era5_rh_interp_mswep = utils.interp_era5_to_mswep(timestep_slice_rh, sample_mswep)
        # get domain lats and lons (used to crop data around TC centre)
        domain_lats = era5_t_interp_mswep['latitude'].values
        domain_lons = era5_t_interp_mswep['longitude'].values
        # find the id inside of domain_lats/domain_lons that corresponds to the TC centre
        TC_lat_id = utils.find_id(TC_lat, domain_lats)
        TC_lon_id = utils.find_id(TC_lon, domain_lons)
        # box ERA5 around IBTrACS TC centre
        boxed_era5_t = utils.box_var(era5_t_interp_mswep, TC_lat_id, TC_lon_id, box_radius, domain_lats, domain_lons, idx)
        boxed_era5_rh = utils.box_var(era5_rh_interp_mswep, TC_lat_id, TC_lon_id, box_radius, domain_lats, domain_lons, idx)
        if np.isnan(boxed_era5_t).any() or np.isnan(boxed_era5_rh).any():
            nans_found += 1
            # replace NaNs by 0 in each field; rh must be filled from rh itself
            # (it was previously overwritten with the temperature field)
            boxed_era5_t = boxed_era5_t.fillna(0)
            boxed_era5_rh = boxed_era5_rh.fillna(0)

    # save ERA5 and MSWEP snapshots
    #boxed_era5_t.to_netcdf(f'data/{MODE}_2001/ERA5_t/ERA5_t_cropped_{idx}.nc')
    #boxed_era5_rh.to_netcdf(f'data/{MODE}_2001/ERA5_rh/ERA5_rh_cropped_{idx}.nc')
    # update count of saved files
    done +=1

  0%|          | 0/5391 [00:08<?, ?it/s]


TypeError: Index.get_loc() got an unexpected keyword argument 'method'

In [10]:
# Display how many rows the processing loop skipped because slicing the ERA5
# dataset at the row's timestep raised IndexError.
timesteps_not_found

0