This is a script for converting the variables (visibility observations and reforecasts, and air temperature reforecasts) in the EUPPBench dataset from netCDF format into a dataframe of records by flattening. This data preparation is for the training set (1997 to 2016).

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import datetime

In [None]:
data_path = "./"
export_path = "./"

In [2]:
# Loading the data in netCDF format
vis_reforecast_obs = xr.open_dataarray(data_path + "vis_reforecast_observations.nc")
vis_reforecast = xr.open_dataarray(data_path + "vis_reforecasts.nc")
t_reforecast = xr.open_dataarray(data_path + "t_reforecasts.nc")



In [3]:
# Data structure
# vis_obs['forecast_period', spot_index', 'forecast_reference_time']
# vis_reforecast['forecast_period', 'realization', 'spot_index', 'forecast_reference_time']
vis_reforecast

In [4]:
# Create arrays for latitude, longitudes, base times, lead times and orecasted/observed values
lat = np.array(vis_reforecast_obs['latitude'])
lon = np.array(vis_reforecast_obs['longitude'])
alt = np.array(vis_reforecast_obs['altitude'])
base= np.array(vis_reforecast_obs['forecast_reference_time'])
lead = np.array(vis_reforecast_obs['forecast_period'])
lead_hr = np.array(lead*10**-9/3600, dtype='int')
vis_reforecast_obs_array = np.array(vis_reforecast_obs)
vis_reforecast_array = np.array(vis_reforecast[:, :, 0:118, :]) # consider first 119 stations only
t_reforecast_array = np.array(t_reforecast[:, :, 0:118, :]) # consider first 119 stations only

In [5]:
# Compute ensemble mean
import time

st = time.time()

vis_reforecast_array_ensemble = vis_reforecast_array
t_reforecast_array_ensemble = t_reforecast_array
for l in range(len(lead)):
    for s in range(len(lon)):
        for b in range(len(base)):
            vis_reforecast_array_ensemble[l,:,s,b] = np.mean(vis_reforecast_array_ensemble[l,:,s,b])
            t_reforecast_array_ensemble[l,:,s,b] = np.mean(t_reforecast_array_ensemble[l,:,s,b])
            
vis_reforecast_array_ensemble = vis_reforecast_array_ensemble[:, 0, 0:118, :]
t_reforecast_array_ensemble = t_reforecast_array_ensemble[:, 0, 0:118, :]

et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 166.7318241596222 seconds


In [6]:
# Check dimensions
vis_reforecast_obs_array.shape, t_reforecast_array_ensemble.shape, vis_reforecast_array_ensemble.shape

((21, 118, 4180), (21, 118, 4180), (21, 118, 4180))

In [7]:
# Create arrays for writting a dataframe of all records
base_df = np.tile(base, len(lat)*len(lead_hr))
lon_df = np.tile(np.repeat(lon, len(base)), len(lead_hr))
lat_df = np.tile(np.repeat(lat, len(base)), len(lead_hr))
alt_df = np.tile(np.repeat(alt, len(base)), len(lead_hr))
lead_df = np.repeat(lead, len(base)*len(lat))
lead_hr_df = np.repeat(lead_hr, len(base)*len(lat))
forecast_time_df = base_df + lead_df
forecast_time_pd = pd.DatetimeIndex(forecast_time_df)
time_in_day = [i.hour for i in forecast_time_pd]
vis_reforecast_obs_df = vis_reforecast_obs_array.flatten()
vis_reforecast_df = vis_reforecast_array_ensemble.flatten()
t_reforecast_df = t_reforecast_array_ensemble.flatten()

In [8]:
# Check dimensions
vis_reforecast_df.shape, vis_reforecast_obs_df.shape

((10358040,), (10358040,))

In [9]:
# Create the dataframe
df = pd.DataFrame({'base': base_df,
                   'lead': lead_df,
                   'lead_hr': lead_hr_df,
                   'forecast_time': forecast_time_df,
                   'time_in_day': time_in_day,
                   'station_lat': lat_df,
                   'station_lon': lon_df,
                   'station_alt': alt_df,
                   't_forecast': t_reforecast_df,
                   'vis_forecast': vis_reforecast_df,
                   'vis_obs': vis_reforecast_obs_df})

df

Unnamed: 0,base,lead,lead_hr,forecast_time,time_in_day,station_lat,station_lon,station_alt,t_forecast,vis_forecast,vis_obs
0,1997-01-02,0 days,0,1997-01-02,0,52.928000,4.781000,1.2,264.695526,9035.876953,12000.0
1,1997-01-05,0 days,0,1997-01-05,0,52.928000,4.781000,1.2,270.306030,18376.275391,25000.0
2,1997-01-09,0 days,0,1997-01-09,0,52.928000,4.781000,1.2,272.648438,18909.019531,4000.0
3,1997-01-12,0 days,0,1997-01-12,0,52.928000,4.781000,1.2,274.975983,21590.419922,3500.0
4,1997-01-16,0 days,0,1997-01-16,0,52.928000,4.781000,1.2,280.201233,20659.673828,3900.0
...,...,...,...,...,...,...,...,...,...,...,...
10358035,2017-12-17,5 days,120,2017-12-22,0,45.786833,3.149333,331.0,274.542358,4150.832031,4862.0
10358036,2017-12-20,5 days,120,2017-12-25,0,45.786833,3.149333,331.0,279.226501,23566.400391,628.0
10358037,2017-12-24,5 days,120,2017-12-29,0,45.786833,3.149333,331.0,268.363068,42204.875000,30056.0
10358038,2017-12-27,5 days,120,2018-01-01,0,45.786833,3.149333,331.0,277.685028,37050.960938,60000.0


In [10]:
# Training period: 1997-2016
df_train = df[df['base']<np.datetime64('2017-01-01')]
df_train.shape

(10097850, 11)

In [11]:
# Dropping extreme observations
df_drop = df_train.drop(df_train[(df_train.vis_obs>100000)|(df_train.vis_forecast>100000)].index)
df_drop.shape

(7461350, 11)

In [12]:
# Save into a csv file
df_drop.to_csv(export_path + "df_training.csv")

In [13]:
# Check the records in the dataframe are matched correctly
i = 127182 # a random entry
print(df.iloc[i])
l = np.asarray(lead==df.iloc[i]['lead']).nonzero()[0]
s = np.asarray(lat==df.iloc[i]['station_lat']).nonzero()[0]
b = np.asarray(base==df.iloc[i]['base']).nonzero()[0]
print(int(lead[l]*10**-9/3600),
      float(lat[s]),
      base[b],
      float(alt[s]),
      float(np.mean(t_reforecast[l,:,s,b])),
      float(np.mean(vis_reforecast[l,:,s,b])),
      float(vis_reforecast_obs[l,s,b]))

base                                 2006-01-11 00:00:00
lead                                     0 days 00:00:00
lead_hr                                                0
forecast_time                        2006-01-11 00:00:00
time_in_day                                            0
station_lat                                      49.2641
station_lon                                       6.6868
station_alt                                        363.0
t_forecast                                     273.06073
vis_forecast                                48664.976562
vis_obs          9969209968386869046778552952102584320.0
Name: 127182, dtype: object
0 49.2641 ['2006-01-11T00:00:00.000000000'] 363.0 273.06072998046875 48664.9765625 9.969209968386869e+36
