In [110]:
import datetime
import math
import numpy as np
import netCDF4 as nc
import os
import pandas

# Seed NumPy's global RNG so the np.random.choice draws further down are
# repeatable (only when the cells are executed in order from a fresh kernel).
np.random.seed(42)

In [2]:
def datadir_era5(x):
    """Prefix ``x`` with the ERA5 NumPy-array data root.

    The join is plain string concatenation, so ``x`` should be a relative
    fragment such as ``"cape"`` or ``"cape/file.npy"``; passing ``""``
    returns the root itself (with its trailing slash).
    """
    era5_root = "/projectnb/labci/Lucia/data/era5_npy/"
    return era5_root + x

def datadir_gpm(x):
    """Prefix ``x`` with the GPM satellite data root.

    The join is plain string concatenation; passing ``""`` returns the root
    directory itself (with its trailing slash).
    """
    gpm_root = "/projectnb/labci/Indrajit/Rainfall/data/GPM/SatelliteDataFull/"
    return gpm_root + x

def _load_numpy_arr(file):
    return np.load(file)

def load_numpy_arrays(files):
    """Load every entry of ``files`` and stack the results into one array.

    Each entry is read with ``_load_numpy_arr``; the loaded arrays are
    collected in input order and passed to ``np.array``, so equally shaped
    inputs gain a new leading axis indexed by position in ``files``.
    """
    loaded = [_load_numpy_arr(path) for path in files]
    return np.array(loaded)

## Select location(s) in ERA5 and GPM

In [27]:
# Open one sample file per product only to read its coordinate axes.
era5_sample_nc = nc.Dataset("/projectnb/labci/Lucia/data/era5_nc/cape/convective_available_potential_energy_download_1990-1999.nc")
# NOTE: `allow_pickle` is a numpy.load option and has no meaning for
# netCDF4.Dataset, so it is no longer passed here.
gpm_sample_nc = nc.Dataset(datadir_gpm("") + "IMERG_20000601.nc4")

# `[:]` reads each coordinate variable fully into memory.
lat_era5 = era5_sample_nc["latitude"][:]
lon_era5 = era5_sample_nc["longitude"][:]
lat_gpm = gpm_sample_nc["lat"][:]
lon_gpm = gpm_sample_nc["lon"][:]

#### Lake Volta (approximate)

In [29]:
# Approximate Lake Volta coordinates; both values also appear on the printed
# grids below (7.55 in the GPM latitudes, -0.05 in the ERA5 longitudes).
lat_volta = 7.55
lon_volta = -0.05

#### Capital City, Accra (approximate)

In [30]:
# Approximate coordinates for Accra; both values also appear on the printed
# grids below (5.55 in the GPM latitudes, -0.3 in the ERA5 longitudes).
lat_accra = 5.55
lon_accra = -0.3

#### Random

In [70]:
# Draw one random ERA5 grid point. Note the naming: `x` holds a LATITUDE and
# `y` holds a LONGITUDE (they are passed to mask_lat_lon in that order).
x = np.random.choice(lat_era5)
y = np.random.choice(lon_era5)
print((x, y))

(6.05, -1.3)


In [65]:
# Inspect the GPM coordinate axes to see how they line up with the ERA5 grid.
print(lat_gpm)
print(lon_gpm)

[ 4.3500032  4.4500017  4.55       4.6499987  4.750005   4.8500032
  4.9500017  5.05       5.1499987  5.250005   5.3500032  5.4500017
  5.55       5.6499987  5.750005   5.8500032  5.9500017  6.05
  6.1499987  6.250005   6.3500032  6.4500017  6.55       6.6499987
  6.750005   6.8500032  6.9500017  7.05       7.1499987  7.250005
  7.3500032  7.4500017  7.55       7.6499987  7.750005   7.8500032
  7.9500017  8.05       8.149999   8.250005   8.350003   8.450002
  8.55       8.649999   8.750005   8.850003   8.950002   9.05
  9.149999   9.250005   9.350003   9.450002   9.55       9.649999
  9.750005   9.850003   9.950002  10.05      10.149999  10.250005
 10.350003  10.450002  10.55      10.649999  10.750005  10.850003
 10.950002  11.05      11.149999  11.250005  11.350003  11.450002
 11.55     ]
[-3.750003   -3.649997   -3.549991   -3.45       -3.349994   -3.250003
 -3.149997   -3.049991   -2.95       -2.849994   -2.750003   -2.649997
 -2.549991   -2.45       -2.349994   -2.250003   -2.14999

In [69]:
# Inspect the ERA5 longitude axis for comparison with the GPM longitudes.
print(lon_era5)

[-3.8  -3.55 -3.3  -3.05 -2.8  -2.55 -2.3  -2.05 -1.8  -1.55 -1.3  -1.05
 -0.8  -0.55 -0.3  -0.05  0.2   0.45  0.7   0.95  1.2   1.45  1.7 ]


In [72]:
# Scratch calculation behind the extra GPM longitude tolerance used in
# mask_lat_lon: 1.3 is the magnitude of the ERA5 longitude drawn above and
# 0.000999999999999 is the default tolerance. 1.349994 is presumably the
# magnitude of the neighbouring GPM longitude -- TODO confirm against lon_gpm.
(1.349994 - 1.3) - 0.000999999999999

0.04899400000000087

### Apply Mask

In [73]:
def mask_lat_lon(lat, lon, x, y, tolerance=0.000999999999999, gpm=False,
                 gpm_lon_extra=0.04899400000000087):
    """Build boolean masks selecting the grid cells that match one location.

    Parameters
    ----------
    lat, lon : array-like supporting arithmetic with a scalar
        Latitude and longitude axes of the grid.
    x : float
        Target latitude (compared against ``lat``).
    y : float
        Target longitude (compared against ``lon``).
    tolerance : float
        A coordinate matches when its absolute difference from the target is
        strictly smaller than this value.
    gpm : bool
        When True, the longitude comparison uses ``tolerance + gpm_lon_extra``;
        the latitude comparison is unaffected.
    gpm_lon_extra : float
        Extra longitude slack applied only when ``gpm`` is True. The default
        reproduces the value that was previously hard-coded.

    Returns
    -------
    (mask_lat, mask_lon) : tuple of plain boolean ``ndarray``
        Nothing guarantees exactly one True per mask; a mask may be all False
        or contain several True entries depending on the grid and tolerance.
    """
    # np.asarray drops any MaskedArray wrapper (netCDF4 returns masked
    # arrays), yielding a plain boolean ndarray suitable for indexing.
    mask_lat = np.asarray(np.abs(lat - x) < tolerance, dtype=bool)

    # Only the longitude axis gets the additional slack for the GPM grid.
    lon_tolerance = tolerance + gpm_lon_extra if gpm else tolerance
    mask_lon = np.asarray(np.abs(lon - y) < lon_tolerance, dtype=bool)

    return mask_lat, mask_lon

In [74]:
# Each call returns a (lat_mask, lon_mask) tuple for one location on one grid.
# GPM masks pass gpm=True so the longitude comparison uses the wider tolerance.
volta_mask_gpm = mask_lat_lon(lat_gpm, lon_gpm, lat_volta, lon_volta, gpm=True)
accra_mask_gpm = mask_lat_lon(lat_gpm, lon_gpm, lat_accra, lon_accra, gpm=True)
rand_mask_gpm  = mask_lat_lon(lat_gpm, lon_gpm, x, y, gpm=True)

# ERA5 masks use the default (tight) tolerance on both axes.
volta_mask_era = mask_lat_lon(lat_era5, lon_era5, lat_volta, lon_volta)
accra_mask_era = mask_lat_lon(lat_era5, lon_era5, lat_accra, lon_accra)
rand_mask_era  = mask_lat_lon(lat_era5, lon_era5, x, y)

## Select dates

GPM data runs daily from June 1, 2000 to September 30, 2021. ERA5 data runs every 6 hours from 00:00 on January 1, 1990 to 18:00 on December 31, 2022 (all variables except specific humidity, `q`, which starts at 00:00 on January 1, 2000). For the ERA5 variables, we start by selecting just the 12:00 (noon) observation from the previous day.

In [35]:
# First and last days of the GPM record.
gpm_start = datetime.datetime(2000, 6, 1)
gpm_end = datetime.datetime(2021, 9, 30)

# First day of the ERA5 archives: q (specific_humidity) starts later than
# every other variable.
era_start = datetime.datetime(1990, 1, 1)
q_start = datetime.datetime(2000, 1, 1)

# Predictors are taken from the day before each GPM day, so the first
# predictor day is one day ahead of the GPM start.
_first_predictor_day = gpm_start - datetime.timedelta(days=1)

# Whole-day offsets from each archive's first day; used as slice bounds
# (start inclusive, end exclusive) when the ERA5 series are built.
start_ind = (_first_predictor_day - era_start).days
start_ind_q = (_first_predictor_day - q_start).days

end_ind = (gpm_end - era_start).days
end_ind_q = (gpm_end - q_start).days



## Create ERA5 timeseries

In [123]:
def create_time_series(vars_list, loc_mask, start_from=0, end_on=None, hour=2):
    """Extract a flattened series at one location for each ERA5 variable.

    Parameters
    ----------
    vars_list : iterable of str
        Variable names; each is a sub-directory of the ERA5 data root.
    loc_mask : sequence of two boolean arrays
        ``(lat_mask, lon_mask)`` applied to the last two axes.
    start_from, end_on : int or None
        Slice bounds on the first axis of the stacked array (start inclusive,
        end exclusive). ``end_on=None`` runs to the end.
    hour : int
        Index selected on the second axis (presumably the 6-hourly slot
        within a day, so 2 would be 12:00 -- TODO confirm against the files).

    Returns
    -------
    dict
        Maps each variable name to its flattened 1-D series.
    """
    # A slice with stop=None behaves exactly like an open-ended `start:`.
    window = slice(start_from, end_on)
    lat_mask, lon_mask = loc_mask[0], loc_mask[1]

    series_dict = {}
    for var in vars_list:
        print(f"Creating timeseries for {var}")

        # Full paths share one prefix, so sorting them orders by file name.
        var_files = sorted(
            datadir_era5(var + "/" + fname)
            for fname in os.listdir(datadir_era5(var))
        )

        full_arr = load_numpy_arrays(var_files)
        selected = full_arr[window, hour, lat_mask, lon_mask]
        series_dict[var] = selected.flatten()

    return series_dict

In [124]:
# Split the ERA5 variable directories into q (specific humidity) variables,
# whose archive starts in 2000, and everything else, whose archive starts in
# 1990. The split is done with two comprehensions over an untouched listing:
# removing items from a list while iterating over it skips the element that
# follows each removal, which could leave a q variable in `variables` and
# slice it with the wrong start offset.
# NOTE: the test is a substring match, so any name containing "q" is treated
# as a q variable.
era5_var_names = os.listdir(datadir_era5(""))
q_list = [name for name in era5_var_names if 'q' in name]
variables = [name for name in era5_var_names if 'q' not in name]

series = create_time_series(variables, rand_mask_era, start_from=start_ind, end_on=end_ind, hour=2)
q_series = create_time_series(q_list, rand_mask_era, start_from=start_ind_q, end_on=end_ind_q, hour=2)

# Merge the q series into the main dict (q columns are appended last).
series.update(q_series)

Creating timeseries for u950
Creating timeseries for t700
Creating timeseries for cin
Creating timeseries for tcwv
Creating timeseries for sp
Creating timeseries for w950
Creating timeseries for t950
Creating timeseries for w925
Creating timeseries for kx
Creating timeseries for u600
Creating timeseries for vimd
Creating timeseries for r700
Creating timeseries for w500
Creating timeseries for w300
Creating timeseries for t300
Creating timeseries for t600
Creating timeseries for d2m
Creating timeseries for r950
Creating timeseries for u925
Creating timeseries for w850
Creating timeseries for tcc
Creating timeseries for t925
Creating timeseries for u500
Creating timeseries for u300
Creating timeseries for r300
Creating timeseries for r600
Creating timeseries for w700
Creating timeseries for t500
Creating timeseries for t2m
Creating timeseries for tclw
Creating timeseries for u850
Creating timeseries for r850
Creating timeseries for cape
Creating timeseries for r925
Creating timeseries fo

## Add precipitation data

In [127]:
def _extract_array_from_nc(file, vname, loc):
    """Read variable ``vname`` from one GPM file and select one location.

    Parameters
    ----------
    file : str
        File name relative to the GPM data root.
    vname : str
        Name of the netCDF variable to read.
    loc : sequence of two boolean arrays
        ``(lat_mask, lon_mask)``, as returned by ``mask_lat_lon``.

    Returns
    -------
    ndarray
        ``var[:, lon_mask, lat_mask]``; the first axis is kept in full.
    """
    # Close the file as soon as the values are in memory: this function runs
    # once per GPM file, so unclosed handles would accumulate.
    with nc.Dataset(datadir_gpm("") + file) as nc_obj:
        # getdata returns the raw values whether or not netCDF4 hands back a
        # masked array (masked entries keep their underlying stored value).
        var = np.ma.getdata(nc_obj.variables[vname][:])

    return var[:, loc[1], loc[0]]  # GPM data is lon-lat, not lat-lon


def extract_var_array(files, vname, loc):
    """Extract ``vname`` at ``loc`` from every file and return one flat array.

    Parameters
    ----------
    files : sequence of str
        GPM file names, in the order the values should appear.
    vname : str
        Name of the netCDF variable to read.
    loc : sequence of two boolean arrays
        ``(lat_mask, lon_mask)`` forwarded to ``_extract_array_from_nc``.

    Returns
    -------
    ndarray
        The per-file arrays stacked row-wise in input order, then flattened.

    Raises
    ------
    IndexError
        If ``files`` is empty.
    """
    if len(files) == 0:
        raise IndexError("extract_var_array() requires at least one file")

    # Collect first and stack once; calling np.vstack inside the loop
    # re-copies the growing array on every iteration.
    per_file = [_extract_array_from_nc(fname, vname, loc) for fname in files]

    return np.vstack(per_file).flatten()

In [87]:
# Every entry of the GPM directory, in sorted file-name order.
gpm_files = sorted(os.listdir(datadir_gpm("")))

# Precipitation values at the randomly chosen grid point, one flat array
# covering all files.
precip = extract_var_array(gpm_files, "precipitationCal", rand_mask_gpm)

In [129]:
# Add the raw precipitation values as a column next to the ERA5 predictors.
series["precip"] = precip

### Threshold precipitation values

In [130]:
# Binary label: 1 where precipitation is strictly above 0.2, otherwise 0.
above_cutoff = precip > 0.2
thresh = np.where(above_cutoff, 1, 0)

In [131]:
# Add the thresholded 0/1 precipitation label as its own column.
series["precip_thresh"] = thresh

## Compute & add time variables

In [132]:
# Sine/cosine encoding of the day counter with a fixed 365-day period.
# The counter starts at 1 on the first precipitation sample and ignores leap
# days, so its phase is relative to the start of the series rather than to
# the calendar year.
# The length is taken from `precip` instead of a hard-coded 7792 so the
# columns stay aligned if the date range or file set changes.
n_days = len(precip)
t1 = [math.sin((2 * math.pi * d) / 365) for d in range(1, n_days + 1)]
t2 = [math.cos((2 * math.pi * d) / 365) for d in range(1, n_days + 1)]

In [133]:
# Add the sine (time1) and cosine (time2) day-counter encodings as columns.
series["time1"] = t1
series["time2"] = t2

## Save timeseries

In [134]:
# One column per dict key; all value sequences must have the same length.
series_df = pandas.DataFrame(series)
series_df.head()

Unnamed: 0,u950,t700,cin,tcwv,sp,w950,t950,w925,kx,u600,...,q700,q950,q300,q600,q850,q925,precip,precip_thresh,time1,time2
0,0.982141,282.515948,106.570866,50.653315,99310.332614,-0.086843,297.319761,-0.118163,33.5875,-10.112195,...,0.007735,0.015573,0.000259,0.005371,0.012479,0.015486,0.439947,1,0.017213,0.999852
1,2.256932,281.834625,35.950883,55.837907,99378.690588,0.082913,296.190478,0.153973,37.239126,-13.155744,...,0.009245,0.017023,0.000517,0.005897,0.013292,0.015521,0.030229,0,0.034422,0.999407
2,2.74647,281.726739,166.417792,51.619123,99490.713348,0.021853,296.125981,0.041329,36.237697,-15.313253,...,0.008078,0.015075,0.000481,0.005444,0.010656,0.014376,0.160053,0,0.05162,0.998667
3,2.878268,282.716475,276.681403,48.833767,99307.020113,0.08765,297.372531,0.142656,29.408643,-11.578592,...,0.006635,0.015014,0.000623,0.005471,0.010872,0.014318,0.90023,1,0.068802,0.99763
4,2.42971,282.232161,14.008028,52.871208,99370.459525,-0.096318,297.048874,-0.15922,34.768312,-13.901126,...,0.007795,0.016255,0.000299,0.005528,0.012845,0.016017,7.516011,1,0.085965,0.996298


In [135]:
# Write the assembled frame to CSV with a header row and without the index.
series_df.to_csv("/projectnb/labci/Lucia/data/timeseries/timeseriesGPM+ERA5.csv", header=True, index=False)