In [1]:
%load_ext autoreload
%autoreload 2

import sys

import numpy as np
import pandas as pd
import pycomlink as pycml
import xarray as xr

In [2]:
from pathlib import Path

# Make the notebooks folder of the local OPENSENSE sandbox clone importable
# (adjust the path below to your own clone).
sandbox_notebooks = Path("/home/erlend/Documents/GitHub/OPENSENSE_sandbox/notebooks/")
path_transformer = str(sandbox_notebooks.resolve())
sys.path.append(path_transformer)

In [3]:
import opensense_data_downloader_and_transformer as oddt

In [4]:
# Local folder holding the OpenMRG data (will be created if it does not exist)
local_path = "/home/erlend/offline_data/andersson_2022_OpenMRG/"

# User-specified time window (start, end) of the derived example datasets
start, end = "2015-07-25T12:30", "2015-07-25T15:00"

# Note:
The following code creates the example data used for testing mergeplg. It requires pycomlink and a local clone of the OPENSENSE sandbox (for the data downloader/transformer module) to run.

# Derive a small example dataset from the OpenMRG dataset (SMHI), which includes a large CML dataset
source: https://zenodo.org/record/6673751

In [5]:
# Fetch the OpenMRG archive into `local_path`, printing status messages.
oddt.download_andersson_2022_OpenMRG(
    print_output=True,
    local_path=local_path,
)

File already exists at desired location /home/erlend/offline_data/andersson_2022_OpenMRG/OpenMRG.zip
Not downloading!


In [6]:
# First part of the data: no lower time bound, upper bound 2015-07-15T00:00.
openmrg_zip = local_path + "OpenMRG.zip"  # archive inside the local data folder
first_period = (None, "2015-07-15T00:00")  # (None, None) would mean no time slicing
ds1 = oddt.transform_andersson_2022_OpenMRG(
    fn=openmrg_zip,
    path_to_extract_to=local_path,
    time_start_end=first_period,
    restructure_data=True,
)

  ds_multindex = ds.assign_coords({'sublink':df_metadata.index})


In [7]:
# Second part of the data: lower bound 2015-07-15T00:00, no upper time bound.
openmrg_zip = local_path + "OpenMRG.zip"  # archive inside the local data folder
second_period = ("2015-07-15T00:00", None)  # (None, None) would mean no time slicing
ds2 = oddt.transform_andersson_2022_OpenMRG(
    fn=openmrg_zip,
    path_to_extract_to=local_path,
    time_start_end=second_period,
    restructure_data=True,
)

  ds_multindex = ds.assign_coords({'sublink':df_metadata.index})


In [8]:
# Put both parts on a 1-minute grid by keeping the first non-NaN sample of
# every minute; this potentially dampens the largest noise-driven
# overestimations.
ds1, ds2 = (
    part.resample(time="1min").first(skipna=True) for part in (ds1, ds2)
)

In [9]:
# Join the two parts along time, then drop the time step that occurs in both
# (the overlapping boundary), keeping a single copy of it.
ds_joined = xr.concat([ds1, ds2], dim="time")
ds_cml = ds_joined.drop_duplicates(dim="time")

In [10]:
# Linearly interpolate NaN gaps of at most 5 minutes in the transmitted (tsl)
# and received (rsl) signal levels; longer gaps stay NaN.
for level_name in ("tsl", "rsl"):
    ds_cml[level_name] = ds_cml[level_name].interpolate_na(
        dim="time", method="linear", max_gap="5min"
    )

In [11]:
# Update the global attributes to describe the processing done in this notebook.
# NOTE: "comment" and "contact" are appended to with `+=`, so re-running this
# cell without rebuilding `ds_cml` appends the text a second time.
ds_cml.attrs["file author(s)"] = "Maximilian Graf, Erlend Øydvin and Christian Chwala"
ds_cml.attrs["title"] = "Transformed and resampled OpenMRG-CML"
ds_cml.attrs["comment"] += (
    "\n\nTransformed and resampled dataset: \n"
    # Adjacent string literals are concatenated verbatim, so the trailing space
    # is required to avoid writing "occurringvalue" into the file metadata.
    "rsl and tsl was resampled to 1 minute resolution using the first occurring "
    "value in every minute. "
    "Gaps shorter than 5min was linearly interpolated. "
)
ds_cml.attrs["contact"] += ", erlend.oydvin@nmbu.no"

# CML data

### CML quality control

In [12]:
# Total loss: transmitted signal level minus received signal level
ds_cml["tl"] = ds_cml["tsl"] - ds_cml["rsl"]

In [13]:
# Remove CMLs with strong diurnal cycles. A CML is kept only if, for all of
# its sublinks, the centred rolling std of the total loss over 300 time steps
# exceeds 2 in at most 10 % of the time steps.
tl_std_long = ds_cml["tl"].rolling(time=60 * 5, center=True).std()
frac_high_std = (tl_std_long > 2).mean(dim="time")
cml_passes = (frac_high_std <= 0.1).all(dim="sublink_id")
keep = np.where(cml_passes)[0]  # positional indices along cml_id
ds_cml = ds_cml.isel(cml_id=keep)

In [14]:
# Remove CMLs with very noisy periods. A CML is kept only if, for all of its
# sublinks, the centred rolling std of the total loss over 60 time steps
# exceeds 0.8 in at most 35 % of the time steps.
tl_std_short = ds_cml["tl"].rolling(time=60, center=True).std()
frac_noisy = (tl_std_short > 0.8).mean(dim="time")
cml_passes = (frac_noisy <= 0.35).all(dim="sublink_id")
keep = np.where(cml_passes)[0]  # positional indices along cml_id
ds_cml = ds_cml.isel(cml_id=keep)

### CML rain rates

In [15]:
# The quality-control selection has already been applied to `ds_cml` in the
# cells above. `keep` holds positional indices into the dataset as it was
# *before* the last filter, so calling `ds_cml.isel(cml_id=keep)` once more
# would pick the wrong CMLs (or raise an IndexError) as soon as any CML was
# removed. Only verify here that the filter is in place.
assert ds_cml.sizes["cml_id"] == len(keep), "QC filter was not applied to ds_cml"

In [16]:
# Wet-period detection: a time step is flagged wet when the centred rolling
# std of the total loss (60 time steps) exceeds 1.12 times the 80 % quantile
# (along time) of that rolling std.
roll_std_dev = ds_cml["tl"].rolling(time=60, center=True).std()
threshold = 1.12 * roll_std_dev.quantile(0.8, dim="time")
ds_cml["wet_std"] = roll_std_dev > threshold

# Baseline of the total loss from pycomlink's constant-baseline method,
# driven by the wet flags computed above.
ds_cml["baseline"] = pycml.processing.baseline.baseline_constant(
    trsl=ds_cml["tl"], wet=ds_cml["wet_std"], n_average_last_dry=5
)

  return fnb._ureduce(a,


In [17]:
# Observed attenuation above the baseline. Values that are not >= 0 are set
# to 0 (this includes NaN, because NaN >= 0 evaluates to False).
a_above_baseline = ds_cml["tl"] - ds_cml["baseline"]
ds_cml["A_obs"] = a_above_baseline.where(a_above_baseline >= 0, 0)

# Wet antenna attenuation after Pastorek et al. (2021), using parameters
# that look good for the German, Swedish and Norwegian datasets.
# `frequency` is divided by 1000 to get GHz further down, so * 1e6 gives Hz.
ds_cml["waa"] = pycml.processing.wet_antenna.waa_pastorek_2021_from_A_obs(
    A_obs=ds_cml["A_obs"],
    f_Hz=ds_cml["frequency"] * 1e6,
    pol=ds_cml["polarization"].data,
    L_km=ds_cml["length"] / 1000,
    A_max=6,
    zeta=0.7,  # 0.55 is default
    d=0.15,
)

# Attenuation attributed to rain; negative values are zeroed in place
# (NaN values are left untouched).
ds_cml["A"] = ds_cml["tl"] - ds_cml["baseline"] - ds_cml["waa"]
ds_cml["A"].data[ds_cml["A"] < 0] = 0

# Rain rate from the k-R relation
ds_cml["R"] = pycml.processing.k_R_relation.calc_R_from_A(
    A=ds_cml["A"],
    L_km=ds_cml["length"].astype(float) / 1000,  # convert to km
    f_GHz=ds_cml["frequency"] / 1000,  # convert to GHz
    pol=ds_cml["polarization"],
)

In [49]:
# Keep only R of the first sublink inside the [start, end] window and turn it
# into 5-minute sums (bins closed and labelled on their right edge).
# Dividing the summed values by 60 converts them to a 5-minute amount
# (assumes R is an hourly rate on 1-minute steps — TODO confirm).
# NOTE: with skipna=True a bin that contains only NaN sums to 0.
r_first_sublink = ds_cml[["R"]].isel(sublink_id=0)
r_in_window = r_first_sublink.sel(time=slice(start, end))
r_binned = r_in_window.resample(time="5min", label="right", closed="right")
ds_cml_res = r_binned.sum(skipna=True) / 60

In [50]:
# Write the 5-minute CML rainfall sums to a NetCDF file in the working directory.
cml_outfile = "./openmrg_cml.nc"
ds_cml_res.to_netcdf(cml_outfile)

# Radar data

In [51]:
# Read the radar data (radar.nc is created using the notebook in the data
# folder) and convert it to the OpenSense naming conventions.
radar_raw = xr.open_dataset(local_path + "radar/radar.nc")
radar_renamed = radar_raw.rename({"lat": "latitudes", "lon": "longitudes"})
# Restrict to the [start, end] window and fix the dimension order.
ds_rad = radar_renamed.sel(time=slice(start, end)).transpose("time", "y", "x")

In [52]:
# Apply the Marshall-Palmer relation to get rainfall rates from the
# reflectivity stored in the variable named "data":
# R = (10 ** (dBZ / 10) / 200) ** (5 / 8)
reflectivity_linear = 10 ** (ds_rad["data"] / 10)
ds_rad["rainfall_amount"] = (reflectivity_linear / 200) ** (5 / 8)

In [53]:
# Reverse the y axis of the latitude grid and of the rainfall field so that
# the data works in the grid intersection function.
# NOTE(review): `longitudes` and the `y` coordinate are not reversed here —
# confirm that this is intended.
ds_rad["latitudes"] = (("y", "x"), ds_rad["latitudes"].data[::-1, :])
ds_rad["rainfall_amount"] = (
    ("time", "y", "x"),
    ds_rad["rainfall_amount"].data[:, ::-1, :],
)

# Scale the rainfall rate by 5/60 to get a 5-minute sum
ds_rad["rainfall_amount"] = ds_rad["rainfall_amount"] * (5 / 60)

# Document the processing in the metadata (appended to the existing comment).
ds_rad.attrs["comment"] += (
    "\n dBZ was converted to rainfall [mm/h] using the marshal-palmer equation: "
    "( 10 **(dBZ/10) / 200)**(5/8). "
    " Done by Erlend Øydvin. "
)
ds_rad["rainfall_amount"].attrs["units"] = "sum 5min"

In [54]:
# The reflectivity variable "data" is no longer needed once rainfall is derived.
ds_rad = ds_rad.drop_vars(["data"])

In [55]:
# Write the radar rainfall sums to a NetCDF file in the working directory.
radar_outfile = "./openmrg_rad.nc"
ds_rad.to_netcdf(radar_outfile)

# Municipality gauge data

In [56]:
# Municipality gauges: read the time series and the station metadata from CSV
# and store them in an xarray Dataset (copied from the Graf compare article).
df_gauge = pd.read_csv(
    local_path + "gauges/city/CityGauges-2015JJA.csv",
    index_col=0,
    parse_dates=True,
)
df_gauge_meta = pd.read_csv(local_path + "gauges/city/CityGauges-metadata.csv")

# Drop the timezone information and store the index as datetime64[ns]
naive_index = df_gauge.index.tz_localize(None)
df_gauge.index = naive_index.astype("datetime64[ns]")

# station_id is the row index of the metadata table.
# NOTE(review): assumes the columns of the time-series CSV are in the same
# order as the metadata rows — confirm.
gauge_coords = {
    "station_id": df_gauge_meta.index.to_numpy(),
    "time": df_gauge.index.to_numpy(),
    "lon": (["station_id"], df_gauge_meta["Longitude_DecDeg"]),
    "lat": (["station_id"], df_gauge_meta["Latitude_DecDeg"]),
    "location": (["station_id"], df_gauge_meta["Location"]),
    "type": (["station_id"], df_gauge_meta["Type"]),
    "quantization": (["station_id"], df_gauge_meta["Resolution (mm)"]),
}
gauge_vars = {"rainfall_amount": (["station_id", "time"], df_gauge.T)}
ds_gauges = xr.Dataset(data_vars=gauge_vars, coords=gauge_coords)

# Shorten to the [start, end] window and resample to 5-minute sums
# (bins closed and labelled on their right edge)
gauges_in_window = ds_gauges.sel(time=slice(start, end))
ds_gauges = gauges_in_window.resample(
    time="5min", label="right", closed="right"
).sum()
ds_gauges.to_netcdf("./openmrg_municp_gauge.nc")

# SMHI gauge data

In [57]:
# SMHI gauge: read the time series from CSV (taken from the comparison paper)
df_gauge_smhi = pd.read_csv(
    local_path + "gauges/smhi/GbgA-71420-2015JJA.csv",
    index_col=0,
    parse_dates=True,
)

# Drop the timezone so that to_numpy() can be used instead of .values
# (RUFF complains about .values)
naive_index_smhi = df_gauge_smhi.index.tz_localize(None)
df_gauge_smhi.index = naive_index_smhi.astype("datetime64[ns]")

# Single-station dataset; the station metadata is hard-coded below.
smhi_coords = {
    "station_id": ["SMHI"],
    "time": df_gauge_smhi.index.to_numpy(),
    "lon": (["station_id"], [11.9924]),
    "lat": (["station_id"], [57.7156]),
    "location": (["station_id"], ["Goeteburg A"]),
    "type": (["station_id"], ["15 min rainfall sum"]),
    "quantization": (["station_id"], [0.1]),
}
smhi_vars = {
    "rainfall_amount": (["station_id", "time"], [df_gauge_smhi.Pvol_mm.to_numpy()]),
}
ds_gauges_smhi = xr.Dataset(data_vars=smhi_vars, coords=smhi_coords)

# Restrict to the [start, end] window
ds_gauges_smhi = ds_gauges_smhi.sel(time=slice(start, end))

# From 15-minute sums to 5-minute sums: every new 5-minute step takes the
# value of the next available time step (backfill), which is then split
# evenly by dividing by 3.
smhi_5min = ds_gauges_smhi.resample(time="5min").bfill()
ds_gauges_smhi = smhi_5min / 3

# Save
ds_gauges_smhi.to_netcdf("./openmrg_smhi_gauge.nc")