In [1]:
import pandas as pd
import numpy as np
import jax.numpy as jnp
from jax.lax import scan
from functools import partial

In [2]:
# Monthly climate records: read the CSV, then key every row by plot and period.
CLIM_DATA = "../data/interim/climatena_1990-2020_monthly_long.csv"
clim = pd.read_csv(CLIM_DATA)
clim = clim.set_index(["PLOT_ID", "PERIOD"])
clim.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4522032 entries, (60101550679, '1990-01') to (530907572668, '2020-12')
Data columns (total 15 columns):
 #   Column  Dtype  
---  ------  -----  
 0   TMAX    float64
 1   TMIN    float64
 2   TAVE    float64
 3   PPT     int64  
 4   RAD     float64
 5   DD_0_   int64  
 6   DD5_    int64  
 7   DD_18_  int64  
 8   DD18_   int64  
 9   NFFD    int64  
 10  PAS     int64  
 11  EREF    int64  
 12  CMD     int64  
 13  RH      int64  
 14  CMI     float64
dtypes: float64(5), int64(10)
memory usage: 535.1+ MB


In [3]:
# Soil-moisture table keyed by plot and period; PLOT_ID is cast to pandas'
# nullable "Int64" dtype before it becomes part of the index.
# NOTE(review): the path has no file extension — confirm this is the intended file.
# NOTE(review): the recorded output shows a single-level Int64Index, which does
# not match a two-column set_index; the output is probably stale. `moisture` is
# also not referenced by any later cell visible here — verify it is still needed.
SOIL_MOISTURE = "../data/interim/monthly_soil_moisture_for_plots"
moisture = (
    pd.read_csv(SOIL_MOISTURE)
    .astype({"PLOT_ID": "Int64"})
    .set_index(["PLOT_ID", "PERIOD"])
)
moisture.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12027 entries, 60101550679 to 530907572668
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   SMI_MIN   12027 non-null  Int64
 1   SMI_CRIT  12027 non-null  Int64
 2   SMI_MAX   12027 non-null  Int64
dtypes: Int64(3)
memory usage: 411.1 KB


In [7]:
# Monthly irradiance table, indexed by (PLOT_ID, PERIOD) so it aligns with `clim`.
SUN_DATA = "../data/interim/monthly_irradiance_for_plots.csv"
sun = pd.read_csv(SUN_DATA).set_index(["PLOT_ID", "PERIOD"])
sun.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4667904 entries, (60101572041, '1990-01') to (530907572668, '2021-12')
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   POA_GLOBAL          float64
 1   POA_DIRECT          float64
 2   POA_DIFFUSE         float64
 3   POA_SKY_DIFFUSE     float64
 4   POA_GROUND_DIFFUSE  float64
 5   CLEARSKY_GHI        float64
 6   CLEARSKY_DNI        float64
 7   CLEARSKY_DHI        float64
dtypes: float64(8)
memory usage: 303.1+ MB


In [8]:
# Per-plot linear cloud-cover coefficients (one row per PLOT_ID).
CLOUD_COEFS = "../data/interim/cloud_linear_coefs.csv"
cloud_coefs = pd.read_csv(CLOUD_COEFS)
cloud_coefs = cloud_coefs.set_index("PLOT_ID")
cloud_coefs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12156 entries, 60101550679 to 530907572668
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   INTERCEPT                   12156 non-null  float64
 1   bRH                         12156 non-null  float64
 2   bELEV                       12156 non-null  float64
 3   RMSE                        12156 non-null  float64
 4   ANNUAL_CLOUDY_GHI_PCT_DIFF  12156 non-null  float64
dtypes: float64(5)
memory usage: 569.8 KB


In [9]:
def pred_cloud_cover(
    intercept, slope_rh, slope_elev, relative_humidity, elevation, clip_clouds=False
):
    """Predict cloud cover from a linear model in relative humidity and elevation.

    The prediction is
    ``intercept + slope_rh * relative_humidity + slope_elev * elevation``.
    Inputs may be scalars or anything that supports element-wise arithmetic
    (e.g. numpy arrays or pandas Series).

    Parameters
    ----------
    intercept, slope_rh, slope_elev
        Coefficients of the linear model.
    relative_humidity, elevation
        Predictor values.
    clip_clouds : bool, default False
        When true, the prediction is limited to the interval [0, 1] via
        ``np.clip``; otherwise the raw linear prediction is returned.

    Returns
    -------
    The (optionally clipped) linear prediction.
    """
    linear_pred = intercept + slope_rh * relative_humidity + slope_elev * elevation
    if not clip_clouds:
        return linear_pred
    return np.clip(linear_pred, 0, 1)


def cloud_cover_correction(clearsky_ghi, cloud_cover, offset=0.35):
    """Scale a clear-sky irradiance value by a linear function of cloud cover.

    The multiplier is ``offset + (1 - offset) * (1 - cloud_cover)``: it equals
    1 when ``cloud_cover`` is 0 and equals ``offset`` when ``cloud_cover`` is 1.

    Parameters
    ----------
    clearsky_ghi
        Irradiance value(s) to be scaled.
    cloud_cover
        Cloud cover value(s); no range check is performed here.
    offset : float, default 0.35
        Multiplier applied when ``cloud_cover`` is 1.

    Returns
    -------
    ``clearsky_ghi`` multiplied by the cloud-dependent factor.
    """
    scaling = offset + (1 - offset) * (1 - cloud_cover)
    return clearsky_ghi * scaling


def pred_cloudy_ghi(
    intercept,
    slope_rh,
    slope_elev,
    relative_humidity,
    elevation,
    clearsky_ghi,
    clip_clouds=False,
):
    """Predict cloud-adjusted irradiance from clear-sky irradiance.

    Cloud cover is first predicted with ``pred_cloud_cover`` and the result is
    then used to scale ``clearsky_ghi`` through ``cloud_cover_correction``
    (called with its default ``offset``).

    Parameters
    ----------
    intercept, slope_rh, slope_elev, relative_humidity, elevation
        Forwarded unchanged to ``pred_cloud_cover``.
    clearsky_ghi
        Clear-sky irradiance value(s) to be corrected.
    clip_clouds : bool, default False
        Forwarded to ``pred_cloud_cover``.

    Returns
    -------
    The value returned by ``cloud_cover_correction``.
    """
    predicted_cover = pred_cloud_cover(
        intercept=intercept,
        slope_rh=slope_rh,
        slope_elev=slope_elev,
        relative_humidity=relative_humidity,
        elevation=elevation,
        clip_clouds=clip_clouds,
    )
    return cloud_cover_correction(
        clearsky_ghi=clearsky_ghi, cloud_cover=predicted_cover
    )

In [10]:
# Plot metadata: rename the ID and elevation columns, drop the secondary ID,
# index by PLOT_ID, and upper-case whatever column names remain.
PLOT_INFO = "../data/interim/plot_info_for_climatena.csv"
plots = (
    pd.read_csv(PLOT_INFO)
    .rename(columns={"ID1": "PLOT_ID", "el": "ELEV"})
    .drop(columns=["ID2"])
    .set_index("PLOT_ID")
    .rename(columns=str.upper)
)
plots.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12156 entries, 60101550679 to 530907572668
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LAT     12156 non-null  float64
 1   LON     12156 non-null  float64
 2   ELEV    12156 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 379.9 KB


In [11]:
# Join the per-plot coefficients with monthly RH, plot elevation and POA_GLOBAL
# (inner joins on the shared index), then derive the clipped cloud cover and the
# cloud-adjusted radiation. `assign` evaluates its keywords in order, so
# SOLAR_RADIATION can read the CLOUD_COVER column created just before it.
cloud_data = (
    cloud_coefs[["INTERCEPT", "bRH", "bELEV"]]
    .merge(clim["RH"], how="inner", left_index=True, right_index=True)
    .merge(plots["ELEV"], how="inner", left_index=True, right_index=True)
    .merge(sun["POA_GLOBAL"], how="inner", left_index=True, right_index=True)
    .assign(
        CLOUD_COVER=lambda df: pred_cloud_cover(
            df["INTERCEPT"],
            df["bRH"],
            df["bELEV"],
            df["RH"],
            df["ELEV"],
            clip_clouds=True,
        ),
        SOLAR_RADIATION=lambda df: cloud_cover_correction(
            df["POA_GLOBAL"], df["CLOUD_COVER"]
        ),
    )
)

In [12]:
# Left-join every driver table onto the climate records so that each
# (PLOT_ID, PERIOD) row of `clim` is kept.
# NOTE(review): `smi_pred` is not defined in any cell visible here, and the
# execution counts skip from 3 to 7 and from 12 to 18. Confirm the cell that
# builds it still exists; otherwise Restart & Run All fails with a NameError.
merged = clim.merge(sun, left_index=True, right_index=True, how="left")
merged = merged.merge(
    cloud_data[["CLOUD_COVER", "SOLAR_RADIATION"]],
    left_index=True,
    right_index=True,
    how="left",
)
merged = merged.merge(smi_pred, left_index=True, right_index=True, how="left")
merged.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4522032 entries, (60101550679, '1990-01') to (530907572668, '2020-12')
Data columns (total 26 columns):
 #   Column              Dtype  
---  ------              -----  
 0   TMAX                float64
 1   TMIN                float64
 2   TAVE                float64
 3   PPT                 int64  
 4   RAD                 float64
 5   DD_0_               int64  
 6   DD5_                int64  
 7   DD_18_              int64  
 8   DD18_               int64  
 9   NFFD                int64  
 10  PAS                 int64  
 11  EREF                int64  
 12  CMD                 int64  
 13  RH                  int64  
 14  CMI                 float64
 15  POA_GLOBAL          float64
 16  POA_DIRECT          float64
 17  POA_DIFFUSE         float64
 18  POA_SKY_DIFFUSE     float64
 19  POA_GROUND_DIFFUSE  float64
 20  CLEARSKY_GHI        float64
 21  CLEARSKY_DNI        float64
 22  CLEARSKY_DHI        float64
 23  CLOUD_COVER     

In [18]:
# Per-column count of cells equal to -9999.
# NOTE(review): -9999 is presumably a missing-data sentinel — confirm against the data source.
merged.eq(-9999).sum()

TMAX                        0
TMIN                        0
TAVE                        0
PPT                         0
RAD                   1508868
DD_0_                       0
DD5_                        0
DD_18_                      0
DD18_                       0
NFFD                        0
PAS                         0
EREF                        0
CMD                         0
RH                          0
CMI                         0
POA_GLOBAL                  0
POA_DIRECT                  0
POA_DIFFUSE                 0
POA_SKY_DIFFUSE             0
POA_GROUND_DIFFUSE          0
CLEARSKY_GHI                0
CLEARSKY_DNI                0
CLEARSKY_DHI                0
CLOUD_COVER                 0
SOLAR_RADIATION             0
SOIL_MOISTURE               0
dtype: int64

In [19]:
# Blank out RAD wherever it holds -9999; `mask` writes NaN where the condition is true.
merged["RAD"] = merged["RAD"].mask(merged["RAD"] == -9999)

In [20]:
# Per-column count of missing values after the RAD clean-up.
merged.isna().sum(axis=0)

TMAX                        0
TMIN                        0
TAVE                        0
PPT                         0
RAD                   1508868
DD_0_                       0
DD5_                        0
DD_18_                      0
DD18_                       0
NFFD                        0
PAS                         0
EREF                        0
CMD                         0
RH                          0
CMI                         0
POA_GLOBAL                  0
POA_DIRECT                  0
POA_DIFFUSE                 0
POA_SKY_DIFFUSE             0
POA_GROUND_DIFFUSE          0
CLEARSKY_GHI                0
CLEARSKY_DNI                0
CLEARSKY_DHI                0
CLOUD_COVER                 0
SOLAR_RADIATION             0
SOIL_MOISTURE               0
dtype: int64

In [21]:
# Persist the merged table; the index levels are written as leading columns
# and the column names as a header row.
merged.to_csv(
    path_or_buf="../data/interim/monthly_climatic_drivers_for_plots.csv",
    header=True,
    index=True,
)