In [1]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import timedelta
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from pvlib import location, irradiance, solarposition
from tqdm.notebook import tqdm
import warnings

In [2]:
DATA_DIR = "../../data/"

In [3]:
def monthly_clearsky(
    lat, lon, elev, slope, aspect, start_year=1990, end_year=2022, timezone="Etc/GMT+8"
):
    if "GMT+" in timezone:
        tz_diff = int(timezone.split("GMT+")[-1])
        start_time = f"01-01-{start_year} {tz_diff:02d}:00"
        end_time = f"01-01-{end_year+1} {tz_diff-1:02d}:59"
    elif "GMT-" in timezone:
        tz_diff = int(timezone.split("GMT-")[-1])
        start_time = f"12-31-{start_year - 1} {24-tz_diff:02d}:00"
        end_time = f"01-01-{end_year} {23-tz_diff:02d}:59"
        tz_diff = -1 * tz_diff

    times = pd.date_range(start=start_time, end=end_time, freq="H", tz=timezone)
    site_location = location.Location(lat, lon, altitude=elev, tz=timezone)
    clearsky = site_location.get_clearsky(times)
    solar_position = site_location.get_solarposition(times=times)
    poa_irrad = irradiance.get_total_irradiance(
        surface_tilt=np.arctan(slope),
        surface_azimuth=aspect,
        dni=clearsky["dni"],
        ghi=clearsky["ghi"],
        dhi=clearsky["dhi"],
        solar_zenith=solar_position["apparent_zenith"],
        solar_azimuth=solar_position["azimuth"],
    )
    poa_irrad["CLEARSKY_GHI"] = clearsky["ghi"]
    poa_irrad["CLEARSKY_DNI"] = clearsky["dni"]
    poa_irrad["CLEARSKY_DHI"] = clearsky["dhi"]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        poa_irrad.index = poa_irrad.index.shift(periods=-tz_diff).to_period("M")

    monthly = poa_irrad.groupby(level=0).sum() * 3600

    return monthly

In [4]:
def cloud_cover_correction(clearsky_ghi, cloud_cover, offset=0.35):
    ghi = clearsky_ghi * (offset + (1 - offset) * (1 - cloud_cover))
    return ghi

In [5]:
PLOT_DATA = os.path.join(DATA_DIR, "interim", "plot_info_for_climatena.csv")
plots = (
    pd.read_csv(PLOT_DATA)
    .rename({"ID1": "PLOT_ID", "el": "ELEV"}, axis=1)
    .drop("ID2", axis=1)
)
plots.columns = [col.upper() for col in plots.columns]
plots = plots.set_index("PLOT_ID")
plots.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12156 entries, 60101550679 to 530907572668
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   LAT     12156 non-null  float64
 1   LON     12156 non-null  float64
 2   ELEV    12156 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 379.9 KB


In [6]:
TREE_DATA = os.path.join(DATA_DIR, "interim", "FIA_remeasured_trees_for_training.csv")
trees = pd.read_csv(TREE_DATA)[["PLOT_ID", "SLOPE", "ASPECT"]]
terrain = trees.groupby(by=["PLOT_ID"])[["SLOPE", "ASPECT"]].first()
plots["ASPECT"] = terrain["ASPECT"]
plots["SLOPE"] = terrain["SLOPE"] / 100.0
# some plots had null values for ASPECT and SLOPE
# we will replace them with zeros for modeling sunlight
plots["ASPECT"].fillna(0, inplace=True)
plots["SLOPE"].fillna(0, inplace=True)
plots.head()

Unnamed: 0_level_0,LAT,LON,ELEV,ASPECT,SLOPE
PLOT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
60101550679,41.806228,-123.788726,761,190.0,0.4
60101551744,41.980638,-124.193526,91,32.0,0.7
60101551969,41.681432,-123.803842,701,335.0,0.6
60101552953,41.938125,-123.870868,640,140.0,0.55
60101553315,41.738938,-123.783382,1432,137.0,0.53


In [7]:
def get_monthly_df(plot_id, start_year=1990, end_year=2022, timezone="Etc/GMT+8"):
    lat, lon, elev, slope, aspect = plots.loc[
        plot_id, ["LAT", "LON", "ELEV", "SLOPE", "ASPECT"]
    ]
    monthly_df = monthly_clearsky(
        lat,
        lon,
        elev,
        slope,
        aspect,
        start_year=start_year,
        end_year=end_year,
        timezone=timezone,
    ).reset_index()
    monthly_df = monthly_df.rename({"index": "PERIOD"}, axis=1)
    monthly_df["PLOT_ID"] = plot_id
    monthly_df.set_index(["PLOT_ID", "PERIOD"], inplace=True)
    monthly_df.columns = [col.upper() for col in monthly_df.columns]
    monthly_df = monthly_df / 1e6  # convert from Watt-seconds/m2 per month to MJ/m2 per month

    return monthly_df

def write_monthly_df(plot_id, start_year=1990, end_year=2022, timezone="Etc/GMT+8", overwrite=False):
    outfile = os.path.join(DATA_DIR, "raw", "pvlib", f"{plot_id}.parquet")
    if not os.path.exists(outfile) or overwrite:
        df = get_monthly_df(plot_id, start_year=start_year, end_year=end_year, timezone=timezone)
        df = df.reset_index()
        df["PERIOD"] = df.PERIOD.astype(str)
        df.to_parquet(outfile, index=False)
    
    return

In [8]:
example = get_monthly_df(plots.index[0])
example.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,POA_GLOBAL,POA_DIRECT,POA_DIFFUSE,POA_SKY_DIFFUSE,POA_GROUND_DIFFUSE,CLEARSKY_GHI,CLEARSKY_DNI,CLEARSKY_DHI
PLOT_ID,PERIOD,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
60101550679,1990-01,299.289978,270.301855,28.988123,28.987309,0.000814,295.36929,772.686148,28.987628
60101550679,1990-02,392.845146,354.771914,38.073232,38.072159,0.001073,389.233387,824.865362,38.072579
60101550679,1990-03,608.978489,535.698346,73.280142,73.278473,0.001669,605.655056,1006.264303,73.279281
60101550679,1990-04,755.728981,654.043253,101.685728,101.683651,0.002077,753.554807,1058.426825,101.684772
60101550679,1990-05,911.515517,795.722781,115.792736,115.790227,0.002509,910.333363,1204.009917,115.791503


In [9]:
os.makedirs(os.path.join(DATA_DIR, "raw", "pvlib"), exist_ok=True)

In [10]:
OVERWRITE = False

with tqdm(total=len(plots)) as pbar:
    with ProcessPoolExecutor(48) as executor:
        jobs = [executor.submit(write_monthly_df, int(plot_id), overwrite=OVERWRITE) for plot_id in plots.index]
        for job in as_completed(jobs):
            pbar.update()

  0%|          | 0/12156 [00:00<?, ?it/s]