## Import libraries


In [None]:
import argparse
import concurrent.futures
import json
import os
import warnings
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime, timedelta
from glob import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

# Install a blanket "ignore" filter so no warnings are shown for the rest of
# the session.
warnings.filterwarnings("ignore")

# Absolute project root; every data path in the cells below is built from it.
WORK_DIR = "/beegfs/halder/GITHUB/RESEARCH/crop-yield-forecasting-germany/"
# Make the project root the current working directory.
os.chdir(WORK_DIR)

In [None]:
# Parameters
# Crop identifier: selects the data/processed/<CROP>/ directory and the
# <CROP>_static.csv / <CROP>_yield.csv files read further down.
CROP = "winter_wheat"
# Frequency string handed to DataFrame.resample() when aggregating the
# per-region timeseries ("7D" = 7-day bins).
INTERVAL = "7D"

## Calculate scalers


### Scaler for timeseries


In [None]:
# Load the train/test/val assignment and derive, for every (NUTS_ID, year)
# row, the name of the parquet file that holds its timeseries.
split_df = pd.read_csv(
    os.path.join(WORK_DIR, "data", "processed", CROP, "train_test_val_split.csv")
)
split_df["file_name"] = (
    split_df["NUTS_ID"] + "_" + split_df["year"].astype(str) + ".parquet"
)

# Sort the glob result so the file order (and therefore the preview below)
# does not depend on filesystem listing order.
climate_file_paths = sorted(
    glob(os.path.join(WORK_DIR, "data", "processed", CROP, "timeseries", "*.parquet"))
)

# Build the set of training file names once; membership tests are then O(1)
# instead of re-filtering split_df for every path.
train_file_names = set(split_df.loc[split_df["split"] == "train", "file_name"])
train_climate_file_paths = [
    path for path in climate_file_paths if os.path.basename(path) in train_file_names
]

# Number of resampled steps kept per file.
# NOTE(review): presumably the fixed sequence length used downstream - confirm.
MAX_TIMESTEPS = 57

# Collect the per-file frames in a list and concatenate once at the end;
# calling pd.concat inside the loop recopies all accumulated rows each time.
frames = []
for path in tqdm(train_climate_file_paths):
    # Aggregate the raw series into INTERVAL-sized bins (mean per bin).
    df = pd.read_parquet(path).set_index("date").resample(INTERVAL).mean().reset_index()
    frames.append(df.iloc[:MAX_TIMESTEPS])

# pd.concat raises on an empty list, so fall back to an empty frame when no
# training files were found.
timeseries_df = (
    pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
)

print(timeseries_df.shape)
timeseries_df.head()

In [None]:
# Feature groups that make up the timeseries model input.
remote_sensing_features = ["ndvi", "evi", "fpar", "lai"]
climate_features = [
    "sun_dur", "soil_moist", "soil_temp", "et0", "vpd", "cwb",
    "tmin", "tmax", "tavg", "prec", "rad",
]

# Keep only the model features, remote sensing first, then climate; the
# printed statistics follow this column order.
timeseries_columns = [*remote_sensing_features, *climate_features]
timeseries_df = timeseries_df[timeseries_columns]

# Print plain decimals instead of scientific notation.
np.set_printoptions(suppress=True)

# Per-feature mean and standard deviation over all training rows.
timeseries_mean = timeseries_df.mean(axis=0, numeric_only=True)
timeseries_std = timeseries_df.std(axis=0, numeric_only=True)
print("Timeseries mean:", timeseries_mean.values.round(3))
print("Timeseries std:", timeseries_std.values.round(3))

### Scaler for static


In [None]:
# Static (time-invariant) feature groups.
soil_features = [
    "soil_quality_mean",
    "soil_quality_stdDev",
]
topo_features = [
    "elevation_mean",
    "elevation_stdDev",
    "slope_mean",
    "slope_stdDev",
]
irrigation_features = ["irrigated_fraction"]

# Read the static table and keep only the model features, in the order
# soil -> topography -> irrigation.
# NOTE(review): unlike the timeseries and yield cells, no train-split filter
# is applied here, so these statistics cover every row in the file - confirm
# this is intended.
static_file_path = os.path.join(WORK_DIR, "data", "processed", CROP, f"{CROP}_static.csv")
static_columns = [*soil_features, *topo_features, *irrigation_features]
static_df = pd.read_csv(static_file_path)[static_columns]

print(static_df.shape)
static_df.head()

In [None]:
# Per-feature mean and standard deviation of the static features.
static_mean = static_df.mean(axis=0, numeric_only=True)
static_std = static_df.std(axis=0, numeric_only=True)
print("Static mean:", static_mean.values.round(3))
print("Static std:", static_std.values.round(3))

### Scaler for yield


In [None]:
# Load the yield table for the selected crop.
yield_csv_path = os.path.join(WORK_DIR, "data", "processed", CROP, f"{CROP}_yield.csv")
yield_df = pd.read_csv(yield_csv_path)

# Attach the split label to each (NUTS_ID, year) row; the left join keeps
# every yield row even when it has no entry in split_df.
split_labels = split_df[["NUTS_ID", "year", "split"]]
yield_df = yield_df.merge(split_labels, on=["NUTS_ID", "year"], how="left")

# Keep only the training rows for the scaler statistics.
is_train = yield_df["split"] == "train"
yield_df = yield_df[is_train]

print(yield_df.shape)
yield_df.head()

In [None]:
# Mean and standard deviation of the training targets, rounded to 3 decimals.
yield_values = yield_df["yield"]
residual_values = yield_df["residual"]

print("Yield mean:", yield_values.mean().round(3))
print("Yield std:", yield_values.std().round(3))

print("Residual mean:", residual_values.mean().round(3))
print("Residual std:", residual_values.std().round(3))