# Benchmark Models

In [1]:
%load_ext lab_black

In [2]:
import time
import os

import numpy as np
import pandas as pd
import random
import pyarrow
import concurrent.futures

from pyhere import here
from datetime import date
from itertools import product, combinations
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr, t
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

from task_modeling_utils import *

In [3]:
# variable_groups = ["ndvi"]
# hot_encode = False
# anomaly = True
# index_cols = ["year", "district", "yield_mt"]
# year_start = 2016
# n_splits = 5
# split = 0
# random_state = 42
# return_oos_predictions = True


# variable_groups_str = "_".join(variable_groups)

# #########################################     READ DATA    #########################################
# data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
# data = data.dropna()

# #########################################     FILTER DATA    #########################################
# keep_cols = []

# for var in variable_groups:
#     tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
#     keep_cols.append(tmp)

# keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]
# data = data.loc[:, keep_cols]
# data = data[data.year >= year_start]

# data["log_yield"] = np.log10(data["yield_mt"] + 1)

# data["demean_log_yield"] = data.log_yield - data.groupby(
#     "district"
# ).log_yield.transform("mean")

# index_cols.append("log_yield")
# index_cols.append("demean_log_yield")

# #########################################    MAKE A COPY    #########################################
# crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)

# #########################################     CALCULATE ANOMALY   #########################################
# if anomaly:
#     data.set_index(["year", "district"], inplace=True)
#     var_cols = data.columns
#     data = data[var_cols] - data.groupby(["district"], as_index=True)[
#         var_cols
#     ].transform("mean")
# else:
#     pass

# data.reset_index(drop=False, inplace=True)

# #########################################    HOT ENCODE    #########################################
# if hot_encode:
#     index_cols.remove("district")
#     data = pd.get_dummies(data, columns=["district"], drop_first=False)
# else:
#     pass

# #########################################     K-FOLD SPLIT    #########################################
# x_all = data.drop(index_cols, axis=1)
# if anomaly:
#     y_all = data.demean_log_yield
# else:
#     y_all = data.log_yield
# x_train, x_test, y_train, y_test = train_test_split(
#     x_all, y_all, test_size=0.2, random_state=random_state
# )
# kfold = KFold(n_splits=n_splits)

# #########################################    STANDARDIZE FEATURES    #########################################
# scaler = StandardScaler().fit(x_all)
# x_train = pd.DataFrame(
#     scaler.transform(x_train), columns=x_train.columns, index=x_train.index
# )
# x_test = pd.DataFrame(
#     scaler.transform(x_test), columns=x_test.columns, index=x_test.index
# )

# #########################################     K-FOLD CV    #########################################
# ### SETUP
# tic = time.time()
# alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

# ### LAMBDA INDICIES
# i = 0
# start = [i]
# end = [x_train.shape[1]]

# for var in variable_groups:
#     i += 12
#     start.append(i)
#     end.append(i)
# start.sort()
# end.sort()

# if not hot_encode:
#     start = start[0:-1]
#     end = end[0:-1]

# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
# best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
#     X=x_train,
#     y=y_train,
#     grid=alphas.get("alpha"),
#     n_splits=n_splits,
#     start=start,
#     end=end,
#     static_lam=1,
#     verbose=0,
#     show_linalg_warning=False,
#     fit_model_after_tuning=True,
# )
# ### PREDICT WITH BEST HYPERPARAMETER(S)
# val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
# train_predictions = best_model.predict(x_train)
# test_predictions = best_model.predict(x_test)

# if anomaly:
#     pass
# else:
#     val_predictions = np.maximum(val_predictions, 0)
#     train_predictions = np.maximum(train_predictions, 0)
#     test_predictions = np.maximum(test_predictions, 0)

# #########################################     DE-MEAN TRAIN R2    #########################################
# fold_list = []
# for i in range(n_splits):
#     idx = len(list(kfold.split(y_train))[i][1])
#     fold = np.repeat(i + 1, idx).tolist()
#     fold_list.append(fold)
# fold_list = [item for sublist in fold_list for item in sublist]

# train_split = pd.DataFrame(
#     np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
# )
# train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
# train_split["oos_prediction"] = val_predictions
# train_split["val_fold"] = fold_list

# #########################################     DE-MEAN TEST R2    #########################################
# test_split = pd.DataFrame(
#     {"data_fold": np.repeat("test", len(x_test))}, index=x_test.index
# )
# test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# test_split["oos_prediction"] = test_predictions
# test_split["val_fold"] = n_splits + 1

# #########################################     OUT OF SAMPLE PREDICTIONS    #########################################
# oos_preds = pd.concat([train_split, test_split])
# oos_preds[["split", "random_state"]] = split, random_state
# oos_preds["variables"] = variable_groups_str
# oos_preds["anomaly"] = anomaly
# oos_preds["hot_encode"] = hot_encode
# oos_preds["year_start"] = year_start
# oos_preds["demean_oos_prediction"] = oos_preds.oos_prediction - oos_preds.groupby(
#     "district"
# ).oos_prediction.transform("mean")

# #########################################     SCORES    #########################################
# val_R2 = r2_score(y_train, val_predictions)
# val_r = pearsonr(val_predictions, y_train)[0]
# train_R2 = r2_score(y_train, train_predictions)
# train_r = pearsonr(train_predictions, y_train)[0]
# test_R2 = r2_score(y_test, test_predictions)
# test_r = pearsonr(test_predictions, y_test)[0]

# if anomaly:
#     demean_cv_R2 = np.nan
#     demean_cv_r = np.nan
#     demean_test_R2 = np.nan
#     demean_test_r = np.nan
# else:
#     test = oos_preds[oos_preds.data_fold == "test"]
#     train = oos_preds[oos_preds.data_fold == "train"]
#     demean_cv_R2 = r2_score(train.demean_log_yield, train.demean_oos_prediction)
#     demean_cv_r = pearsonr(train.demean_log_yield, train.demean_oos_prediction)[0]
#     demean_test_R2 = r2_score(test.demean_log_yield, test.demean_oos_prediction)
#     demean_test_r = pearsonr(test.demean_log_yield, test.demean_oos_prediction)[0]

# d = {
#     "split": split,
#     "random_state": random_state,
#     "variables": "_".join(variable_groups),
#     "year_start": year_start,
#     "hot_encode": hot_encode,
#     "anomaly": anomaly,
#     "total_n": len(x_all),
#     "train_n": len(x_train),
#     "test_n": len(x_test),
#     "best_reg_param": best_lambdas,
#     "mean_of_val_R2": best_scores,
#     "val_R2": val_R2,
#     "val_r": val_r,
#     "val_r2": val_r**2,
#     "train_R2": train_R2,
#     "train_r": train_r,
#     "train_r2": train_r**2,
#     "test_R2": test_R2,
#     "test_r": test_r,
#     "test_r2": test_r**2,
#     "demean_cv_R2": demean_cv_R2,
#     "demean_cv_r": demean_cv_r,
#     "demean_cv_r2": demean_cv_r**2,
#     "demean_test_R2": demean_test_R2,
#     "demean_test_r": demean_test_r,
#     "demean_test_r2": demean_test_r**2,
# }
# # if return_oos_predictions:
# #     return d, oos_preds
# # else:
# #     return d

In [4]:
variables = ["pre", "tmp", "ndvi"]
climate_vars = list(combinations(variables, 2))
climate_vars = (
    climate_vars + [[variables[i]] for i in range(len(variables))] + [variables]
)
climate_vars = [list(elem) for elem in climate_vars]
climate_vars.sort(key=len)
climate_vars

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [5]:
import itertools

n_splits = 10
random.seed(42)
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

he_anom_combinations = [(True, False), (False, False), (False, True)]

kwarg_list = [
    {
        "variable_groups": clim,
        "hot_encode": he,
        "anomaly": anom,
        "index_cols": ["year", "district", "yield_mt"],
        "year_start": 2016,
        "n_splits": 5,
        "split": split,
        "random_state": random_state,
        "return_oos_predictions": True,
    }
    for clim in climate_vars
    for he, anom in he_anom_combinations
    for split, random_state in enumerate(random_seeds)
]

len(kwarg_list)

210

In [6]:
if __name__ == "__main__":
    output, oos_preds = [], []
    with ProcessPoolExecutor() as executor:
        # with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(climate_model, **kwargs): kwargs for kwargs in kwarg_list
        }
        for future in tqdm(
            as_completed(futures), total=len(futures), desc="Processing models"
        ):
            out, oos = future.result()
            output.append(out)
            oos_preds.append(oos)

    today = date.today().strftime("%Y-%m-%d")

    results = pd.DataFrame(output)
    results_fn = f"climate_model_{n_splits}-splits_{today}.csv"
    print(f"Saving results as: {results_fn}\n\n")
    results.to_csv(here("data", "results", results_fn), index=False)

    oos_predictions = pd.concat(oos_preds)
    oos_fn = f"climate_model_oos_predictions_{n_splits}-splits_{today}.csv"
    print(f"Saving results as: {oos_fn}\n\n")
    oos_predictions.to_csv(here("data", "results", oos_fn), index=False)

Processing models: 100%|██████████| 210/210 [14:13<00:00,  4.06s/it]


Saving results as: climate_model_10-splits_2023-07-05.csv


Saving results as: climate_model_oos_predictions_10-splits_2023-07-05.csv




In [7]:
results

Unnamed: 0,split,random_state,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,...,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2,demean_test_R2,demean_test_r,demean_test_r2
0,2,26225,ndvi,2016,False,False,432,345,87,[1.0],...,0.298571,0.280308,0.542938,0.294781,0.157889,0.416331,0.173331,0.111030,0.348220,0.121257
1,1,116739,pre,2016,False,True,432,345,87,[0.1],...,0.170344,0.102114,0.348568,0.121499,,,,,,
2,3,777572,pre,2016,False,False,432,345,87,[0.1],...,0.398278,0.324432,0.575593,0.331307,-0.281685,-0.151317,0.022897,-0.380065,-0.219479,0.048171
3,0,670487,ndvi,2016,False,True,432,345,87,[0.01],...,0.455168,0.570649,0.756902,0.572900,,,,,,
4,7,146316,pre,2016,False,False,432,345,87,[0.01],...,0.394274,0.368904,0.609873,0.371945,-0.354622,-0.085783,0.007359,-0.536602,-0.147994,0.021902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,9,107473,pre_tmp_ndvi,2016,False,True,432,345,87,"[1000000000.0, 0.1, 0.1]",...,0.553652,0.632169,0.804545,0.647292,,,,,,
206,9,107473,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.01, 0.01, 0.1, 0.01]",...,0.902052,0.848890,0.921905,0.849908,0.059943,0.409038,0.167312,0.565475,0.763606,0.583093
207,8,772246,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.1, 0.01, 0.01, 0.01]",...,0.914142,0.839197,0.919882,0.846182,0.213436,0.510816,0.260933,0.468034,0.686457,0.471223
208,5,256787,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.01, 0.01, 0.001, 0.01]",...,0.914419,0.866718,0.931686,0.868038,0.255763,0.550629,0.303192,0.221413,0.530530,0.281462


In [None]:
oos_predictions

In [None]:
results = pd.DataFrame(output)
results
# mask = results.anomaly == True
# cols = [
#     "demean_cv_R2",
#     "demean_cv_r",
#     "demean_cv_r2",
#     "demean_test_R2",
#     "demean_test_r",
#     "demean_test_r2",
# ]
# results.loc[mask, cols] = np.nan

# today = date.today().strftime("%Y-%m-%d")
# file_name = f"climate_model_{num_seeds}-splits_{today}.csv"
# print(f"Saving results as: {file_name}\n\n")
# results.to_csv(here("data", "results", file_name), index=False)

In [None]:
a = results.copy()
a = a[a.year_start == 2016]
a = a[a.hot_encode]
# a = a[a.variables.isin(["tmp_ndvi"])]

b = get_mean_std_ste(
    df=a,
    groupby_columns=["variables", "year_start", "hot_encode", "anomaly"],
    target_columns=["val_R2", "test_R2", "demean_cv_R2", "demean_cv_r2"],
)
b.sort_values(["summary_var", "mean"], ascending=False)
# b

In [None]:
# def climate_model(
variable_groups = ["ndvi"]
hot_encode = True
anomaly = False
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
n_splits = 5
split = 0
random_state = 42
return_oos_predictions = True
# ):
if variable_groups is None:
    variable_groups_str = "rcf"
else:
    variable_groups_str = "_".join(variable_groups)
#########################################     READ DATA    #########################################
data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
data = data.dropna()

#########################################     FILTER DATA    #########################################
keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]
data = data.loc[:, keep_cols]
data = data[data.year >= year_start]

#########################################    MAKE A COPY    #########################################
crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
    data.set_index(["year", "district"], inplace=True)
    var_cols = data.columns
    data = data[var_cols] - data.groupby(["district"], as_index=True)[
        var_cols
    ].transform("mean")

else:
    pass
data.reset_index(drop=False, inplace=True)

#########################################    HOT ENCODE    #########################################
if hot_encode and not anomaly:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=random_state
)
kfold = KFold(n_splits=n_splits)

#########################################    STANDARDIZE FEATURES    #########################################
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(
    scaler.transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test = pd.DataFrame(
    scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

### LAMBDA INDICIES
i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=0,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

if anomaly:
    pass
else:
    val_predictions = np.maximum(val_predictions, 0)
    train_predictions = np.maximum(train_predictions, 0)
    test_predictions = np.maximum(test_predictions, 0)

#########################################     DE-MEAN TRAIN R2    #########################################
fold_list = []
for i in range(n_splits):
    idx = len(list(kfold.split(y_train))[i][1])
    fold = np.repeat(i + 1, idx).tolist()
    fold_list.append(fold)
fold_list = [item for sublist in fold_list for item in sublist]

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["oos_prediction"] = val_predictions
train_split["val_fold"] = fold_list
train_split = demean_by_group(
    train_split, predicted="oos_prediction", group=["district"]
)

#########################################     DE-MEAN TEST R2    #########################################
test_split = pd.DataFrame(
    {"data_fold": np.repeat("test", len(x_test))}, index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["oos_prediction"] = test_predictions
test_split["val_fold"] = n_splits + 1
test_split = demean_by_group(test_split, predicted="oos_prediction", group=["district"])

#########################################     OUT OF SAMPLE PREDICTIONS    #########################################
oos_preds = pd.concat([train_split, test_split])
oos_preds[["split", "random_state"]] = split, random_state
oos_preds["variables"] = variable_groups_str

#########################################     SCORES    #########################################
val_R2 = r2_score(y_train, val_predictions)
val_r = pearsonr(val_predictions, y_train)[0]
train_R2 = r2_score(y_train, train_predictions)
train_r = pearsonr(train_predictions, y_train)[0]
test_R2 = r2_score(y_test, test_predictions)
test_r = pearsonr(test_predictions, y_test)[0]

if anomaly:
    demean_cv_R2 = np.nan
    demean_cv_r = np.nan
    demean_test_R2 = np.nan
    demean_test_r = np.nan
else:
    demean_cv_R2 = r2_score(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )
    demean_cv_r = pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0]
    demean_test_R2 = r2_score(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )
    demean_test_r = pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0]

d = {
    "split": split,
    "random_state": random_state,
    "variables": "_".join(variable_groups),
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": best_lambdas,
    "mean_of_val_R2": best_scores,
    "val_R2": val_R2,
    "val_r": val_r,
    "val_r2": val_r**2,
    "train_R2": train_R2,
    "train_r": train_r,
    "train_r2": train_r**2,
    "test_R2": test_R2,
    "test_r": test_r,
    "test_r2": test_r**2,
    "demean_cv_R2": demean_cv_R2,
    "demean_cv_r": demean_cv_r,
    "demean_cv_r2": demean_cv_r**2,
    "demean_test_R2": demean_test_R2,
    "demean_test_r": demean_test_r,
    "demean_test_r2": demean_test_r**2,
}
d, oos_preds