# Benchmark Models

In [1]:
%load_ext lab_black

In [2]:
import time
import os

import numpy as np
import pandas as pd
import random
import pyarrow
import concurrent.futures

from pyhere import here
from datetime import date
from itertools import product, combinations
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr, t
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

from task_modeling_utils import *

In [27]:
# def climate_model(
variable_groups = ["ndvi"]
hot_encode = True
anomaly = False
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
n_splits = 5
split = 0
random_state = 42
return_oos_predictions = True
# ):
if variable_groups is None:
    variable_groups_str = "rcf"
else:
    variable_groups_str = "_".join(variable_groups)
#########################################     READ DATA    #########################################
data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
data = data.dropna()

#########################################     FILTER DATA    #########################################
keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]
data = data.loc[:, keep_cols]
data = data[data.year >= year_start]

#########################################    MAKE A COPY    #########################################
crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
    data.set_index(["year", "district"], inplace=True)
    var_cols = data.columns
    data = data[var_cols] - data.groupby(["district"], as_index=True)[
        var_cols
    ].transform("mean")

else:
    pass
data.reset_index(drop=False, inplace=True)

#########################################    HOT ENCODE    #########################################
if hot_encode and not anomaly:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=random_state
)
kfold = KFold(n_splits=n_splits)

#########################################    STANDARDIZE FEATURES    #########################################
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(
    scaler.transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test = pd.DataFrame(
    scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

### LAMBDA INDICIES
i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=0,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

if anomaly:
    pass
else:
    val_predictions = np.maximum(val_predictions, 0)
    train_predictions = np.maximum(train_predictions, 0)
    test_predictions = np.maximum(test_predictions, 0)

#########################################     DE-MEAN TRAIN R2    #########################################
fold_list = []
for i in range(n_splits):
    idx = len(list(kfold.split(y_train))[i][1])
    fold = np.repeat(i + 1, idx).tolist()
    fold_list.append(fold)
fold_list = [item for sublist in fold_list for item in sublist]

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["oos_prediction"] = val_predictions
train_split["val_fold"] = fold_list
train_split = demean_by_group(
    train_split, predicted="oos_prediction", group=["district"]
)

#########################################     DE-MEAN TEST R2    #########################################
test_split = pd.DataFrame(
    {"data_fold": np.repeat("test", len(x_test))}, index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["oos_prediction"] = test_predictions
test_split["val_fold"] = n_splits + 1
test_split = demean_by_group(test_split, predicted="oos_prediction", group=["district"])

#########################################     OUT OF SAMPLE PREDICTIONS    #########################################
oos_preds = pd.concat([train_split, test_split])
oos_preds[["split", "random_state"]] = split, random_state
oos_preds["variables"] = variable_groups_str

#########################################     SCORES    #########################################
val_R2 = r2_score(y_train, val_predictions)
val_r = pearsonr(val_predictions, y_train)[0]
train_R2 = r2_score(y_train, train_predictions)
train_r = pearsonr(train_predictions, y_train)[0]
test_R2 = r2_score(y_test, test_predictions)
test_r = pearsonr(test_predictions, y_test)[0]

if anomaly:
    demean_cv_R2 = np.nan
    demean_cv_r = np.nan
    demean_test_R2 = np.nan
    demean_test_r = np.nan
else:
    demean_cv_R2 = r2_score(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )
    demean_cv_r = pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0]
    demean_test_R2 = r2_score(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )
    demean_test_r = pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0]

d = {
    "split": split,
    "random_state": random_state,
    "variables": "_".join(variable_groups),
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": best_lambdas,
    "mean_of_val_R2": best_scores,
    "val_R2": val_R2,
    "val_r": val_r,
    "val_r2": val_r**2,
    "train_R2": train_R2,
    "train_r": train_r,
    "train_r2": train_r**2,
    "test_R2": test_R2,
    "test_r": test_r,
    "test_r2": test_r**2,
    "demean_cv_R2": demean_cv_R2,
    "demean_cv_r": demean_cv_r,
    "demean_cv_r2": demean_cv_r**2,
    "demean_test_R2": demean_test_R2,
    "demean_test_r": demean_test_r,
    "demean_test_r2": demean_test_r**2,
}
d, oos_preds

({'split': 0,
  'random_state': 42,
  'variables': 'ndvi',
  'year_start': 2016,
  'hot_encode': True,
  'anomaly': False,
  'total_n': 432,
  'train_n': 345,
  'test_n': 87,
  'best_reg_param': [0.001, 0.01],
  'mean_of_val_R2': [0.6387613307498409, 0.7841218569048505],
  'val_R2': 0.7947133343786241,
  'val_r': 0.8924915011152904,
  'val_r2': 0.7965410795630244,
  'train_R2': 0.8938451890193958,
  'train_r': 0.9455370398349828,
  'train_r2': 0.8940402936999019,
  'test_R2': 0.7886675666919576,
  'test_r': 0.8897153022876386,
  'test_r2': 0.7915933191247841,
  'demean_cv_R2': 0.028057447014490755,
  'demean_cv_r': 0.37090765265783054,
  'demean_cv_r2': 0.13757248680014186,
  'demean_test_R2': 0.3609679307487976,
  'demean_test_r': 0.6072413997318694,
  'demean_test_r2': 0.36874211754832004},
     data_fold  year   district  yield_mt  log_yield  oos_prediction  val_fold  \
 132     train  2017   Mazabuka  2.823767   0.582491        0.429540         1   
 231     train  2019      Ndola 

In [29]:
crop_yield

Unnamed: 0,year,district,yield_mt,log_yield
0,2016,Chibombo,2.441532,0.536752
1,2016,Kabwe,2.501448,0.544248
2,2016,Kapiri Mposhi,2.756906,0.574830
3,2016,Mkushi,3.721249,0.674057
4,2016,Mumbwa,2.162931,0.500090
...,...,...,...,...
427,2021,Lukulu,1.008066,0.302778
428,2021,Mongu,0.813282,0.258465
429,2021,Senanga,0.561860,0.193642
430,2021,Sesheke,0.538047,0.186970


In [30]:
x_test.index

Int64Index([424,  75, 180,  30, 392, 275, 414, 154, 195, 148, 153,  70, 199,
            334,  39, 173, 382,  72,   9, 280, 286, 225, 426,  55, 325, 335,
            157, 391, 250, 318, 126, 399, 304,  93, 427,  77, 374, 140, 322,
            428, 352, 297,  78,  73,  33, 116, 299,  76, 246, 379,  15, 238,
            356,   0,  19, 277, 285,  56, 394,  82, 355, 284,  79,  90, 367,
             25, 176, 184, 152,  42, 333, 420, 104, 168, 423, 342,  22,  46,
            172, 113,  94, 381,  57, 124,  24,  17,  66],
           dtype='int64')

In [31]:
test_split = pd.DataFrame(
    {"data_fold": np.repeat("test", len(x_test))}, index=x_test.index
)
test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])

Unnamed: 0,data_fold,year,district,yield_mt,log_yield
424,test,2021,Sinazongwe,0.728113,0.237572
75,test,2017,Mkushi,3.267198,0.630143
180,test,2018,Isoka,3.400556,0.643508
30,test,2016,Chongwe,2.646931,0.561928
392,test,2021,Luangwa,0.551129,0.190648
...,...,...,...,...,...
57,test,2016,Kalomo,1.537028,0.404325
124,test,2017,Mpulungu,2.719739,0.570512
24,test,2016,Kawambwa,3.011642,0.603322
17,test,2016,Chipata,1.915801,0.464758


In [4]:
variables = ["pre", "tmp", "ndvi"]
climate_vars = list(combinations(variables, 2))
climate_vars = (
    climate_vars + [[variables[i]] for i in range(len(variables))] + [variables]
)
climate_vars = [list(elem) for elem in climate_vars]
climate_vars.sort(key=len)
climate_vars

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [5]:
# Generate n random seeds
n_splits = 10
random.seed(42)
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

kwarg_list = [
    {
        "variable_groups": clim,
        "hot_encode": he,
        "anomaly": anom,
        "index_cols": ["year", "district", "yield_mt"],
        "year_start": yr,
        "n_splits": 5,
        "split": split,
        "random_state": random_state,
        "return_oos_predictions": True,
    }
    for clim in climate_vars
    for he in [True, False]
    for anom in [True, False]
    for yr in [2009, 2016]
    for split, random_state in enumerate(random_seeds)
]
len(kwarg_list)
# with ThreadPoolExecutor() as executor:

560

In [6]:
if __name__ == "__main__":
    output, oos_preds = [], []
    with ProcessPoolExecutor() as executor:
    # with ThreadPoolExecutor() as executor:
        futures = {executor.submit(climate_model, **kwargs): kwargs for kwargs in kwarg_list}
        for future in tqdm(as_completed(futures), total=len(futures), desc='Processing models'):
            out, oos = future.result()
            output.append(out)
            oos_preds.append(oos)

    today = date.today().strftime("%Y-%m-%d")
    
    results = pd.DataFrame(output)
    results_fn = f"climate_model_{n_splits}-splits_{today}.csv"
    print(f"Saving results as: {results_fn}\n\n")
    results.to_csv(here("data","results", results_fn), index=False)

    oos_predictions = pd.concat(oos_preds)
    oos_fn = f"climate_model_oos_predictions_{n_splits}-splits_{today}.csv"
    print(f"Saving results as: {oos_fn}\n\n")
    oos_predictions.to_csv(here("data","results", oos_fn), index=False)


Processing models: 100%|██████████████████████████████████████████████████████████████████████████████████████| 560/560 [04:50<00:00,  1.93it/s]


NameError: name 'num_seeds' is not defined

In [7]:
results

Unnamed: 0,split,random_state,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,...,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2,demean_test_R2,demean_test_r,demean_test_r2
0,5,256787,pre,2016,True,True,432,345,87,"[1.0, 1000000000.0]",...,0.134416,0.032145,0.248466,0.061735,,,,,,
1,1,116739,pre,2016,True,True,432,345,87,"[0.1, 1000000000.0]",...,0.172531,0.122801,0.383200,0.146842,,,,,,
2,0,670487,pre,2016,True,True,432,345,87,"[0.1, 1000000000.0]",...,0.165308,0.131783,0.390744,0.152681,,,,,,
3,2,26225,pre,2016,True,True,432,345,87,"[0.1, 1000000000.0]",...,0.155073,0.187336,0.452072,0.204369,,,,,,
4,4,288389,pre,2016,True,True,432,345,87,"[0.01, 1000000000.0]",...,0.193340,0.072534,0.301319,0.090793,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,5,256787,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.01, 0.01]",...,0.770250,0.698702,0.846472,0.716515,0.325724,0.595475,0.354590,-0.616774,0.140325,0.019691
556,6,234053,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.01, 0.01]",...,0.774523,0.699092,0.836617,0.699928,0.306641,0.591946,0.350400,0.288094,0.598612,0.358336
557,7,146316,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.001, 0.01]",...,0.770176,0.722325,0.851257,0.724639,0.378370,0.639916,0.409493,-0.043811,0.458077,0.209835
558,9,107473,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.001, 0.01, 0.1]",...,0.752425,0.724221,0.852424,0.726627,0.222818,0.518004,0.268328,0.567272,0.795062,0.632124


In [10]:
oos_predictions

Unnamed: 0,data_fold,year,district,yield_mt,log_yield,oos_prediction,val_fold,demean_log_yield,demean_oos_prediction,split,random_state,variables
162,train,2018,Katete,0.943240,0.288526,-0.005687,1,-0.102584,-0.003152,5,256787,pre
233,train,2019,Chipata,1.786837,0.445112,-0.003269,1,-0.053057,-0.001964,5,256787,pre
324,train,2020,Isoka,2.936742,0.595137,-0.015694,1,-0.049253,-0.014394,5,256787,pre
134,train,2017,Namwala,1.045824,0.310868,-0.004583,1,-0.003890,-0.003599,5,256787,pre
70,train,2016,Sesheke,0.519042,0.181570,0.004959,1,-0.000794,0.007458,5,256787,pre
...,...,...,...,...,...,...,...,...,...,...,...,...
377,test,2021,Chipata,2.498687,0.543905,0.533751,6,0.039638,0.055122,8,772246,pre_tmp_ndvi
229,test,2019,Mpongwe,2.689074,0.566917,0.477555,6,-0.006762,-0.057895,8,772246,pre_tmp_ndvi
353,test,2020,Kalabo,0.437436,0.157589,0.314182,6,-0.028577,0.014817,8,772246,pre_tmp_ndvi
148,test,2018,Mumbwa,1.580626,0.411725,0.432077,6,-0.013786,-0.026985,8,772246,pre_tmp_ndvi


In [6]:
results = pd.DataFrame(output)
results
# mask = results.anomaly == True
# cols = [
#     "demean_cv_R2",
#     "demean_cv_r",
#     "demean_cv_r2",
#     "demean_test_R2",
#     "demean_test_r",
#     "demean_test_r2",
# ]
# results.loc[mask, cols] = np.nan

# today = date.today().strftime("%Y-%m-%d")
# file_name = f"climate_model_{num_seeds}-splits_{today}.csv"
# print(f"Saving results as: {file_name}\n\n")
# results.to_csv(here("data", "results", file_name), index=False)

In [24]:
a = results.copy()
a = a[a.year_start == 2016]
a = a[a.hot_encode]
# a = a[a.variables.isin(["tmp_ndvi"])]

b = get_mean_std_ste(
    df=a,
    groupby_columns=["variables", "year_start", "hot_encode", "anomaly"],
    target_columns=["val_R2", "test_R2", "demean_cv_R2", "demean_cv_r2"],
)
b.sort_values(["summary_var", "mean"], ascending=False)
# b

Unnamed: 0,variables,year_start,hot_encode,anomaly,summary_var,mean,std,sem,sample_size
6,tmp_ndvi,2016,True,False,val_R2,0.812934,0.015992,0.005057,10
4,pre_tmp_ndvi,2016,True,False,val_R2,0.803845,0.015862,0.005016,10
2,pre_ndvi,2016,True,False,val_R2,0.784827,0.027986,0.00885,10
3,pre_tmp,2016,True,False,val_R2,0.780052,0.012905,0.004081,10
5,tmp,2016,True,False,val_R2,0.779806,0.010611,0.003355,10
0,ndvi,2016,True,False,val_R2,0.742841,0.034199,0.010815,10
1,pre,2016,True,False,val_R2,0.658987,0.023647,0.007478,10
13,tmp_ndvi,2016,True,False,test_R2,0.83847,0.022433,0.007094,10
11,pre_tmp_ndvi,2016,True,False,test_R2,0.827356,0.022587,0.007143,10
12,tmp,2016,True,False,test_R2,0.808324,0.017324,0.005478,10


In [20]:
import itertools

# Generate n random seeds
n_splits = 10
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

directory = here("data", "random_features", "summary")
files = [
    f for f in os.listdir(directory) if f not in (".gitkeep", ".ipynb_checkpoints")
]
# files = [f for f in files if not (f.startswith("landsat-8") and "lm-False" in f)]
# files = [f for f in files if not (f.startswith("sentinel") and "lm-True" in f)]
# files = [f for f in files if "cm-True" in f]
# files = [f for f in files if "wa-False" in f]

combinations = list(itertools.combinations(files, 2))
len(combinations) * 2 + 88

1980

In [13]:
groupby_cols = ["variables", "year_start", "hot_encode", "anomaly"]

results = pd.read_csv(here("data", "results", "climate_model_10-splits_2023-06-12.csv"))
results = results[results.year_start == 2016]
# results = results[results.variables == "ndvi"]
# results = results[results.hot_encode == True]
# results = results[results.anomaly == False]

results_summary = results.groupby(groupby_cols, as_index=False).agg(
    {
        "val_R2": "mean",
        "val_r2": "mean",
        "test_R2": "mean",
        "test_r2": "mean",
        "demean_cv_R2": "mean",
        "demean_cv_r2": "mean",
        "demean_test_R2": "mean",
        "demean_test_r2": "mean",
    }
)
results_summary = results_summary.sort_values("val_R2", ascending=False)  # .head(20)
results_summary  # .iloc[0:10, :]

Unnamed: 0,variables,year_start,hot_encode,anomaly,val_R2,val_r2,test_R2,test_r2,demean_cv_R2,demean_cv_r2,demean_test_R2,demean_test_r2
26,tmp_ndvi,2016,True,False,0.816014,0.817164,0.834273,0.840106,0.188323,0.252929,0.384849,0.437002
18,pre_tmp_ndvi,2016,True,False,0.814329,0.815517,0.831528,0.837293,0.181001,0.249089,0.380537,0.438187
10,pre_ndvi,2016,True,False,0.791151,0.793534,0.814642,0.823237,0.087058,0.182711,0.357609,0.407231
2,ndvi,2016,True,False,0.784346,0.785885,0.803543,0.81359,0.057853,0.150001,0.30315,0.380023
14,pre_tmp,2016,True,False,0.783024,0.783442,0.802267,0.807989,0.043079,0.14483,0.225022,0.341623
22,tmp,2016,True,False,0.781171,0.782807,0.806596,0.8133,0.030858,0.140948,0.252035,0.359901
16,pre_tmp_ndvi,2016,False,False,0.708435,0.70878,0.70825,0.71489,0.310563,0.3476,0.186387,0.326434
12,pre_tmp,2016,False,False,0.686331,0.686476,0.683913,0.689825,0.274934,0.308668,0.137382,0.2957
24,tmp_ndvi,2016,False,False,0.666565,0.667003,0.673338,0.679725,0.343107,0.356657,0.242638,0.335919
6,pre,2016,True,False,0.664579,0.672051,0.699431,0.717199,-0.488164,0.062078,-0.00085,0.149651


In [37]:
%%time
### TESTING
# variable_groups = ["pre", "tmp", "ndvi"]
variable_groups = ["pre"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
hot_encode = False
anomaly = False
n_splits = 5
split=0
random_state = 42

#########################################     READ DATA    #########################################
data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
data = data.dropna()

keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
    data.set_index(["year", "district"], inplace=True)
    var_cols = data.columns
    data = data[var_cols] - data.groupby(["district"], as_index=True)[
        var_cols
    ].transform("mean")
    data.reset_index(drop=False, inplace=True)
else:
    pass

#########################################    HOT ENCODE    #########################################
if hot_encode:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=random_state
)
kfold = KFold(n_splits=n_splits)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=3,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN TRAIN R2    #########################################
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
)
train_split = train_split.join(
    crop_yield.copy()[crop_yield.index.isin(x_train.index)]
)
train_split["oos_prediction"] = np.maximum(val_predictions, 0)
train_split = demean_by_group(train_split, predicted="oos_prediction", group=["district"])

#########################################     DE-MEAN TEST R2    #########################################
test_split = pd.DataFrame({"data_fold": np.repeat("test", len(x_test))}, index=x_test.index)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["oos_prediction"] = np.maximum(best_model.predict(x_test), 0)
test_split = demean_by_group(test_split, predicted="oos_prediction", group=["district"])

#########################################     OUT OF SAMPLE PREDICTIONS    #########################################
oos_preds = pd.concat([train_split, test_split])
oos_preds[["split", "random_state"]] = split, random_state

d = {
    "variables": [variable_groups],
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    "val_R2": r2_score(y_train, val_predictions),
    "val_r": pearsonr(val_predictions, y_train)[0],
    "val_r2": pearsonr(val_predictions, y_train)[0] ** 2,
    "train_R2": r2_score(y_train, train_predictions),
    "train_r": pearsonr(train_predictions, y_train)[0],
    "train_r2": pearsonr(train_predictions, y_train)[0] ** 2,
    "test_R2": r2_score(y_test, test_predictions),
    "test_r": pearsonr(test_predictions, y_test)[0],
    "test_r2": pearsonr(test_predictions, y_test)[0] ** 2,
    "demean_cv_R2": r2_score(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    ),
    "demean_cv_r": pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0],
    "demean_cv_r2": pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0]
    ** 2,
    "demean_test_R2": r2_score(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    ),
    "demean_test_r": pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0],
    "demean_test_r2": pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0]
    ** 2,
}


	Best λ: 0.1
	Val R2: 0.3666

CPU times: total: 4.89 s
Wall time: 409 ms


In [4]:
oos_preds

Unnamed: 0,data_fold,year,district,yield_mt,log_yield,oos_prediction,demean_log_yield,demean_oos_prediction,split,random_state
132,train,2017,Mazabuka,2.823767,0.582491,0.379830,0.190994,0.042195,0,42
231,train,2019,Ndola,2.588668,0.554933,0.519268,-0.007168,0.008410,0,42
31,train,2016,Kafue,2.844356,0.584824,0.283233,0.125010,-0.067914,0,42
84,train,2017,Masaiti,2.045396,0.483644,0.519354,-0.039523,0.029456,0,42
296,train,2020,Kalulushi,3.472055,0.650507,0.541401,0.048078,-0.002087,0,42
...,...,...,...,...,...,...,...,...,...,...
57,test,2016,Kalomo,1.537028,0.404325,0.358483,0.000000,0.000000,0,42
124,test,2017,Mpulungu,2.719739,0.570512,0.590175,0.000000,0.000000,0,42
24,test,2016,Kawambwa,3.011642,0.603322,0.541183,0.004752,-0.019035,0,42
17,test,2016,Chipata,1.915801,0.464758,0.441222,0.000000,0.000000,0,42
