In [3]:
%load_ext lab_black
import time
import os

import numpy as np
import pandas as pd
import random
import pyarrow
import concurrent.futures

from pyhere import here
from datetime import date
from itertools import product, combinations
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr, t

from task_modeling_utils import *

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [6]:
groupby_cols = ["variables", "year_start", "hot_encode", "anomaly"]

In [13]:
results = pd.read_csv(
    here("data", "results", "climate_model_n-seeds-100_2023-04-05.csv")
)
results = results[results.year_start == 2016]
results = results[results.hot_encode == True]
results = results[results.anomaly == False]

results_summary = results.groupby(groupby_cols, as_index=False).agg(
    {
        "val_R2": "mean",
        "test_R2": "mean",
        "demean_cv_R2": "mean",
        "demean_test_R2": "mean",
    }
)
results_summary = results_summary.sort_values("val_R2", ascending=False)  # .head(20)
results_summary  # .iloc[0:10, :]

Unnamed: 0,variables,year_start,hot_encode,anomaly,val_R2,test_R2,demean_cv_R2,demean_test_R2
6,tmp_ndvi,2016,True,False,0.815761,0.828261,0.194853,0.462063
4,pre_tmp_ndvi,2016,True,False,0.80622,0.820256,0.152658,0.435071
2,pre_ndvi,2016,True,False,0.7855,0.80138,0.05596,0.377668
5,tmp,2016,True,False,0.783536,0.796392,0.036821,0.354741
3,pre_tmp,2016,True,False,0.780812,0.790334,0.045963,0.340352
0,ndvi,2016,True,False,0.732203,0.745901,-0.187697,0.217054
1,pre,2016,True,False,0.660519,0.687615,-0.512005,0.048225


In [2]:
def get_mean_std_ste(df, groupby_columns, target_columns, confidence_level=0.95):
    """
    Group a pandas DataFrame and calculate mean, standard deviation, and standard error
    for a single column or a list of target columns.

    Args:
        df (pandas.DataFrame): The input pandas DataFrame
        groupby_columns (list): A list of columns to group by
        target_columns (str or list): A column or a list of columns to calculate the statistics for
        confidence_level (float, optional): The desired confidence level for the interval (default: 0.95)

    Returns:
        pandas.DataFrame: A summarized pandas DataFrame
    """

    if isinstance(target_columns, str):
        target_columns = [target_columns]

    summary_dfs = []

    for target_column in target_columns:
        # Group the DataFrame
        grouped_df = df.groupby(groupby_columns)[target_column]

        # Calculate mean, standard deviation, and standard error
        mean = grouped_df.mean()
        std = grouped_df.std()
        sem = grouped_df.sem()
        sample_size = grouped_df.count()
        # Create the summarized DataFrame
        summary_df = pd.DataFrame(
            {
                "summary_var": target_column,
                "mean": mean,
                "std": std,
                "sem": sem,
                "sample_size": sample_size,
            }
        ).reset_index()

        summary_dfs.append(summary_df)

    # Concatenate the summary DataFrames
    combined_summary_df = pd.concat(summary_dfs, axis=0, ignore_index=True)

    return combined_summary_df

In [3]:
variables = ["pre", "tmp", "ndvi"]
clim = list(combinations(variables, 2))
clim = clim + [[variables[i]] for i in range(len(variables))] + [variables]
clim = [list(elem) for elem in clim]
clim.sort(key=len)
clim

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [None]:
import concurrent.futures
from itertools import product
from tqdm import tqdm
import random

# Set the random seed for reproducibility
random.seed(42)

# Generate 100 random seeds
num_seeds = 100
random_seeds = [random.randint(0, 1_000_000) for _ in range(num_seeds)]

output = []


# Generate parameter combinations, including seeds
param_combinations = list(
    product(clim, [2009, 2016], [True, False], [True, False], enumerate(random_seeds))
)

with concurrent.futures.ProcessPoolExecutor() as executor:
    tasks = [
        executor.submit(run_climate_model, params) for params in param_combinations
    ]
    output = [
        task.result()
        for task in tqdm(concurrent.futures.as_completed(tasks), total=len(tasks))
        if task.result() is not None
    ]

  2%|█▉                                                                                                      | 102/5600 [01:00<55:48,  1.64it/s]

In [8]:
results = pd.DataFrame(output)

mask = results.anomaly == True
cols = [
    "demean_cv_R2",
    "demean_cv_r",
    "demean_cv_r2",
    "demean_test_R2",
    "demean_test_r",
    "demean_test_r2",
]
results.loc[mask, cols] = np.nan

today = date.today().strftime("%Y-%m-%d")
file_name = f"climate_model_n-seeds-{num_seeds}_{today}.csv"
print(f"Saving results as: {file_name}\n\n")
results.to_csv(here("data", "results", file_name), index=False)

Saving results as: climate_model_n-seeds-100_2023-04-20.csv




In [6]:
results

Unnamed: 0,split,random_state,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,...,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2,demean_test_R2,demean_test_r,demean_test_r2
0,1,116739,pre,2009,True,False,936,748,188,"[0.01, 0.001]",...,0.663613,0.635893,0.797723,0.636362,-0.060034,0.152047,0.023118,-0.012841,0.200774,0.040310
1,0,670487,pre,2009,True,False,936,748,188,"[0.01, 0.001]",...,0.689724,0.505226,0.712270,0.507328,-0.081250,0.137464,0.018896,0.061015,0.267610,0.071615
2,5,256787,pre,2009,True,False,936,748,188,"[0.01, 0.001]",...,0.665604,0.596661,0.776197,0.602482,-0.116015,0.103074,0.010624,0.015910,0.216306,0.046788
3,6,234053,pre,2009,True,False,936,748,188,"[0.01, 0.001]",...,0.672251,0.599770,0.774802,0.600319,-0.117682,0.059861,0.003583,0.173865,0.417550,0.174348
4,11,776646,pre,2009,True,False,936,748,188,"[0.01, 0.001]",...,0.678480,0.558706,0.747795,0.559197,-0.063651,0.145619,0.021205,0.051286,0.256551,0.065819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4156,93,666563,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.01, 0.001]",...,0.764696,0.748478,0.868798,0.754809,0.203661,0.509076,0.259158,0.471991,0.691328,0.477934
4157,99,219684,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.01, 0.1]",...,0.746217,0.790160,0.889324,0.790898,0.346447,0.610144,0.372276,-0.037187,0.401793,0.161437
4158,95,382554,pre_tmp_ndvi,2016,False,False,432,345,87,"[0.01, 0.001, 0.001]",...,0.772898,0.705396,0.856181,0.733046,0.205520,0.540986,0.292666,0.129545,0.491652,0.241722
4159,58,277370,pre_tmp_ndvi,2016,False,False,432,345,87,"[1e-15, 0.001, 0.01]",...,0.782154,0.611078,0.828779,0.686875,0.335125,0.615151,0.378411,-0.034185,0.437550,0.191450


In [7]:
a = results.copy()
a = a[a.year_start == 2016]
a = a[a.hot_encode]
# a = a[a.variables.isin(["tmp_ndvi"])]

b = get_mean_std_ste(
    df=a,
    groupby_columns=["variables", "year_start", "hot_encode", "anomaly"],
    target_columns=["val_R2", "test_R2"],
)
b.sort_values(["summary_var", "mean"], ascending=False)
# b

Unnamed: 0,variables,year_start,hot_encode,anomaly,summary_var,mean,std,sem,sample_size
6,tmp_ndvi,2016,True,False,val_R2,0.81596,0.018517,0.001997,86
4,pre_tmp_ndvi,2016,True,False,val_R2,0.805587,0.018665,0.001946,92
2,pre_ndvi,2016,True,False,val_R2,0.785507,0.024067,0.002469,95
5,tmp,2016,True,False,val_R2,0.783561,0.013597,0.001367,99
3,pre_tmp,2016,True,False,val_R2,0.780799,0.014388,0.001446,99
0,ndvi,2016,True,False,val_R2,0.731369,0.037205,0.003758,98
1,pre,2016,True,False,val_R2,0.660594,0.023789,0.00248,92
13,tmp_ndvi,2016,True,False,test_R2,0.829218,0.038212,0.004121,86
11,pre_tmp_ndvi,2016,True,False,test_R2,0.819087,0.040128,0.004184,92
9,pre_ndvi,2016,True,False,test_R2,0.800846,0.044501,0.004566,95


In [39]:
a

Unnamed: 0,split,random_state,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,...,test_r2,demean_val_R2,demean_val_r,demean_val_r2,demean_test_R2,demean_test_r,demean_test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2
300,1,116739,pre,2016,True,False,432,345,87,"[0.01, 0.0001]",...,0.688741,-0.446112,-0.214652,0.046075,-0.086925,0.005453,0.000030,,,
301,2,26225,pre,2016,True,False,432,345,87,"[0.01, 0.0001]",...,0.736417,-0.602384,-0.253721,0.064374,0.156477,0.395797,0.156655,,,
302,3,777572,pre,2016,True,False,432,345,87,"[0.1, 1e-05]",...,0.729415,-0.420897,-0.249222,0.062112,-0.146337,-0.091647,0.008399,,,
303,5,256787,pre,2016,True,False,432,345,87,"[0.1, 0.0001]",...,0.820171,-0.469138,-0.398341,0.158675,-0.359098,-0.110815,0.012280,,,
304,6,234053,pre,2016,True,False,432,345,87,"[0.1, 0.001]",...,0.688821,-0.450744,-0.383321,0.146935,0.137071,0.399672,0.159737,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3969,96,170555,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.1, 0.01, 0.1, 0.001]",...,0.823155,0.045783,0.388320,0.150792,0.658775,0.871269,0.759110,,,
3970,97,388162,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.01, 0.01, 0.1, 0.0001]",...,0.768075,0.191585,0.507882,0.257944,0.441141,0.672378,0.452093,,,
3972,99,219684,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.1, 0.01, 0.1, 0.001]",...,0.852085,0.216103,0.502090,0.252094,0.267088,0.549964,0.302461,,,
3973,98,372528,pre_tmp_ndvi,2016,True,False,432,345,87,"[0.0001, 0.01, 0.1, 0.001]",...,0.861860,0.168255,0.469822,0.220733,0.359680,0.607952,0.369606,,,


In [37]:
%%time
### TESTING
# variable_groups = ["pre", "tmp", "ndvi"]
variable_groups = ["pre"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
hot_encode = False
anomaly = False
n_splits = 5
split=0
random_state = 42

#########################################     READ DATA    #########################################
data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
data = data.dropna()

keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
    data.set_index(["year", "district"], inplace=True)
    var_cols = data.columns
    data = data[var_cols] - data.groupby(["district"], as_index=True)[
        var_cols
    ].transform("mean")
    data.reset_index(drop=False, inplace=True)
else:
    pass

#########################################    HOT ENCODE    #########################################
if hot_encode:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=random_state
)
kfold = KFold(n_splits=n_splits)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=3,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN TRAIN R2    #########################################
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
)
train_split = train_split.join(
    crop_yield.copy()[crop_yield.index.isin(x_train.index)]
)
train_split["oos_prediction"] = np.maximum(val_predictions, 0)
train_split = demean_by_group(train_split, predicted="oos_prediction", group=["district"])

#########################################     DE-MEAN TEST R2    #########################################
test_split = pd.DataFrame({"data_fold": np.repeat("test", len(x_test))}, index=x_test.index)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["oos_prediction"] = np.maximum(best_model.predict(x_test), 0)
test_split = demean_by_group(test_split, predicted="oos_prediction", group=["district"])

#########################################     OUT OF SAMPLE PREDICTIONS    #########################################
oos_preds = pd.concat([train_split, test_split])
oos_preds[["split", "random_state"]] = split, random_state

d = {
    "variables": [variable_groups],
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    "val_R2": r2_score(y_train, val_predictions),
    "val_r": pearsonr(val_predictions, y_train)[0],
    "val_r2": pearsonr(val_predictions, y_train)[0] ** 2,
    "train_R2": r2_score(y_train, train_predictions),
    "train_r": pearsonr(train_predictions, y_train)[0],
    "train_r2": pearsonr(train_predictions, y_train)[0] ** 2,
    "test_R2": r2_score(y_test, test_predictions),
    "test_r": pearsonr(test_predictions, y_test)[0],
    "test_r2": pearsonr(test_predictions, y_test)[0] ** 2,
    "demean_cv_R2": r2_score(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    ),
    "demean_cv_r": pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0],
    "demean_cv_r2": pearsonr(
        train_split.demean_log_yield, train_split.demean_oos_prediction
    )[0]
    ** 2,
    "demean_test_R2": r2_score(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    ),
    "demean_test_r": pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0],
    "demean_test_r2": pearsonr(
        test_split.demean_log_yield, test_split.demean_oos_prediction
    )[0]
    ** 2,
}


	Best λ: 0.1
	Val R2: 0.3666

CPU times: total: 4.89 s
Wall time: 409 ms


In [4]:
oos_preds

Unnamed: 0,data_fold,year,district,yield_mt,log_yield,oos_prediction,demean_log_yield,demean_oos_prediction,split,random_state
132,train,2017,Mazabuka,2.823767,0.582491,0.379830,0.190994,0.042195,0,42
231,train,2019,Ndola,2.588668,0.554933,0.519268,-0.007168,0.008410,0,42
31,train,2016,Kafue,2.844356,0.584824,0.283233,0.125010,-0.067914,0,42
84,train,2017,Masaiti,2.045396,0.483644,0.519354,-0.039523,0.029456,0,42
296,train,2020,Kalulushi,3.472055,0.650507,0.541401,0.048078,-0.002087,0,42
...,...,...,...,...,...,...,...,...,...,...
57,test,2016,Kalomo,1.537028,0.404325,0.358483,0.000000,0.000000,0,42
124,test,2017,Mpulungu,2.719739,0.570512,0.590175,0.000000,0.000000,0,42
24,test,2016,Kawambwa,3.011642,0.603322,0.541183,0.004752,-0.019035,0,42
17,test,2016,Chipata,1.915801,0.464758,0.441222,0.000000,0.000000,0,42
