In [1]:
%load_ext lab_black
import time
import os

from pyhere import here
from datetime import date

import numpy as np
import pandas as pd

import pyarrow

import concurrent.futures
from itertools import product, combinations
from tqdm import tqdm
import random

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr, t

from task_modeling_utils import *

In [2]:
def summarize_dataframe(df, groupby_columns, target_column, confidence_level=0.95):
    """
    Summarize one column of a DataFrame within groups.

    For every group the mean, sample standard deviation, standard error of the
    mean and a Student-t confidence interval around the mean are reported.

    :param df: The input pandas DataFrame
    :param groupby_columns: Column name(s) that define the groups
    :param target_column: Name of the column being summarized
    :param confidence_level: Coverage of the confidence interval (default: 0.95)
    :return: DataFrame with one row per group: the grouping columns followed by
        ``summary_var``, ``mean``, ``std``, ``sem``, ``ci_lower`` and ``ci_upper``
    """
    per_group = df.groupby(groupby_columns)[target_column]

    group_mean = per_group.mean()
    group_std = per_group.std()
    group_sem = per_group.sem()
    # n - 1 degrees of freedom, where n is the group's count of observations
    dof = per_group.count() - 1

    # Interval centred on the group mean and scaled by its standard error
    lower, upper = t.interval(
        confidence=confidence_level, df=dof, loc=group_mean, scale=group_sem
    )

    columns = {
        "summary_var": target_column,
        "mean": group_mean,
        "std": group_std,
        "sem": group_sem,
        "ci_lower": lower,
        "ci_upper": upper,
    }

    # The group keys live in the index until reset_index turns them into columns
    return pd.DataFrame(columns).reset_index()

In [3]:
variables = ["pre", "tmp", "ndvi"]

# Every non-empty subset of the variables as a list, smallest subsets first;
# within one size the subsets keep the order produced by `combinations`.
clim = [
    list(subset)
    for size in range(1, len(variables) + 1)
    for subset in combinations(variables, size)
]
clim

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [4]:
import concurrent.futures
from itertools import product
from tqdm import tqdm
import random

# Seed Python's RNG so the list of random states drawn below is reproducible
random.seed(42)

# One random state per repetition
num_seeds = 100
random_seeds = [random.randint(0, 1_000_000) for _ in range(num_seeds)]

output = []

# Full grid: every variable subset x two start years x two boolean switches
# x every random state. Each tuple is passed to run_climate_model unchanged.
param_combinations = list(
    product(clim, [2009, 2016], [True, False], [True, False], random_seeds)
)

with concurrent.futures.ProcessPoolExecutor() as executor:
    tasks = [executor.submit(run_climate_model, combo) for combo in param_combinations]

    # Gather results as the workers finish (completion order, not submission
    # order) and drop the runs that returned None.
    completed = []
    for task in tqdm(concurrent.futures.as_completed(tasks), total=len(tasks)):
        outcome = task.result()
        if outcome is not None:
            completed.append(outcome)
    output = completed

results = pd.DataFrame(output)

# Blank out the de-meaned scores on the rows where `anomaly` is True
mask = results["anomaly"].eq(True)
cols = [
    "demean_cv_R2",
    "demean_cv_r",
    "demean_cv_r2",
    "demean_test_R2",
    "demean_test_r",
    "demean_test_r2",
]
results.loc[mask, cols] = np.nan

# Date-stamp the output file (YYYY-MM-DD) so repeated runs do not overwrite each other
today = date.today().isoformat()
file_name = f"climate_model_n-seeds-{num_seeds}_{today}.csv"
print(f"Saving results as: {file_name}\n\n")
results.to_csv(here("data", "results", file_name), index=False)

100%|███████████████████████████████████████████████████████████████████████████████| 5600/5600 [49:00<00:00,  1.90it/s]


Saving results as: climate_model_n-seeds-100_2023-04-05.csv




In [5]:
# Display the results table assembled from the completed model runs
results

Unnamed: 0,variables,random_state,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,mean_of_val_R2,...,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2,demean_test_R2,demean_test_r,demean_test_r2
0,pre,670487,2009,True,False,936,748,188,"[0.01, 0.001]","[0.32605299437170937, 0.5915969474212787]",...,0.689724,0.505226,0.712270,0.507328,-0.081250,0.137464,0.018896,0.061015,0.267610,0.071615
1,pre,116739,2009,True,False,936,748,188,"[0.01, 0.001]","[0.3230651368660741, 0.568903471601985]",...,0.663613,0.635893,0.797723,0.636362,-0.060034,0.152047,0.023118,-0.012841,0.200774,0.040310
2,pre,256787,2009,True,False,936,748,188,"[0.01, 0.001]","[0.316214295918939, 0.5519145308962885]",...,0.665604,0.596661,0.776197,0.602482,-0.116015,0.103074,0.010624,0.015910,0.216306,0.046788
3,pre,709570,2009,True,False,936,748,188,"[0.01, 0.001]","[0.34071000856908346, 0.5596416069557559]",...,0.657814,0.656065,0.812039,0.659408,-0.049615,0.162859,0.026523,-0.073261,0.179481,0.032213
4,pre,146316,2009,True,False,936,748,188,"[0.01, 0.001]","[0.2718998176875954, 0.5290086272652292]",...,0.663041,0.619920,0.788094,0.621092,-0.127079,0.061464,0.003778,0.166849,0.408471,0.166849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4195,pre_tmp_ndvi,382554,2016,False,False,432,345,87,"[0.01, 0.001, 0.001]","[0.5584364012987756, 0.6960813856230079, 0.709...",...,0.772898,0.705396,0.856181,0.733046,0.205520,0.540986,0.292666,0.129545,0.491652,0.241722
4196,pre_tmp_ndvi,170555,2016,False,False,432,345,87,"[0.01, 0.01, 0.1]","[0.5622836343049482, 0.6904651626797575, 0.694...",...,0.756820,0.733523,0.858231,0.736560,0.299404,0.572170,0.327378,0.672193,0.848483,0.719923
4197,pre_tmp_ndvi,388162,2016,False,False,432,345,87,"[0.01, 0.01, 0.1]","[0.5600658729566149, 0.705632659623092, 0.7070...",...,0.767253,0.683676,0.827225,0.684301,0.316808,0.592926,0.351561,0.270710,0.597104,0.356533
4198,pre_tmp_ndvi,372528,2016,False,False,432,345,87,"[1e-08, 0.01, 0.1]","[0.584814283369868, 0.6968039956240404, 0.7031...",...,0.754589,0.734711,0.860786,0.740952,0.304971,0.579791,0.336157,0.211838,0.546692,0.298872


In [15]:
# results.groupby(
#     ["variables", "year_start", "hot_encode", "anomaly"], as_index=False
# ).mean()

In [26]:
# Work on a copy of the results, keeping the rows with year_start == 2016
# and a true `hot_encode` flag
a = (
    results.copy()
    .loc[lambda runs: runs["year_start"] == 2016]
    .loc[lambda runs: runs["hot_encode"]]
)

# Mean, spread and confidence interval of the de-meaned CV R2 per model configuration
b = summarize_dataframe(
    a,
    ["variables", "year_start", "hot_encode", "anomaly"],
    "demean_cv_R2",
)
b

Unnamed: 0,variables,year_start,hot_encode,anomaly,summary_var,mean,std,sem,ci_lower,ci_upper
0,ndvi,2016,True,False,demean_cv_R2,-0.187697,0.161662,0.016166,-0.219774,-0.15562
1,pre,2016,True,False,demean_cv_R2,-0.512005,0.077951,0.007795,-0.527472,-0.496537
2,pre_ndvi,2016,True,False,demean_cv_R2,0.05596,0.107621,0.010762,0.034606,0.077314
3,pre_tmp,2016,True,False,demean_cv_R2,0.045963,0.076598,0.00766,0.030764,0.061162
4,pre_tmp_ndvi,2016,True,False,demean_cv_R2,0.152658,0.088301,0.00883,0.135137,0.170179
5,tmp,2016,True,False,demean_cv_R2,0.036821,0.05749,0.005749,0.025414,0.048229
6,tmp_ndvi,2016,True,False,demean_cv_R2,0.194853,0.080505,0.00805,0.178879,0.210827


In [4]:
%%time
### TESTING
variable_groups = ["pre", "tmp", "ndvi"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
hot_encode = True
anomaly = False
n_splits = 5
seed = 42

#########################################     READ DATA    #########################################
# Load the climate summary and discard every row that has a missing value
data = pd.read_csv(here("data", "climate", "climate_summary.csv")).dropna()

# Identifier columns first, then every column whose name matches a variable
# group (str.contains treats the group name as a regex pattern)
keep_cols = list(index_cols)
for var in variable_groups:
    tmp = data.columns[data.columns.str.contains(var)].tolist()
    keep_cols.extend(tmp)

# Keep only the selected columns, and only the years from year_start onward
data = data.loc[data["year"] >= year_start, keep_cols]

# Side table with the identifiers and log10(yield + 1); its index is reset to 0..n-1
crop_yield = data[index_cols].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
# Scale every non-identifier column with StandardScaler. Rebuilding the frame
# from the bare array discards the feature names, so after reset_index the
# feature columns are labelled by position ("0", "1", ...).
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# further down -- confirm this is intended.
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.to_numpy())
data = (
    pd.DataFrame(data_scaled, index=data.index).reset_index().rename(columns=str)
)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    # Log-transform the yield, then subtract each district's mean from the
    # yield and from every feature column
    data["yield_mt"] = np.log10(data["yield_mt"].to_numpy() + 1)
    data = data.set_index(["year", "district"])
    var_cols = data.columns
    district_means = data.groupby(["district"], as_index=True)[var_cols].transform(
        "mean"
    )
    data = (data[var_cols] - district_means).reset_index(drop=False)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # `district` becomes a set of dummy feature columns, so it is no longer an
    # identifier column and must not be dropped from the features later
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)

#########################################     K-FOLD SPLIT    #########################################
# Features: everything except the identifier/target columns still listed in index_cols
x_all = data.drop(index_cols, axis=1)

# Target: log10(yield + 1).
# In the anomaly branch `yield_mt` has already been replaced by log10(yield + 1)
# and de-meaned by district, so it is used as-is there. Applying log10(. + 1) a
# second time (as this cell previously did) would distort the anomaly target.
if anomaly:
    y_all = data.yield_mt.to_numpy()
else:
    y_all = np.log10(data.yield_mt.to_numpy() + 1)

# Hold out 20% of the rows as the test set; the split is reproducible via `seed`
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=seed
)

# Fold generator used for the cross-validated predictions on the training rows
kfold = KFold(n_splits=n_splits)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
# Candidate penalties: 1e-8, 1e-7, ..., 1e8
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

# Column-block boundaries handed to the tuner: one boundary every 12 columns per
# variable group, with the last block running to the final column of x_train.
# NOTE(review): 12 columns per group is hard-coded -- presumably monthly values; confirm.
i = 12 * len(variable_groups)
group_edges = list(range(12, i + 1, 12))
start = sorted([0, *group_edges])
end = sorted([x_train.shape[1], *group_edges])

# Without hot encoding the trailing block is dropped
# (presumably because no district dummy columns follow the feature blocks)
if not hot_encode:
    start, end = start[:-1], end[:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas["alpha"],
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=0,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
    seed=seed,
)

### PREDICT WITH BEST HYPERPARAMETER(S)
# Out-of-fold predictions on the training rows, then in-sample and held-out predictions
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN TRAIN R2    #########################################
# One row per training observation, labelled "train" and joined (on the row
# index) to the matching identifier / log-yield rows of crop_yield
train_split = pd.DataFrame(
    {"split": np.repeat("train", len(x_train))}, index=x_train.index
)
train_rows = crop_yield.index.isin(x_train.index)
train_split = train_split.join(crop_yield.loc[train_rows])

# Cross-validated predictions, floored at zero, then de-meaned by district
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split = demean_by_group(train_split, predicted="cv_prediction", group=["district"])
# Placeholder column so the train and test frames share the same columns
train_split["demean_test_prediction"] = np.nan

#########################################     DE-MEAN TEST R2    #########################################
# Same layout for the held-out rows
test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_rows = crop_yield.index.isin(x_test.index)
test_split = test_split.join(crop_yield.loc[test_rows])

# Held-out predictions, floored at zero; the CV columns are placeholders here
test_split["test_prediction"] = np.maximum(best_model.predict(x_test), 0)
test_split["cv_prediction"] = np.nan
test_split["demean_cv_prediction"] = np.nan
test_split = demean_by_group(test_split, predicted="test_prediction", group=["district"])

# Pearson correlations, computed once per pairing and reused for both r and r^2
val_r = pearsonr(val_predictions, y_train)[0]
train_r = pearsonr(train_predictions, y_train)[0]
test_r = pearsonr(test_predictions, y_test)[0]
demean_cv_r = pearsonr(
    train_split.demean_log_yield, train_split.demean_cv_prediction
)[0]
demean_test_r = pearsonr(
    test_split.demean_log_yield, test_split.demean_test_prediction
)[0]

# One record describing this run: configuration, sample sizes, tuned penalties,
# and the fit metrics (R2 = coefficient of determination, r = Pearson
# correlation, r2 = its square) on the validation, train, test and de-meaned data
d = {
    "variables": [variable_groups],
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    "val_R2": r2_score(y_train, val_predictions),
    "val_r": val_r,
    "val_r2": val_r ** 2,
    "train_R2": r2_score(y_train, train_predictions),
    "train_r": train_r,
    "train_r2": train_r ** 2,
    "test_R2": r2_score(y_test, test_predictions),
    "test_r": test_r,
    "test_r2": test_r ** 2,
    "demean_cv_R2": r2_score(
        train_split.demean_log_yield, train_split.demean_cv_prediction
    ),
    "demean_cv_r": demean_cv_r,
    "demean_cv_r2": demean_cv_r ** 2,
    "demean_test_R2": r2_score(
        test_split.demean_log_yield, test_split.demean_test_prediction
    ),
    "demean_test_r": demean_test_r,
    "demean_test_r2": demean_test_r ** 2,
}

CPU times: user 4min 39s, sys: 6.73 s, total: 4min 46s
Wall time: 18.2 s


In [5]:
# Stack the train rows and the test rows into one table for inspection
pd.concat([train_split, test_split])

Unnamed: 0,split,year,district,yield_mt,log_yield,cv_prediction,demean_log_yield,demean_cv_prediction,demean_test_prediction,test_prediction
132,train,2017,Mazabuka,2.823767,0.582491,0.427763,0.190994,0.043694,,
231,train,2019,Ndola,2.588668,0.554933,0.503889,-0.007168,-0.058730,,
31,train,2016,Kafue,2.844356,0.584824,0.508687,0.125010,0.028804,,
84,train,2017,Masaiti,2.045396,0.483644,0.559393,-0.039523,0.037150,,
296,train,2020,Kalulushi,3.472055,0.650507,0.609615,0.048078,0.012385,,
...,...,...,...,...,...,...,...,...,...,...
57,test,2016,Kalomo,1.537028,0.404325,,0.000000,,0.000000,0.365072
124,test,2017,Mpulungu,2.719739,0.570512,,0.000000,,0.000000,0.553158
24,test,2016,Kawambwa,3.011642,0.603322,,0.004752,,0.001051,0.607808
17,test,2016,Chipata,1.915801,0.464758,,0.000000,,0.000000,0.485300
