In [1]:
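# One-time setup on a fresh environment: uncomment the next line (use %pip, not !pip, so the install targets this kernel).
# %pip install -q pyhere p_tqdm glum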

In [1]:
# Load the lab_black extension (presumably shipped by the nb_black package and
# used to auto-format code cells with Black when they run — verify).
%load_ext lab_black

In [2]:
import time
import os

from pyhere import here
from datetime import date

import numpy as np
import pandas as pd

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

from task_modeling_utils import *

In [3]:
# Predictor families that can be fed to the climate model.
variables = ["pre", "tmp", "ndvi"]

# Every non-empty subset of `variables`, smallest subsets first; within one
# size the order is the one itertools.combinations yields (input order).
# Enumerating by subset size works for any number of variables, not just three,
# and each entry is a fresh list so none of them aliases `variables`.
clim = [
    list(combo)
    for size in range(1, len(variables) + 1)
    for combo in itertools.combinations(variables, size)
]
clim

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [4]:
# paramlist = list(itertools.product([True, False], [True, False]))
# paramlist = [list(elem) for elem in paramlist]
# paramlist = list(itertools.product(clim, paramlist))

# paramlist = (i for i in paramlist)
# paramlist

In [11]:
%%time
seed=0
output = []
for var in clim:
    print(f"Variable: {var}")
    for yr in [2009, 2016]:
        # print(f"\tYear start: {yr}")
        for he in [True, False]:
            # print(f"\tHE: {he}")
            for anom in [True, False]:
                # print(f"\tAnomaly: {anom}")
                if he and anom:
                    # print(
                    #     "\t\tPass, one-hot encoding does not make sense with an anomaly model"
                    # )
                    pass
                else:
                    out = climate_model(
                        variable_groups=var,
                        year_start=yr,
                        hot_encode=he,
                        anomaly=anom,
                        index_cols=["year", "district", "yield_mt"],
                        seed=seed
                    )
                    output.append(out)

results = pd.DataFrame(output)

mask = results.anomaly == True
cols = ["demean_cv_R2", "demean_cv_r", "demean_cv_r2"]
results.loc[mask, cols] = np.nan

today = date.today().strftime("%Y-%m-%d")
file_name = f"climate_model_rand-seed-{seed}_{today}.csv"
print(f"Saving results as: {file_name}\n\n")
results.to_csv(here("data", "results", file_name), index=False)

results

Variable: ['pre']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['tmp']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['ndvi']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['pre', 'tmp']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['pre', 'ndvi']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['tmp', 'ndvi']
		Pass, one-hot encoding does not make sense with an anomaly model
		Pass, one-hot encoding does not make sense with an anomaly model
Variable: ['pre', 'tmp', 'ndvi']
		Pass, one-hot encoding does n

Unnamed: 0,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,mean_of_val_R2,val_R2,...,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2,demean_test_R2,demean_test_r,demean_test_r2
0,[[pre]],2009,True,False,936,748,188,"[[0.01, 0.0001]]","[[0.3277257775042445, 0.5824141187414437]]",0.584925,...,0.673416,0.607593,0.784196,0.614964,0.108565,0.329581,0.108624,0.173162,0.430317,0.185173
1,[[pre]],2009,False,True,936,748,188,[[0.01]],[[0.08960267369017674]],0.092229,...,0.121168,0.13166,0.369041,0.136192,,,,0.062251,0.410517,0.168525
2,[[pre]],2009,False,False,936,748,188,[[0.01]],[[0.321786743897409]],0.323814,...,0.344695,0.356793,0.597501,0.357007,-0.009273,0.261818,0.068548,0.154425,0.401863,0.161494
3,[[pre]],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.32500092422759935, 0.6005759481906309]]",0.622504,...,0.80448,0.694306,0.83472,0.696757,-0.016113,0.150216,0.022565,0.144196,0.379766,0.144222
4,[[pre]],2016,False,True,432,345,87,[[0.01]],[[0.11208094470342225]],0.128262,...,0.168841,0.14377,0.388156,0.150665,,,,0.070129,0.413145,0.170688
5,[[pre]],2016,False,False,432,345,87,[[0.01]],[[0.317815981897002]],0.343397,...,0.394416,0.377233,0.614321,0.377391,-0.339596,-0.109461,0.011982,-0.208561,-0.059738,0.003569
6,[[tmp]],2009,True,False,936,748,188,"[[0.01, 1e-05]]","[[0.5559933136470949, 0.7032702524465111]]",0.705109,...,0.771933,0.720124,0.851631,0.725275,0.369501,0.608362,0.370104,0.435829,0.678009,0.459696
7,[[tmp]],2009,False,True,936,748,188,[[0.001]],[[0.34346137190266657]],0.363192,...,0.395107,0.379433,0.624822,0.390402,,,,0.182239,0.657212,0.431927
8,[[tmp]],2009,False,False,936,748,188,[[0.01]],[[0.552363547644122]],0.555149,...,0.576582,0.536959,0.733677,0.538281,0.292208,0.549801,0.302282,0.346501,0.588809,0.346696
9,[[tmp]],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.6192899462388842, 0.7594750722874657]]",0.772396,...,0.874615,0.817738,0.905197,0.819381,0.315476,0.569584,0.324426,0.48695,0.749011,0.561018


In [12]:
%%time
### TESTING
variable_groups = ["pre", "tmp", "ndvi"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
hot_encode = True
anomaly = False
n_splits = 5
seed = 42

#########################################     READ DATA    #########################################
data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
data = data.dropna()

# hot_encode = he_anom[0]
# anom = he_anom[1]

keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
    data.set_index(["year", "district"], inplace=True)
    var_cols = data.columns
    data = data[var_cols] - data.groupby(["district"], as_index=True)[
        var_cols
    ].transform("mean")
    data.reset_index(drop=False, inplace=True)
else:
    pass

#########################################    HOT ENCODE    #########################################
if hot_encode:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=seed
)
kfold = KFold(n_splits=n_splits)
folds = []
for i, (train_index, test_index) in enumerate(kfold.split(x_train)):
    folds.append({"fold": i + 1, "": list(test_index)})
folds_df = pd.DataFrame(folds).explode("").set_index("")

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=0,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
    seed=seed,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN TRAIN R2    #########################################
train_split = pd.DataFrame(folds).explode("").drop("", axis=1).set_index(x_train.index)
train_split["split"] = np.repeat("train", len(x_train))
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split = demean_by_group(train_split, predicted="cv_prediction")
train_split["demean_test_prediction"] = np.repeat(np.nan, len(x_train))

#########################################     DE-MEAN TEST R2    #########################################
test_split = pd.DataFrame({"split": np.repeat("test", len(x_test))}, index=x_test.index)
test_split["fold"] = 6
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["test_prediction"] = np.maximum(best_model.predict(x_test), 0)
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split = demean_by_group(test_split, predicted="test_prediction")

d = {
    "variables": [variable_groups],
    "year_start": year_start,
    "hot_encode": hot_encode,
    "anomaly": anomaly,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    "val_R2": r2_score(y_train, val_predictions),
    "val_r": pearsonr(val_predictions, y_train)[0],
    "val_r2": pearsonr(val_predictions, y_train)[0] ** 2,
    "train_R2": r2_score(y_train, train_predictions),
    "train_r": pearsonr(train_predictions, y_train)[0],
    "train_r2": pearsonr(train_predictions, y_train)[0] ** 2,
    "test_R2": r2_score(y_test, test_predictions),
    "test_r": pearsonr(test_predictions, y_test)[0],
    "test_r2": pearsonr(test_predictions, y_test)[0] ** 2,
    "demean_cv_R2": r2_score(
        train_split.demean_log_yield, train_split.demean_cv_prediction
    ),
    "demean_cv_r": pearsonr(
        train_split.demean_log_yield, train_split.demean_cv_prediction
    )[0],
    "demean_cv_r2": pearsonr(
        train_split.demean_log_yield, train_split.demean_cv_prediction
    )[0]
    ** 2,
    "demean_test_R2": r2_score(
        test_split.demean_log_yield, test_split.demean_test_prediction
    ),
    "demean_test_r": pearsonr(
        test_split.demean_log_yield, test_split.demean_test_prediction
    )[0],
    "demean_test_r2": pearsonr(
        test_split.demean_log_yield, test_split.demean_test_prediction
    )[0]
    ** 2,
}

CPU times: user 4min 36s, sys: 32.3 s, total: 5min 8s
Wall time: 21.4 s


In [13]:
# Stack the cross-validated training rows on top of the held-out test rows
# (the test rows carry fold == 6) to inspect actual vs. predicted values and
# their de-meaned counterparts side by side.
pd.concat([train_split, test_split])

Unnamed: 0,fold,split,year,district,yield_mt,log_yield,cv_prediction,demean_log_yield,demean_cv_prediction,demean_test_prediction,test_prediction
132,1,train,2017,Mazabuka,2.823767,0.582491,0.427763,0.040774,0.015981,,
231,1,train,2019,Ndola,2.588668,0.554933,0.503889,0.000000,0.000000,,
31,1,train,2016,Kafue,2.844356,0.584824,0.508687,0.000000,0.000000,,
84,1,train,2017,Masaiti,2.045396,0.483644,0.559393,0.000000,0.000000,,
296,1,train,2020,Kalulushi,3.472055,0.650507,0.609615,0.000000,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...
57,6,test,2016,Kalomo,1.537028,0.404325,,0.000000,,0.000000,0.365072
124,6,test,2017,Mpulungu,2.719739,0.570512,,0.000000,,0.000000,0.553158
24,6,test,2016,Kawambwa,3.011642,0.603322,,0.004752,,0.001051,0.607808
17,6,test,2016,Chipata,1.915801,0.464758,,0.000000,,0.000000,0.485300
