In [1]:
# !pip install -q pyhere p_tqdm glum

In [1]:
%load_ext lab_black

In [15]:
import time
import os

from pyhere import here
from datetime import date

import numpy as np
import pandas as pd

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

from task_modeling_utils import *

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Define the stratified shuffle split with 100 splits
sss = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=42)

# Define the ridge regression model
ridge = Ridge()

# Define the hyperparameters to tune
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Define the GridSearchCV object with AUC as the scoring metric
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='roc_auc', cv=sss)

# Fit the GridSearchCV object to the data
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding AUC score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best AUC score: ", grid_search.best_score_)

Best hyperparameters:  {'alpha': 0.1}
Best AUC score:  0.9921428571428572


In [4]:
X.shape

(569, 30)

In [11]:
len(next(sss.split(X, y))[0])

455

In [16]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001159,0.003678,0.001722,0.004305,0.01,{'alpha': 0.01},0.99537,0.988426,0.999008,0.999008,...,0.996032,0.979167,0.99041,0.986111,0.991402,0.991733,0.999339,0.992083,0.007115,2
1,0.000906,0.003337,0.000977,0.003371,0.1,{'alpha': 0.1},0.997685,0.99041,0.999669,0.999669,...,0.996362,0.976521,0.991402,0.986442,0.991071,0.992063,0.999669,0.992143,0.00683,1
2,0.000426,0.001591,0.001482,0.00364,1.0,{'alpha': 1},0.997024,0.986442,0.999008,1.0,...,0.99504,0.97586,0.991733,0.983135,0.990079,0.991733,0.999669,0.990241,0.0079,3
3,0.0005,0.0005,0.001549,0.00276,10.0,{'alpha': 10},0.995701,0.988757,0.998016,1.0,...,0.989418,0.982143,0.989418,0.984788,0.990079,0.991733,0.999669,0.989997,0.007299,4
4,0.000979,0.001316,0.001415,0.000602,100.0,{'alpha': 100},0.993717,0.987765,0.997024,0.999008,...,0.983135,0.98578,0.990741,0.982143,0.990079,0.988095,0.998677,0.988072,0.00769,5


In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Define the stratified shuffle split with 100 splits
sss = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=42)

# Define the ridge regression model
ridge = Ridge()

# Define the hyperparameters to tune
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Define lists to store the AUC scores for each split
auc_scores = []

# Perform the hyperparameter tuning with cross-validation
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Define the GridSearchCV object with AUC as the scoring metric
    grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='roc_auc', cv=5)

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)
    
    # Compute the AUC score on the test set
    y_pred = grid_search.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    
    # Store the AUC score
    auc_scores.append(auc)

# Compute the mean and standard deviation of the AUC scores
mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)

# Compute the 95% confidence interval using the normal approximation method
lower_bound = mean_auc - 1.96 * std_auc / np.sqrt(len(auc_scores))
upper_bound = mean_auc + 1.96 * std_auc / np.sqrt(len(auc_scores))

# Print the results
print("Mean AUC score:", mean_auc)
print("Standard deviation of AUC scores:", std_auc)
print("95% confidence interval: [{:.4f}, {:.4f}]".format(lower_bound, upper_bound))


Mean AUC score: 0.991415343915344
Standard deviation of AUC scores: 0.007350636333770812
95% confidence interval: [0.9900, 0.9929]


In [27]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
import numpy as np

# Define hyperparameters to be tuned
alphas = [0.01, 0.1, 1, 10]

# Define number of random samples
n_samples = 100

# Define confidence level
conf_level = 0.95

# Create arrays to store AUC scores and coefficients for each split and alpha value
auc_scores = np.zeros((n_samples, len(alphas)))
coefs = np.zeros((n_samples, len(alphas), X.shape[1]))

# Perform 100 stratified random splits, each with 80% for training and 20% for testing
for i in range(n_samples):
    # Generate a stratified random sample of the data
    X_sample, y_sample = resample(X, y, stratify=y, replace=False)
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, stratify=y_sample)
    
    # Fit a Ridge model to the training data using different values of alpha
    for j, alpha in enumerate(alphas):
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)
        coefs[i, j] = ridge.coef_
        
        # Calculate AUC score for the testing set
        y_pred = ridge.predict(X_test)
        auc_scores[i, j] = roc_auc_score(y_test, y_pred)
    
# Calculate mean and standard deviation of AUC scores for each alpha value
mean_auc = np.mean(auc_scores, axis=0)
std_auc = np.std(auc_scores, axis=0)

# Calculate confidence intervals for each alpha value
z = 1.96 # 95% confidence level
ci_lower = mean_auc - z * std_auc / np.sqrt(n_samples)
ci_upper = mean_auc + z * std_auc / np.sqrt(n_samples)

# Find the best performing alpha value based on the mean AUC score
best_alpha = alphas[np.argmax(mean_auc)]

print(f"Best alpha: {best_alpha}")
print(f"Mean AUC scores: {mean_auc}")
print(f"Standard deviations of AUC scores: {std_auc}")
print(f"95% Confidence intervals: ({ci_lower}, {ci_upper})")

Best alpha: 0.1
Mean AUC scores: [0.99340278 0.99382606 0.9921627  0.99170966]
Standard deviations of AUC scores: [0.0060642  0.00591905 0.00711855 0.00673244]
95% Confidence intervals: ([0.99221419 0.99266593 0.99076746 0.9903901 ], [0.99459136 0.99498619 0.99355793 0.99302921])


In [32]:
mean_auc

array([0.99340278, 0.99382606, 0.9921627 , 0.99170966])

In [38]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from scipy.stats import sem, t

# Define your features and targets here
np.random.seed(42)
num_samples = 1000
num_features = 10
features = np.random.randn(num_samples, num_features)
targets = np.random.randint(0, 2, size=num_samples)

# Define the hyperparameters to be tested
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Define the number of stratified random splits to perform
num_splits = 100

# Define the array to be used for stratification (not in features or targets)
strat_array = np.random.randint(0, 5, size=len(targets))

# Initialize lists to store scores and best hyperparameters for each split
split_scores = []
split_best_params = []

# Loop over the specified number of splits
for i in range(num_splits):
    # Split the data into training and testing sets using stratified random sampling
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=i)
    train_index, test_index = next(split.split(features, strat_array))
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # Fit the ridge regression model with hyperparameter tuning
    ridge = Ridge()
    grid_search = GridSearchCV(ridge, param_grid, scoring='roc_auc', cv=split, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Record the best hyperparameters and score for this split
    split_scores.append(grid_search.best_score_)
    split_best_params.append(grid_search.best_params_)

# Calculate the mean and standard error of the scores
mean_score = np.mean(split_scores)
std_err_score = sem(split_scores)

# Calculate the 95% confidence interval for the mean score
conf_interval = t.interval(0.95, num_splits - 1, loc=mean_score, scale=std_err_score)

print(f"Mean score: {mean_score:.3f}")
print(f"Standard error: {std_err_score:.3f}")
print(f"95% Confidence interval: {conf_interval}")



Mean score: 0.498
Standard error: 0.004
95% Confidence interval: (0.4887089394236692, 0.5064036843558082)


In [43]:
split_scores

[0.5807157368338802,
 0.5167994999218628,
 0.5632812500000001,
 0.5210189092045633,
 0.474202626641651,
 0.5446163463041099,
 0.43483023001095295,
 0.5656660412757974,
 0.4613220815752461,
 0.4427254258477887,
 0.44928895139865604,
 0.49070167213627136,
 0.48983739837398377,
 0.4738240350054696,
 0.5202375371151742,
 0.5404687499999999,
 0.49624765478424016,
 0.46109375,
 0.47585560243788083,
 0.5509693558474047,
 0.5153508771929824,
 0.4907754846779237,
 0.4911704953899047,
 0.5364900765744647,
 0.491875,
 0.5921237693389593,
 0.45859375,
 0.5103922487888732,
 0.3809375,
 0.46038443506797944,
 0.527673545966229,
 0.4238162212845757,
 0.4243277048155097,
 0.46210345366463507,
 0.43980612883051906,
 0.49773402094077196,
 0.5,
 0.5563369276449445,
 0.54875,
 0.39453124999999994,
 0.5015625,
 0.49445225816533833,
 0.4578125,
 0.4677293327082357,
 0.44928895139865593,
 0.5519612439443663,
 0.521875,
 0.5866166353971232,
 0.49054539771839345,
 0.55953125,
 0.4921875,
 0.44155844155844154,
 

In [33]:
#########################################     K-FOLD SPLIT    #########################################
x_all = features.drop(drop_cols, axis = 1) 
y_all = features.yield_mt
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

gss = GroupShuffleSplit(n_splits=2, train_size=.8, random_state=seed)


#########################################     K-FOLD CV    ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold  = KFold()
ridge  = Ridge()   
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)   
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

NameError: name 'features' is not defined

In [19]:
from sklearn.model_selection import StratifiedGroupKFold

X = list(range(18))
y = [1] * 6 + [0] * 12
groups = [1, 2, 3, 3, 4, 4, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 6, 6]
sgkf = StratifiedGroupKFold(n_splits=2)
for train, test in sgkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))

[ 1  4  5  8  9 11 15 16 17] [ 0  2  3  6  7 10 12 13 14]
[ 0  2  3  6  7 10 12 13 14] [ 1  4  5  8  9 11 15 16 17]


In [7]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.ones(shape=(8, 2))
y = np.ones(shape=(8, 1))
groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])
print(groups.shape)

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
gss.get_n_splits()

print(gss)

for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}, group={groups[train_index]}")
    print(f"  Test:  index={test_index}, group={groups[test_index]}")

(8,)
GroupShuffleSplit(n_splits=1, random_state=42, test_size=None, train_size=0.8)
Fold 0:
  Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3]
  Test:  index=[0 1], group=[1 1]


In [16]:
list(enumerate(gss.split(X, y, groups)))

[(0, (array([2, 3, 4, 5, 6, 7], dtype=int64), array([0, 1], dtype=int64)))]

In [9]:
list(gss.split(X, y, groups))

[(array([2, 3, 4, 5, 6, 7], dtype=int64), array([0, 1], dtype=int64))]

In [14]:
x_train_index, x_test_index = next(gss.split(X, y, groups))
x_train, x_test

(array([2, 3, 4, 5, 6, 7], dtype=int64), array([0, 1], dtype=int64))

In [3]:
def climate_model(
    variable_groups=["pre", "tmp", "ndvi"],
    # he_anom=[True, False],
    hot_encode=True,
    anomaly=False,
    index_cols=["year", "district", "yield_mt"],
    year_start=2016,
    n_splits=5,
):
    #########################################     READ DATA    #########################################
    data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
    data = data.dropna()

    # hot_encode = he_anom[0]
    # anom = he_anom[1]

    keep_cols = []

    for var in variable_groups:
        tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
        keep_cols.append(tmp)

    keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

    data = data.loc[:, keep_cols]

    data = data[data.year >= year_start]

    crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

    ########################################    STANDARDIZE FEATURES    #########################################
    data = data.set_index(index_cols)
    data_scaled = StandardScaler().fit_transform(data.values)
    data = pd.DataFrame(data_scaled, index=data.index).reset_index()
    data.columns = data.columns.astype(str)

    #########################################     CALCULATE ANOMALY   #########################################
    if anomaly:
        data["yield_mt"] = np.log10(data.yield_mt.to_numpy() + 1)
        data.set_index(["year", "district"], inplace=True)
        var_cols = data.columns
        data = data[var_cols] - data.groupby(["district"], as_index=True)[
            var_cols
        ].transform("mean")
        data.reset_index(drop=False, inplace=True)
    else:
        pass

    #########################################    HOT ENCODE    #########################################
    if hot_encode:
        index_cols.remove("district")
        data = pd.get_dummies(data, columns=["district"], drop_first=False)
    else:
        pass

    #########################################     K-FOLD SPLIT    #########################################
    x_all = data.drop(index_cols, axis=1)
    y_all = np.log10(data.yield_mt.to_numpy() + 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x_all, y_all, test_size=0.2, random_state=0
    )

    #########################################     K-FOLD CV    #########################################
    ### SETUP
    tic = time.time()
    kfold = KFold(n_splits=n_splits)
    alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

    i = 0
    start = [i]
    end = [x_train.shape[1]]

    for var in variable_groups:
        i += 12
        start.append(i)
        end.append(i)
    start.sort()
    end.sort()

    if not hot_encode:
        start = start[0:-1]
        end = end[0:-1]

    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
    best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
        X=x_train,
        y=y_train,
        grid=alphas.get("alpha"),
        n_splits=n_splits,
        start=start,
        end=end,
        static_lam=1,
        verbose=0,
        show_linalg_warning=False,
        fit_model_after_tuning=True,
    )
    ### PREDICT WITH BEST HYPERPARAMETER(S)
    val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
    train_predictions = best_model.predict(x_train)
    test_predictions = best_model.predict(x_test)

    #########################################     DE-MEAN R2    #########################################
    crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

    train_split = pd.DataFrame(
        np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
    )
    train_split = train_split.join(
        crop_yield.copy()[crop_yield.index.isin(x_train.index)]
    )
    train_split["cv_prediction"] = np.maximum(val_predictions, 0)
    train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
        "district"
    )["log_yield"].transform("mean")
    train_split["demean_cv_prediction"] = train_split[
        "cv_prediction"
    ] - train_split.groupby("district")["cv_prediction"].transform("mean")

    test_split = pd.DataFrame(
        np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
    )
    test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
    test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
    test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
    test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

    predictions = pd.concat([train_split, test_split])

    test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
        "district"
    )["log_yield"].transform("mean")
    test_split["demean_test_prediction"] = test_split[
        "prediction"
    ] - test_split.groupby("district")["prediction"].transform("mean")

    print(
        f"""
\t\tFinish:
\t\t\tVariables: {variable_groups}
\t\t\tLambdas:   {best_lambdas}
\t\t\tOne-hot encoding: {hot_encode}
\t\t\tAnomaly: {anomaly}

\t\t\tFinal Val  R2: {r2_score(y_train, val_predictions):0.4f} 
\t\t\tFinal Test R2: {r2_score(y_test, test_predictions):0.4f}

\t\t\tDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}
\t\t\tDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}

\t\t\tTotal time: {(time.time()-tic)/60:0.2f} minutes
    """
    )
    d = {
        "variables": variable_groups,
        "year_start": year_start,
        "hot_encode": hot_encode,
        "anomaly": anomaly,
        "total_n": len(x_all),
        "train_n": len(x_train),
        "test_n": len(x_test),
        "best_reg_param": [best_lambdas],
        "mean_of_val_R2": [best_scores],
        "val_R2": r2_score(y_train, val_predictions),
        "val_r": pearsonr(val_predictions, y_train)[0],
        "val_r2": pearsonr(val_predictions, y_train)[0] ** 2,
        "train_R2": r2_score(y_train, train_predictions),
        "train_r": pearsonr(train_predictions, y_train)[0],
        "train_r2": pearsonr(train_predictions, y_train)[0] ** 2,
        "test_R2": r2_score(y_test, test_predictions),
        "test_r": pearsonr(test_predictions, y_test)[0],
        "test_r2": pearsonr(test_predictions, y_test)[0] ** 2,
        "demean_cv_R2": r2_score(
            train_split.demean_cv_yield, train_split.demean_cv_prediction
        ),
        "demean_cv_r": pearsonr(
            train_split.demean_cv_yield, train_split.demean_cv_prediction
        )[0],
        "demean_cv_r2": pearsonr(
            train_split.demean_cv_yield, train_split.demean_cv_prediction
        )[0]
        ** 2,
    }
    return d

In [4]:
variables = ["pre", "tmp", "ndvi"]
HE = [True, False]
# anom = [True, False]

clim = list(itertools.combinations(variables, 2))
clim.append([variables[0]])
clim.append([variables[1]])
clim.append([variables[2]])
clim.append(variables)
clim = [list(elem) for elem in clim]
clim.sort(key=len)
clim

[['pre'],
 ['tmp'],
 ['ndvi'],
 ['pre', 'tmp'],
 ['pre', 'ndvi'],
 ['tmp', 'ndvi'],
 ['pre', 'tmp', 'ndvi']]

In [18]:
results = pd.read_csv(here("data", "results", "climate_model_2023-03-22.csv"))
# results

In [5]:
output = []
for var in clim:
    print(f"Variable: {var}")
    for yr in [2009, 2016]:
        print(f"\tYear start: {yr}")
        for he in [True, False]:
            print(f"\tHE: {he}")
            for anom in [True, False]:
                print(f"\tAnomaly: {anom}")
                if he and anom:
                    print(
                        "\t\tPass, one-hot encoding does not make sense with an anomaly model"
                    )
                else:
                    out = climate_model(
                        variable_groups=var,
                        year_start=yr,
                        hot_encode=he,
                        anomaly=anom,
                        index_cols=["year", "district", "yield_mt"],
                    )
                    output.append(out)

results = pd.DataFrame(output)

Variable: ['pre']
	Year start: 2009
	HE: True
	Anomaly: True
		Pass, one-hot encoding does not make sense with an anomaly model
	Anomaly: False

		Finish:
			Variables: ['pre']
			Lambdas:   [0.01, 0.0001]
			One-hot encoding: True
			Anomaly: False

			Final Val  R2: 0.5849 
			Final Test R2: 0.6076

			Demean Val  R2: -0.0879
			Demean Test R2: 0.1732

			Total time: 0.14 minutes
    
	HE: False
	Anomaly: True

		Finish:
			Variables: ['pre']
			Lambdas:   [0.01]
			One-hot encoding: False
			Anomaly: True

			Final Val  R2: 0.0922 
			Final Test R2: 0.1317

			Demean Val  R2: 0.0536
			Demean Test R2: 0.0623

			Total time: 0.01 minutes
    
	Anomaly: False

		Finish:
			Variables: ['pre']
			Lambdas:   [0.01]
			One-hot encoding: False
			Anomaly: False

			Final Val  R2: 0.3238 
			Final Test R2: 0.3568

			Demean Val  R2: -0.0444
			Demean Test R2: 0.1544

			Total time: 0.01 minutes
    
	Year start: 2016
	HE: True
	Anomaly: True
		Pass, one-hot encoding does not make sense with

In [15]:
mask = results.anomaly == True
cols = ["demean_cv_R2", "demean_cv_r", "demean_cv_r2"]
results.loc[mask, cols] = np.nan
results

Unnamed: 0,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,mean_of_val_R2,val_R2,...,val_r2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2
0,['pre'],2009,True,False,936,748,188,"[[0.01, 0.0001]]","[[0.3277257775042445, 0.5824141187414437]]",0.584925,...,0.586982,0.673345,0.820619,0.673416,0.607593,0.784196,0.614964,-0.087939,0.086422,0.007469
1,['pre'],2009,False,True,936,748,188,[[0.01]],[[0.08960267369017674]],0.092229,...,0.093043,0.120812,0.348091,0.121168,0.13166,0.369041,0.136192,,,
2,['pre'],2009,False,False,936,748,188,[[0.01]],[[0.321786743897409]],0.323814,...,0.324037,0.344588,0.587108,0.344695,0.356793,0.597501,0.357007,-0.044408,0.242147,0.058635
3,['pre'],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.32500092422759935, 0.6005759481906309]]",0.622504,...,0.631423,0.802148,0.896928,0.80448,0.694306,0.83472,0.696757,-0.627066,-0.371544,0.138045
4,['pre'],2016,False,True,432,345,87,[[0.01]],[[0.11208094470342225]],0.128262,...,0.128937,0.16773,0.410903,0.168841,0.14377,0.388156,0.150665,,,
5,['pre'],2016,False,False,432,345,87,[[0.01]],[[0.317815981897002]],0.343397,...,0.345237,0.394337,0.628025,0.394416,0.377233,0.614321,0.377391,-0.329451,-0.137914,0.01902
6,['tmp'],2009,True,False,936,748,188,"[[0.01, 1e-05]]","[[0.5559933136470949, 0.7032702524465111]]",0.705109,...,0.706553,0.771816,0.878597,0.771933,0.720124,0.851631,0.725275,0.232497,0.496481,0.246493
7,['tmp'],2009,False,True,936,748,188,[[0.001]],[[0.34346137190266657]],0.363192,...,0.363984,0.395077,0.628576,0.395107,0.379433,0.624822,0.390402,,,
8,['tmp'],2009,False,False,936,748,188,[[0.01]],[[0.552363547644122]],0.555149,...,0.555172,0.57641,0.75933,0.576582,0.536959,0.733677,0.538281,0.2696,0.535722,0.286998
9,['tmp'],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.6192899462388842, 0.7594750722874657]]",0.772396,...,0.77394,0.873456,0.935209,0.874615,0.817738,0.905197,0.819381,0.021033,0.32342,0.1046


In [9]:
today = date.today().strftime("%Y-%m-%d")
file_name = f"climate_model_{today}.csv"
print(f"Saving results as: {file_name}\n\n")
results.to_csv(here("data", "results", file_name), index=False)

Saving results as: climate_model_2023-03-22.csv




In [27]:
# climate_model(
#     pd.read_csv(here("data", "climate", "climate_summary.csv")),
#     year_start=2016,
#     variable_groups=["pre", "tmp", "ndvi"],
# )

In [29]:
paramlist = list(itertools.product([True, False], [True, False]))
paramlist = [list(elem) for elem in paramlist]
paramlist = list(itertools.product(clim, paramlist))
# paramlist

In [7]:
# output = []
# for ls in clim:
#     print(ls)
#     for he in HE:
#         print(he)
#         out = climate_model(
#             variable_groups=ls,
#             hot_encode=he,
#             index_cols=["year", "district", "yield_mt"],
#         )
#         output.append(out)
# results = pd.DataFrame(output)

In [8]:
# results

In [6]:
paramlist = (i for i in paramlist)

In [None]:
def climate_model(variable_groups, he_anom):
    print(variable_groups, flush=True)
    
if __name__ == "__main__":
    multiprocessing.Pool().starmap(climate_model, paramlist)
    mult

In [None]:
%%time    
##### With progress bar
workers = os.cpu_count()
if __name__ == "__main__":
    multiprocessing.Pool().starmap(climate_model, paramlist)
    # with multiprocessing.Pool(processes=workers) as pool:
    #     output = pool.starmap(climate_model, paramlist)
    # output = p_tqdm.p_umap(climate_model, paramlist, num_cpus=workers)
    # results = pd.concat(output).reset_index(drop=True)
    # today = date.today().strftime("%Y-%m-%d")
    # file_name = f'results_{today}.csv'
    # print(f"Saving results as: {file_name}\n\n")           
    # results.to_csv(here("data","results", file_name), index=False)

In [6]:
### TESTING

data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
hot_encode = True
anomaly = False
variable_groups = ["pre", "tmp", "ndvi"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
n_splits = 5

#########################################     READ DATA    #########################################
data = data.dropna()

keep_cols = []

for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
kfold = KFold(n_splits=n_splits)
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

# variable_groups.append("districts")
# group_lambdas = dict(zip(variable_groups, best_lambdas))
# group_lambdas

print(
    f"""
Finish:
    Variables: {variable_groups}
    Lambdas: {best_lambdas}
    One-hot encoding: {hot_encode}
    Anomaly: {anomaly}

    Final Val  R2: {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}

    Demean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}
    Demean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}

    Total time: {(time.time()-tic)/60:0.2f} minutes
"""
)

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 1: 0.01
	Val R2 1: 0.5526

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 2: 0.001
	Val R2 2: 0.6862

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 3: 0.1
	Val R2 3: 0.6868

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 4: 1e-05
	Val R2 4: 0.7895


Finish:
    Variables: ['pre', 'tmp', 'ndvi']
    Lambdas: [0.01, 0.001, 0.1, 1e-05]
    One-hot encoding: True
    Anomaly: False

    Final Val  R2: 0.8008 
    Final Test R2: 0.8356

    Demean Val  R2: 0.1744
    Demean Test R2: 0.6005

    Total time: 0.30 minutes



# NDVI model

In [7]:
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.7802 
Test R2: 0.8085 

Demean Val  R2: 0.0921 
Demean Test R2: 0.4394


# Precipitation, Temperature, and NDVI model

In [8]:
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.8044 
Test R2: 0.8297 

Demean Val  R2: 0.1723 
Demean Test R2: 0.5297


In [9]:
ridge_reg.best_score_

0.792567922097443

# NDVI Anomaly Model

In [10]:
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
var_cols = climate_df.columns
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.4449
Test R2: 0.4293


# Precipitation, Temperature, and NDVI  Anomaly model

In [11]:
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
var_cols = climate_df.columns
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.5284
Test R2: 0.5145
