In [1]:
# !pip install -q pyhere p_tqdm glum

In [2]:
%load_ext lab_black

In [3]:
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re
from collections import Counter

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import (
    train_test_split,
    KFold,
    LeaveOneGroupOut,
    cross_val_score,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

from task_modeling_utils import *

In [4]:
def _demean_within_district(frame, column):
    """Return `frame[column]` minus its per-district mean (groups on the "district" column)."""
    return frame[column] - frame.groupby("district")[column].transform("mean")


def climate_model(
    data,
    hot_encode=True,
    anomaly=False,
    variable_groups=["pre", "tmp", "ndvi"],
    index_cols=["year", "district", "yield_mt"],
    year_start=2016,
    n_splits=5,
):
    """Tune and evaluate a ridge regression of log10(yield_mt + 1) on climate features.

    Feature columns are chosen by substring match against `variable_groups`,
    standardized, optionally extended with one-hot district dummies, and split
    80/20 into train/test. `kfold_rr_multi_lambda_tuning` then tunes one
    regularization strength per feature group (each variable group, plus the
    district dummies when `hot_encode` is true). A report with overall and
    district-demeaned R2 scores is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named in `index_cols` (the code reads
        `year`, `district` and `yield_mt` by name) plus the feature columns.
    hot_encode : bool
        If true, one-hot encode `district` and add the dummies as a final,
        separately regularized feature group.
    anomaly : bool
        Only echoed in the printed report; no anomaly transform is applied here.
    variable_groups : list of str
        Patterns passed to `str.contains` to select each group's columns.
    index_cols : list of str
        Identifier/target columns that are excluded from the features.
    year_start : int
        Rows with `year < year_start` are discarded.
    n_splits : int
        Number of K-fold splits used for tuning and for the CV predictions.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If a variable group matches no column, or if `hot_encode` is true and
        "district" is not in `index_cols`.

    Notes
    -----
    The scaler is fit on all rows before the train/test split, so test-set
    statistics influence the features (kept as in the original analysis).
    """
    # Private copies: the list defaults in the signature are shared between calls,
    # and "district" is removed from the index columns below. Mutating the shared
    # default broke every call after the first; mutating a caller's list is no better.
    variable_groups = list(variable_groups)
    index_cols = list(index_cols)

    #########################################     READ DATA    #########################################
    data = data.dropna()

    # One list of matching column names per variable group, in group order.
    group_cols = [
        data.columns[data.columns.to_series().str.contains(var)].tolist()
        for var in variable_groups
    ]
    for var, cols in zip(variable_groups, group_cols):
        if not cols:
            raise ValueError(f"no column of `data` matches variable group {var!r}")

    keep_cols = [*index_cols, *[col for cols in group_cols for col in cols]]

    data = data.loc[:, keep_cols]
    data = data[data.year >= year_start]

    # Identifiers and target on a fresh 0..n-1 index, which lines up with the
    # positional index `data` receives from reset_index() after scaling.
    crop_yield = data.loc[:, index_cols].reset_index(drop=True).copy()
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

    ########################################    STANDARDIZE FEATURES    #########################################
    # Index columns are parked in the index so only feature columns are scaled.
    data = data.set_index(index_cols)
    data_scaled = StandardScaler().fit_transform(data.values)
    data = pd.DataFrame(data_scaled, index=data.index).reset_index()
    data.columns = data.columns.astype(str)

    #########################################    HOT ENCODE    #########################################
    if hot_encode:
        # The dummies replace the "district" column, so it no longer needs dropping.
        index_cols.remove("district")
        data = pd.get_dummies(data, columns=["district"], drop_first=False)

    #########################################     TRAIN/TEST SPLIT    #########################################
    x_all = data.drop(index_cols, axis=1)
    y_all = np.log10(data.yield_mt.to_numpy() + 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x_all, y_all, test_size=0.2, random_state=0
    )

    #########################################     K-FOLD CV    #########################################
    ### SETUP
    tic = time.time()
    kfold = KFold(n_splits=n_splits)
    alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

    # Column ranges [start, end) of each penalty group. Feature columns keep the
    # order of `group_cols`, so the boundaries are running totals of the group
    # sizes (previously hard-coded as 12 columns per group).
    start, end = [], []
    offset = 0
    for cols in group_cols:
        start.append(offset)
        offset += len(cols)
        end.append(offset)
    if hot_encode:
        # District dummies are appended after the climate features by get_dummies.
        start.append(offset)
        end.append(x_train.shape[1])

    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
    best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
        X=x_train,
        y=y_train,
        grid=alphas.get("alpha"),
        n_splits=n_splits,
        start=start,
        end=end,
        static_lam=1,
        verbose=2,
        show_linalg_warning=False,
        fit_model_after_tuning=True,
    )
    ### PREDICT WITH BEST HYPERPARAMETER(S)
    val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
    test_predictions = best_model.predict(x_test)

    #########################################     DE-MEAN R2    #########################################
    # Predictions are clipped at 0 before de-meaning; the plain R2 scores printed
    # below use the unclipped predictions.
    crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

    # Selecting with .loc keeps the row order of x_train / x_test, so the
    # positional assignment of the CV predictions stays aligned.
    train_split = crop_yield.loc[x_train.index].copy()
    train_split["cv_prediction"] = np.maximum(val_predictions, 0)
    train_split["demean_cv_yield"] = _demean_within_district(train_split, "log_yield")
    train_split["demean_cv_prediction"] = _demean_within_district(
        train_split, "cv_prediction"
    )

    test_split = crop_yield.loc[x_test.index].copy()
    test_split["demean_test_yield"] = _demean_within_district(test_split, "log_yield")
    test_split["demean_test_prediction"] = _demean_within_district(
        test_split, "prediction"
    )

    print(
        f"""
Finish:
    Variables: {variable_groups}
    Lambdas:   {best_lambdas}
    One-hot encoding: {hot_encode}
    Anomaly: {anomaly}
    
    Final Val  R2: {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}
    
    Demean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}
    Demean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}
    
    Total time: {(time.time()-tic)/60:0.2f} minutes
    """
    )

In [5]:
# Run the grouped-penalty ridge model on the climate summary table with all
# three variable groups, using observations from 2016 onwards.
climate_model(
    data=pd.read_csv(here("data", "climate", "climate_summary.csv")),
    variable_groups=["pre", "tmp", "ndvi"],
    year_start=2016,
)

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 1: 0.01
	Val R2 1: 0.5526

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 2: 0.001
	Val R2 2: 0.6862

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 3: 0.1
	Val R2 3: 0.6868

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 4: 1e-05
	Val R2 4: 0.7895


Finish:
    Variables: ['pre', 'tmp', 'ndvi']
    Lambdas:   [0.01, 0.001, 0.1, 1e-05]
    One-hot encoding: True
    Anomaly: False
    
    Final Val  R2: 0.8008 
    Final Test R2: 0.8356
    
    Demean Val  R2: 0.1744
    Demean Test R2: 0.6005
    
    Total time: 0.30 minutes
    


In [6]:
### TESTING
# Inline, step-by-step copy of the body of `climate_model`, run at module level
# so the intermediate objects stay in the kernel namespace for inspection.
# The variables below mirror that function's parameters.

data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
hot_encode = True
anomaly = False  # only echoed in the report at the bottom of this cell
variable_groups = ["pre", "tmp", "ndvi"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
n_splits = 5

#########################################     READ DATA    #########################################
data = data.dropna()

keep_cols = []

# Collect, per variable group, the columns whose name contains the group pattern.
for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

# Flatten the per-group lists and put the identifier/target columns first.
keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

# Identifiers and target on a fresh 0..n-1 index, which lines up with the
# positional index `data` receives from reset_index() after scaling below.
crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
# Index columns are moved into the index so only the feature columns are scaled.
# NOTE(review): the scaler is fit on all rows, before the train/test split below.
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
# Rebuilding the frame from the scaled array discards the feature names; the
# columns become "0", "1", ... after the cast to str.
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # get_dummies replaces "district" with dummy columns, so it must no longer be
    # listed among the columns dropped from the features below.
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
kfold = KFold(n_splits=n_splits)
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

# Build the column ranges [start, end) of the feature groups that get their own
# regularization strength; the last range runs to the final column.
i = 0
start = [i]
end = [x_train.shape[1]]

# NOTE(review): assumes every variable group contributes exactly 12 feature
# columns (presumably one per month) - confirm against climate_summary.csv.
for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

# Without district dummies there is no trailing group, so drop the last range.
if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
# `kfold_rr_multi_lambda_tuning` comes from the star import of task_modeling_utils.
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions are clipped at 0 here; the plain R2 scores printed below use the
# unclipped `val_predictions` / `test_predictions`.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Label the training rows and attach year/district/yield/prediction by index.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# De-mean within district: subtract each district's mean over the training rows.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# Test rows have no cross-validated values; NaN placeholders keep the columns
# aligned with `train_split` for the concat below.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

# Added after the concat, so these two columns are not part of `predictions`.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

# variable_groups.append("districts")
# group_lambdas = dict(zip(variable_groups, best_lambdas))
# group_lambdas

print(
    f"""
Finish:
    Variables: {variable_groups}
    Lambdas: {best_lambdas}
    One-hot encoding: {hot_encode}
    Anomaly: {anomaly}

    Final Val  R2: {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}

    Demean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}
    Demean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}

    Total time: {(time.time()-tic)/60:0.2f} minutes
"""
)

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 1: 0.01
	Val R2 1: 0.5526

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 2: 0.001
	Val R2 2: 0.6862

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 3: 0.1
	Val R2 3: 0.6868

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 4: 1e-05
	Val R2 4: 0.7895


Finish:
    Variables: ['pre', 'tmp', 'ndvi']
    Lambdas: [0.01, 0.001, 0.1, 1e-05]
    One-hot encoding: True
    Anomaly: False

    Final Val  R2: 0.8008 
    Final Test R2: 0.8356

    Demean Val  R2: 0.1744
    Demean Test R2: 0.6005

    Total time: 0.30 minutes



# NDVI model

In [7]:
# NDVI-only model: a plain Ridge with a single alpha (tuned by GridSearchCV) on the
# NDVI columns, with district dummies as extra features when `hot_encode` is true.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
# Identifier/target columns; every other column kept below is used as a feature.
drop_cols = ["year", "district", "yield_mt"]
# Features are selected by substring match on "ndvi" in the column name.
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

# Identifiers and target on a fresh 0..n-1 index, which lines up with the
# positional index `climate_df` receives from reset_index() after scaling below.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # get_dummies replaces "district" with dummy columns, so it must no longer be
    # listed among the columns dropped from the features below.
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
# Encoding happens before scaling in this cell, so the district dummy columns are
# standardized together with the NDVI features.
# NOTE(review): the scaler is fit on all rows, before the train/test split below.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Rebuilding the frame from the scaled array discards the feature names; the
# columns become "0", "1", ... after the cast to str.
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions are clipped at 0 here; the plain R2 scores printed below use the
# unclipped `val_predictions` / `test_predictions`.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Label the training rows and attach year/district/yield/prediction by index.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# De-mean within district: subtract each district's mean over the training rows.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# Test rows have no cross-validated values; NaN placeholders keep the columns
# aligned with `train_split` for the concat below.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

# Added after the concat, so these two columns are not part of `predictions`.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.7802 
Test R2: 0.8085 

Demean Val  R2: 0.0921 
Demean Test R2: 0.4394


# Precipitation, Temperature, and NDVI model

In [8]:
# All-feature model: a plain Ridge with a single alpha (tuned by GridSearchCV).
# No column subset is taken here, so every column of the CSV other than
# year/district/yield_mt is used as a feature.
# NOTE(review): presumably those are the precipitation, temperature and NDVI
# columns - confirm against climate_summary.csv.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
# Identifier/target columns; every other column is used as a feature.
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

# Identifiers and target on a fresh 0..n-1 index, which lines up with the
# positional index `climate_df` receives from reset_index() after scaling below.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # get_dummies replaces "district" with dummy columns, so it must no longer be
    # listed among the columns dropped from the features below.
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
# Encoding happens before scaling in this cell, so the district dummy columns are
# standardized together with the climate features.
# NOTE(review): the scaler is fit on all rows, before the train/test split below.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Rebuilding the frame from the scaled array discards the feature names; the
# columns become "0", "1", ... after the cast to str.
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# `ridge_reg` stays in the kernel namespace; the next cell reads its best_score_.
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions are clipped at 0 here; the plain R2 scores printed below use the
# unclipped `val_predictions` / `test_predictions`.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Label the training rows and attach year/district/yield/prediction by index.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# De-mean within district: subtract each district's mean over the training rows.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# Test rows have no cross-validated values; NaN placeholders keep the columns
# aligned with `train_split` for the concat below.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

# Added after the concat, so these two columns are not part of `predictions`.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.8044 
Test R2: 0.8297 

Demean Val  R2: 0.1723 
Demean Test R2: 0.5297


In [9]:
# Mean cross-validated score (scoring="r2") of the best alpha in the grid search.
# Relies on `ridge_reg` left in the kernel namespace by the previously run cell.
ridge_reg.best_score_

0.792567922097443

# NDVI Anomaly Model

In [10]:
# NDVI anomaly model: features and target are expressed as deviations from each
# district's mean, then a plain Ridge with a single alpha is tuned by GridSearchCV.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
# Identifier/target columns; every other column kept below is used as a feature.
drop_cols = ["year", "district", "yield_mt"]
# Features are selected by substring match on "ndvi" in the column name.
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

# Identifiers and target on a fresh 0..n-1 index, which lines up with the
# positional index `climate_df` receives from reset_index() after scaling below.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# year/district/yield_mt sit in the index here, so only the NDVI columns are scaled.
# NOTE(review): the scaler is fit on all rows, before the train/test split below.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Rebuilding the frame from the scaled array discards the feature names; the
# columns become "0", "1", ... after the cast to str.
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace the raw yield with its log10(x + 1) transform before de-meaning.
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
# Every remaining column (scaled features and log yield) gets de-meaned.
var_cols = climate_df.columns
# Subtract each district's mean, computed over all rows (train and test together).
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
# Target is the district-demeaned log yield computed above.
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     COLLECT PREDICTIONS    #########################################
# The model predicts yield anomalies, so "prediction" is on the de-meaned scale
# while crop_yield's "log_yield" column is not; predictions are not clipped here.
crop_yield["prediction"] = best_model.predict(x_all)

# Label the training rows and attach year/district/yield/prediction by index.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# Test rows have no cross-validated prediction.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

# Both scores compare against the anomaly targets (y_train / y_test).
print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.4449
Test R2: 0.4293


# Precipitation, Temperature, and NDVI Anomaly model

In [11]:
# All-feature anomaly model: features and target are expressed as deviations from
# each district's mean, then a plain Ridge with a single alpha is tuned by
# GridSearchCV. No column subset is taken, so every column of the CSV other than
# year/district/yield_mt is used as a feature.
# NOTE(review): presumably those are the precipitation, temperature and NDVI
# columns - confirm against climate_summary.csv.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
# Identifier/target columns; every other column is used as a feature.
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

# Identifiers and target on a fresh 0..n-1 index, which lines up with the
# positional index `climate_df` receives from reset_index() after scaling below.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# year/district/yield_mt sit in the index here, so only the feature columns are scaled.
# NOTE(review): the scaler is fit on all rows, before the train/test split below.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Rebuilding the frame from the scaled array discards the feature names; the
# columns become "0", "1", ... after the cast to str.
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace the raw yield with its log10(x + 1) transform before de-meaning.
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
# Every remaining column (scaled features and log yield) gets de-meaned.
var_cols = climate_df.columns
# Subtract each district's mean, computed over all rows (train and test together).
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
# Target is the district-demeaned log yield computed above.
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     COLLECT PREDICTIONS    #########################################
# The model predicts yield anomalies, so "prediction" is on the de-meaned scale
# while crop_yield's "log_yield" column is not; predictions are not clipped here.
crop_yield["prediction"] = best_model.predict(x_all)

# Label the training rows and attach year/district/yield/prediction by index.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# Test rows have no cross-validated prediction.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

# Both scores compare against the anomaly targets (y_train / y_test).
print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.5284
Test R2: 0.5145
