In [1]:
# !pip install -q pyhere p_tqdm glum

In [3]:
%load_ext lab_black

In [2]:
import time
import os

from pyhere import here
from datetime import date

import numpy as np
import pandas as pd

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_predict
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

from task_modeling_utils import *

In [26]:
def _demean_by_district(frame, column):
    """Return ``frame[column]`` minus the mean of that column within each district of ``frame``."""
    return frame[column] - frame.groupby("district")[column].transform("mean")


def climate_model(
    variable_groups=["pre", "tmp", "ndvi"],
    hot_encode=True,
    anomaly=False,
    index_cols=["year", "district", "yield_mt"],
    year_start=2016,
    n_splits=5,
):
    """Fit a grouped-penalty ridge model of log10(yield + 1) on climate features.

    Reads ``data/climate/climate_summary.csv``, keeps the columns whose names
    contain any entry of ``variable_groups``, standardizes them, optionally adds
    district dummies, tunes one regularization strength per column group with
    ``kfold_rr_multi_lambda_tuning`` and reports validation / train / test scores.

    Parameters
    ----------
    variable_groups : list of str
        Substrings used to select feature columns; one penalty group per entry.
    hot_encode : bool
        If True, ``district`` is one-hot encoded and appended as an extra
        (last) penalty group.
    anomaly : bool
        Only echoed in the printout and the returned dict; it does not change
        the computation in this function.
    index_cols : list of str
        Non-feature columns. Must contain ``"district"`` and ``"yield_mt"``.
        The caller's list is never modified.
    year_start : int
        Rows with ``year < year_start`` are dropped.
    n_splits : int
        Number of folds for tuning and for the out-of-fold predictions.

    Returns
    -------
    dict
        Run settings, sample counts, the selected lambdas with their scores, and
        R2 / Pearson r / r**2 for the validation, train and test predictions,
        plus district-demeaned validation metrics.

    Notes
    -----
    * The scaler is fit on all rows before the train/test split, as in the
      original analysis.
    * The penalty-group boundaries assume every variable group contributes
      exactly 12 columns, placed in ``variable_groups`` order.
      NOTE(review): presumably 12 monthly columns per variable - confirm
      against the CSV.
    """
    #########################################     READ DATA    #########################################
    data = pd.read_csv(here("data", "climate", "climate_summary.csv")).dropna()

    feature_cols = [
        col
        for var in variable_groups
        for col in data.columns[data.columns.to_series().str.contains(var)].tolist()
    ]
    data = data.loc[:, [*index_cols, *feature_cols]]
    data = data[data.year >= year_start]

    # Row order here matches the re-indexed feature table built below, so the two
    # frames can later be joined on their 0..n-1 index.
    crop_yield = data.copy().loc[:, list(index_cols)].reset_index(drop=True)
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

    ########################################    STANDARDIZE FEATURES    #########################################
    data = data.set_index(index_cols)
    data_scaled = StandardScaler().fit_transform(data.values)
    data = pd.DataFrame(data_scaled, index=data.index).reset_index()
    data.columns = data.columns.astype(str)

    #########################################    HOT ENCODE    #########################################
    # Work on a private copy: removing "district" from `index_cols` itself would
    # alter the shared default list (and any list passed in by the caller), making
    # the next call fail with "list.remove(x): x not in list".
    non_feature_cols = list(index_cols)
    if hot_encode:
        non_feature_cols.remove("district")
        data = pd.get_dummies(data, columns=["district"], drop_first=False)

    #########################################     TRAIN / TEST SPLIT    #########################################
    x_all = data.drop(non_feature_cols, axis=1)
    y_all = np.log10(data.yield_mt.to_numpy() + 1)
    x_train, x_test, y_train, y_test = train_test_split(
        x_all, y_all, test_size=0.2, random_state=0
    )

    #########################################     K-FOLD CV    #########################################
    ### SETUP
    tic = time.time()
    kfold = KFold(n_splits=n_splits)
    alpha_grid = np.logspace(-8, 8, base=10, num=17)

    # Column ranges of the penalty groups: one 12-column slice per variable group,
    # followed (when one-hot encoding) by a final slice that runs to the last column.
    cols_per_group = 12
    boundaries = [cols_per_group * k for k in range(1, len(variable_groups) + 1)]
    start = [0, *boundaries]
    end = sorted([*boundaries, x_train.shape[1]])
    if not hot_encode:
        # No dummy columns, so there is no trailing district group.
        start, end = start[:-1], end[:-1]

    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
    best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
        X=x_train,
        y=y_train,
        grid=alpha_grid,
        n_splits=n_splits,
        start=start,
        end=end,
        static_lam=1,
        verbose=0,
        show_linalg_warning=False,
        fit_model_after_tuning=True,
    )

    ### PREDICT WITH BEST HYPERPARAMETER(S)
    val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
    train_predictions = best_model.predict(x_train)
    test_predictions = best_model.predict(x_test)

    #########################################     DE-MEAN R2    #########################################
    # Predictions used for the de-meaned scores are clipped at zero.
    crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

    train_split = pd.DataFrame(
        np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
    ).join(crop_yield[crop_yield.index.isin(x_train.index)])
    # The frame keeps x_train's row order, so the positional assignment lines up.
    train_split["cv_prediction"] = np.maximum(val_predictions, 0)
    train_split["demean_cv_yield"] = _demean_by_district(train_split, "log_yield")
    train_split["demean_cv_prediction"] = _demean_by_district(
        train_split, "cv_prediction"
    )

    test_split = pd.DataFrame(
        np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
    ).join(crop_yield[crop_yield.index.isin(x_test.index)])
    test_split["demean_test_yield"] = _demean_by_district(test_split, "log_yield")
    test_split["demean_test_prediction"] = _demean_by_district(
        test_split, "prediction"
    )

    #########################################     SCORES    #########################################
    # Each metric is computed once and reused in both the printout and the result.
    val_R2 = r2_score(y_train, val_predictions)
    train_R2 = r2_score(y_train, train_predictions)
    test_R2 = r2_score(y_test, test_predictions)
    val_r = pearsonr(val_predictions, y_train)[0]
    train_r = pearsonr(train_predictions, y_train)[0]
    test_r = pearsonr(test_predictions, y_test)[0]
    demean_cv_R2 = r2_score(
        train_split.demean_cv_yield, train_split.demean_cv_prediction
    )
    demean_cv_r = pearsonr(
        train_split.demean_cv_yield, train_split.demean_cv_prediction
    )[0]
    demean_test_R2 = r2_score(
        test_split.demean_test_yield, test_split.demean_test_prediction
    )

    print(
        f"""
Finish:
    Variables: {variable_groups}
    Lambdas:   {best_lambdas}
    One-hot encoding: {hot_encode}
    Anomaly: {anomaly}
    
    Final Val  R2: {val_R2:0.4f} 
    Final Test R2: {test_R2:0.4f}
    
    Demean Val  R2: {demean_cv_R2:0.4f}
    Demean Test R2: {demean_test_R2:0.4f}
    
    Total time: {(time.time()-tic)/60:0.2f} minutes
    """
    )
    d = {
        "variables": variable_groups,
        "year_start": year_start,
        "hot_encode": hot_encode,
        "anomaly": anomaly,
        "total_n": len(x_all),
        "train_n": len(x_train),
        "test_n": len(x_test),
        "best_reg_param": [best_lambdas],
        "mean_of_val_R2": [best_scores],
        "val_R2": val_R2,
        "val_r": val_r,
        "val_r2": val_r**2,
        "train_R2": train_R2,
        "train_r": train_r,
        "train_r2": train_r**2,
        "test_R2": test_R2,
        "test_r": test_r,
        "test_r2": test_r**2,
        "demean_cv_R2": demean_cv_R2,
        "demean_cv_r": demean_cv_r,
        "demean_cv_r2": demean_cv_r**2,
    }
    return d

In [27]:
# climate_model(
#     pd.read_csv(here("data", "climate", "climate_summary.csv")),
#     year_start=2016,
#     variable_groups=["pre", "tmp", "ndvi"],
# )

In [28]:
variables = ["pre", "tmp", "ndvi"]
HE = [True, False]
anom = [True, False]

# Every non-empty subset of the variable groups as its own list, ordered from the
# single-variable subsets up to the full set.
clim = [
    list(subset)
    for size in range(1, len(variables) + 1)
    for subset in itertools.combinations(variables, size)
]
# clim

In [29]:
# All (hot_encode, anomaly) flag pairs, each as a two-element list ...
paramlist = [[first, second] for first in (True, False) for second in (True, False)]
# ... crossed with every variable-group subset: one (groups, flags) tuple per run.
paramlist = [(groups, flags) for groups in clim for flags in paramlist]
# paramlist

In [30]:
# Run the model once per variable-group subset, with and without district dummies,
# and collect the returned metric dicts into one table.
output = []
for ls in clim:
    print(ls)
    for he in HE:
        print(he)
        output.append(
            climate_model(
                variable_groups=ls,
                hot_encode=he,
                # A fresh list per call, not the function's shared default.
                index_cols=["year", "district", "yield_mt"],
            )
        )
results = pd.DataFrame(output)

['pre']
True

Finish:
    Variables: ['pre']
    Lambdas:   [0.01, 0.001]
    One-hot encoding: True
    Anomaly: False
    
    Final Val  R2: 0.6225 
    Final Test R2: 0.6943
    
    Demean Val  R2: -0.6271
    Demean Test R2: 0.1442
    
    Total time: 0.16 minutes
    
False

Finish:
    Variables: ['pre']
    Lambdas:   [0.01]
    One-hot encoding: False
    Anomaly: False
    
    Final Val  R2: 0.3434 
    Final Test R2: 0.3772
    
    Demean Val  R2: -0.3295
    Demean Test R2: -0.2086
    
    Total time: 0.02 minutes
    
['tmp']
True

Finish:
    Variables: ['tmp']
    Lambdas:   [0.01, 0.001]
    One-hot encoding: True
    Anomaly: False
    
    Final Val  R2: 0.7724 
    Final Test R2: 0.8177
    
    Demean Val  R2: 0.0210
    Demean Test R2: 0.4869
    
    Total time: 0.16 minutes
    
False

Finish:
    Variables: ['tmp']
    Lambdas:   [0.01]
    One-hot encoding: False
    Anomaly: False
    
    Final Val  R2: 0.6322 
    Final Test R2: 0.6490
    
    Demean V

In [31]:
# Display the table of per-run metrics assembled from the `climate_model` outputs.
results

Unnamed: 0,variables,year_start,hot_encode,anomaly,total_n,train_n,test_n,best_reg_param,mean_of_val_R2,val_R2,...,val_r2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2
0,[pre],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.3250009242275994, 0.6005759481906308]]",0.622504,...,0.631423,0.802148,0.896928,0.80448,0.694306,0.83472,0.696757,-0.627066,-0.371544,0.138045
1,[pre],2016,False,False,432,345,87,[[0.01]],[[0.31781598189700205]],0.343397,...,0.345237,0.394337,0.628025,0.394416,0.377233,0.614321,0.377391,-0.329451,-0.137914,0.01902
2,[tmp],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.6192899462388841, 0.7594750722874657]]",0.772396,...,0.77394,0.873456,0.935209,0.874615,0.817738,0.905197,0.819381,0.021033,0.32342,0.1046
3,[tmp],2016,False,False,432,345,87,[[0.01]],[[0.6151863495642378]],0.632216,...,0.632265,0.657021,0.810916,0.657585,0.64902,0.808263,0.653289,0.289468,0.540687,0.292342
4,[ndvi],2016,True,False,432,345,87,"[[0.01, 0.001]]","[[0.25744743523510616, 0.7632153019026009]]",0.776423,...,0.776574,0.875685,0.937197,0.878338,0.804603,0.898677,0.80762,0.037282,0.303158,0.091905
5,[ndvi],2016,False,False,432,345,87,[[0.01]],[[0.244861108267257]],0.271104,...,0.2727,0.31741,0.563545,0.317583,0.324052,0.588325,0.346126,0.29615,0.557405,0.3107
6,"[pre, tmp]",2016,True,False,432,345,87,"[[0.1, 0.01, 0.01]]","[[0.5052548383406703, 0.6738933818363734, 0.76...",0.774578,...,0.775854,0.845276,0.922097,0.850264,0.801093,0.898263,0.806877,0.165001,0.456145,0.208068
7,"[pre, tmp]",2016,False,False,432,345,87,"[[0.1, 0.01]]","[[0.5009054966917378, 0.6709133958104069]]",0.686913,...,0.687049,0.724484,0.851868,0.725679,0.722417,0.853195,0.727942,0.308743,0.570704,0.325703
8,"[pre, ndvi]",2016,True,False,432,345,87,"[[0.01, 0.001, 0.0001]]","[[0.40933173229183684, 0.4629392954787267, 0.7...",0.786272,...,0.794674,0.903675,0.950671,0.903776,0.800254,0.8952,0.801383,0.110937,0.437815,0.191682
9,"[pre, ndvi]",2016,False,False,432,345,87,"[[0.01, 0.001]]","[[0.40261554193944293, 0.4559516999439376]]",0.483837,...,0.487112,0.562675,0.750238,0.562856,0.491529,0.707057,0.49993,0.105026,0.41856,0.175193


In [32]:
# Write the metrics table to data/results, stamped with today's date (YYYY-MM-DD).
today = date.today().isoformat()
file_name = f"climate_model_{today}.csv"
results_path = here("data", "results", file_name)
print(f"Saving results as: {file_name}\n\n")
results.to_csv(results_path, index=False)

Saving results as: climate_model_2023-03-20.csv




In [6]:
# Keep the parameter combinations as a concrete list rather than a generator: a
# generator is consumed by the first `starmap` call, so any later cell (or a re-run)
# would silently map over nothing. Re-running this cell is also harmless.
paramlist = list(paramlist)

In [None]:
def climate_model(variable_groups, he_anom):
    """Multiprocessing smoke-test stub: print the variable groups it receives.

    NOTE: this re-uses the name of the full `climate_model` defined earlier in the
    notebook and therefore replaces it in the kernel once this cell runs; re-run
    the earlier definition before fitting real models again.

    Parameters
    ----------
    variable_groups : list of str
        First element of a `paramlist` entry.
    he_anom : list
        Second element of a `paramlist` entry; unused by the stub.
    """
    print(variable_groups, flush=True)


if __name__ == "__main__":
    # The context manager shuts the worker pool down once every task has returned.
    with multiprocessing.Pool() as pool:
        pool.starmap(climate_model, paramlist)

In [None]:
%%time    
##### With progress bar
# One worker process per CPU reported by the OS.
workers = os.cpu_count()
if __name__ == "__main__":
    # Size the pool explicitly from `workers` and let the context manager shut the
    # worker processes down once every task has returned.
    with multiprocessing.Pool(processes=workers) as pool:
        pool.starmap(climate_model, paramlist)
    # output = p_tqdm.p_umap(climate_model, paramlist, num_cpus=workers)
    # results = pd.concat(output).reset_index(drop=True)
    # today = date.today().strftime("%Y-%m-%d")
    # file_name = f'results_{today}.csv'
    # print(f"Saving results as: {file_name}\n\n")
    # results.to_csv(here("data","results", file_name), index=False)

In [6]:
### TESTING
# Scratch cell: runs the same steps as the body of `climate_model`, but at top level
# and with verbose=2 so the per-group lambda search is printed. Every name assigned
# here is a kernel global and overwrites any earlier value of the same name.

data = pd.read_csv(here("data", "climate", "climate_summary.csv"))
hot_encode = True
anomaly = False  # only echoed in the summary printout at the end of the cell
variable_groups = ["pre", "tmp", "ndvi"]
index_cols = ["year", "district", "yield_mt"]
year_start = 2016
n_splits = 5

#########################################     READ DATA    #########################################
data = data.dropna()

keep_cols = []

# Collect, group by group, the columns whose name contains the group substring.
for var in variable_groups:
    tmp = data.columns[data.columns.to_series().str.contains(var)].tolist()
    keep_cols.append(tmp)

# Flatten to: index columns first, then the feature columns in group order.
keep_cols = [*index_cols, *[col for cols in keep_cols for col in cols]]

data = data.loc[:, keep_cols]

data = data[data.year >= year_start]

# Side table of identifiers and log-yield; its 0..n-1 index matches the re-indexed
# feature table built below, which is what the joins further down rely on.
crop_yield = data.copy().loc[:, tuple(index_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################    STANDARDIZE FEATURES    #########################################
# Index columns are moved into the index so only feature columns are scaled.
# The scaler is fit on all rows, before the train/test split.
data = data.set_index(index_cols)
data_scaled = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data_scaled, index=data.index).reset_index()
data.columns = data.columns.astype(str)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # "district" becomes dummy feature columns, so it must no longer be dropped below.
    index_cols.remove("district")
    data = pd.get_dummies(data, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
x_all = data.drop(index_cols, axis=1)
y_all = np.log10(data.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV    #########################################
### SETUP
tic = time.time()
kfold = KFold(n_splits=n_splits)
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

# Build the start/end column offsets of each penalty group.
# NOTE(review): the step of 12 assumes every variable group has exactly 12 columns
# (presumably one per month) - confirm against the CSV.
i = 0
start = [i]
end = [x_train.shape[1]]

for var in variable_groups:
    i += 12
    start.append(i)
    end.append(i)
start.sort()
end.sort()

# Without dummy columns there is no trailing district group, so drop the last range.
if not hot_encode:
    start = start[0:-1]
    end = end[0:-1]

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
# NOTE(review): `kfold_rr_multi_lambda_tuning` comes from `task_modeling_utils`;
# presumably it tunes one lambda per [start, end) column range - confirm there.
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions used for the de-meaned scores are clipped at zero.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# Subtract each district's mean (within the training rows) from yield and prediction.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# NaN placeholders so the test rows have the same columns as the training rows.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

# Combined train/test table; not used again within this cell.
predictions = pd.concat([train_split, test_split])

# Same de-meaning for the held-out rows, using district means of the test rows only.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

# variable_groups.append("districts")
# group_lambdas = dict(zip(variable_groups, best_lambdas))
# group_lambdas

print(
    f"""
Finish:
    Variables: {variable_groups}
    Lambdas: {best_lambdas}
    One-hot encoding: {hot_encode}
    Anomaly: {anomaly}

    Final Val  R2: {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}

    Demean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}
    Demean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}

    Total time: {(time.time()-tic)/60:0.2f} minutes
"""
)

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 1: 0.01
	Val R2 1: 0.5526

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 2: 0.001
	Val R2 2: 0.6862

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 3: 0.1
	Val R2 3: 0.6868

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 4: 1e-05
	Val R2 4: 0.7895


Finish:
    Variables: ['pre', 'tmp', 'ndvi']
    Lambdas: [0.01, 0.001, 0.1, 1e-05]
    One-hot encoding: True
    Anomaly: False

    Final Val  R2: 0.8008 
    Final Test R2: 0.8356

    Demean Val  R2: 0.1744
    Demean Test R2: 0.6005

    Total time: 0.30 minutes



# NDVI model

In [7]:
# Single-penalty ridge baseline on the NDVI columns only (plus district dummies when
# `hot_encode` is True). Every name assigned here is a kernel global.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
# Keep the columns whose name contains "ndvi", plus the identifier/target columns.
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

# Side table of identifiers and log-yield; its 0..n-1 index matches the re-indexed
# feature table built below, which is what the joins further down rely on.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # "district" becomes dummy feature columns, so it must no longer be dropped below.
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
# Encoding happens before scaling in this cell, so the district dummy columns are
# standardized together with the NDVI columns. The scaler is fit on all rows.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
# 17 candidate alphas, 1e-8 ... 1e8, one per decade.
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions used for the de-meaned scores are clipped at zero.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# Subtract each district's mean (within the training rows) from yield and prediction.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# NaN placeholders so the test rows have the same columns as the training rows.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

# Combined train/test table; not used again within this cell.
predictions = pd.concat([train_split, test_split])

# Same de-meaning for the held-out rows, using district means of the test rows only.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.7802 
Test R2: 0.8085 

Demean Val  R2: 0.0921 
Demean Test R2: 0.4394


# Precipitation, Temperature, and NDVI model

In [8]:
# Single-penalty ridge baseline on every column of the CSV other than
# year/district/yield_mt (no column filter is applied here), plus district dummies
# when `hot_encode` is True. Every name assigned here is a kernel global.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

hot_encode = True
# hot_encode = False

# Side table of identifiers and log-yield; its 0..n-1 index matches the re-indexed
# feature table built below, which is what the joins further down rely on.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # "district" becomes dummy feature columns, so it must no longer be dropped below.
    drop_cols.remove("district")
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################
# Encoding happens before scaling in this cell, so the district dummy columns are
# standardized together with the climate columns. The scaler is fit on all rows.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
# 17 candidate alphas, 1e-8 ... 1e8, one per decade.
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions used for the de-meaned scores are clipped at zero.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
# Subtract each district's mean (within the training rows) from yield and prediction.
train_split["demean_cv_yield"] = train_split["log_yield"] - train_split.groupby(
    "district"
)["log_yield"].transform("mean")
train_split["demean_cv_prediction"] = train_split[
    "cv_prediction"
] - train_split.groupby("district")["cv_prediction"].transform("mean")

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# NaN placeholders so the test rows have the same columns as the training rows.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

# Combined train/test table; not used again within this cell.
predictions = pd.concat([train_split, test_split])

# Same de-meaning for the held-out rows, using district means of the test rows only.
test_split["demean_test_yield"] = test_split["log_yield"] - test_split.groupby(
    "district"
)["log_yield"].transform("mean")
test_split["demean_test_prediction"] = test_split["prediction"] - test_split.groupby(
    "district"
)["prediction"].transform("mean")

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}",
    f"\nTest R2: {r2_score(y_test, test_predictions):0.4f}",
    f"\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.4f}",
    f"\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.4f}",
)

Val  R2: 0.8044 
Test R2: 0.8297 

Demean Val  R2: 0.1723 
Demean Test R2: 0.5297


In [9]:
# Best cross-validated R2 found by the grid search; `ridge_reg` is whichever
# GridSearchCV object was fitted most recently in this kernel.
ridge_reg.best_score_

0.792567922097443

# NDVI Anomaly Model

In [10]:
# Ridge model on district anomalies: NDVI features and log-yield are both expressed
# as deviations from their district mean, with no district dummies. Every name
# assigned here is a kernel global.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
# Keep the columns whose name contains "ndvi", plus the identifier/target columns.
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains("ndvi")]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df[climate_df.year >= 2016]

# Side table of identifiers and raw (not de-meaned) log-yield.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifier/target columns are moved into the index so only features are scaled.
# The scaler is fit on all rows, before the train/test split.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace yield by log10(yield + 1), then subtract the per-district mean from every
# remaining column - the features and the log-yield target alike.
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
var_cols = climate_df.columns
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
# Target is the district-de-meaned log-yield computed above.
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
# 17 candidate alphas, 1e-8 ... 1e8, one per decade.
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# No further de-meaning happens in this section: the target is already an anomaly,
# so predictions are stored unclipped next to the raw log-yield.
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# NaN placeholder so the test rows have the same columns as the training rows.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

# Combined train/test table; not used again within this cell.
predictions = pd.concat([train_split, test_split])

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.4449
Test R2: 0.4293


# Precipitation, Temperature, and NDVI Anomaly Model

In [11]:
# Ridge model on district anomalies using every column of the CSV other than
# year/district/yield_mt (no column filter is applied here): features and log-yield
# are both expressed as deviations from their district mean, with no district
# dummies. Every name assigned here is a kernel global.
climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))
climate_df = climate_df.dropna()
drop_cols = ["year", "district", "yield_mt"]
climate_df = climate_df[climate_df.year >= 2016]

# Side table of identifiers and raw (not de-meaned) log-yield.
crop_yield = climate_df.copy().loc[:, tuple(drop_cols)].reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifier/target columns are moved into the index so only features are scaled.
# The scaler is fit on all rows, before the train/test split.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace yield by log10(yield + 1), then subtract the per-district mean from every
# remaining column - the features and the log-yield target alike.
climate_df["yield_mt"] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df.set_index(["year", "district"], inplace=True)
var_cols = climate_df.columns
climate_df = climate_df[var_cols] - climate_df.groupby(["district"], as_index=True)[
    var_cols
].transform("mean")
climate_df.reset_index(drop=False, inplace=True)

#########################################     K-FOLD SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis=1)
# Target is the district-de-meaned log-yield computed above.
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
# 17 candidate alphas, 1e-8 ... 1e8, one per decade.
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring="r2", cv=kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# No further de-meaning happens in this section: the target is already an anomaly,
# so predictions are stored unclipped next to the raw log-yield.
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = val_predictions

test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# NaN placeholder so the test rows have the same columns as the training rows.
test_split["cv_prediction"] = np.repeat(np.nan, len(x_test))

# Combined train/test table; not used again within this cell.
predictions = pd.concat([train_split, test_split])

print(
    f"Val  R2: {r2_score(y_train, val_predictions):0.4f}\nTest R2: {r2_score(y_test, test_predictions):0.4f}"
)

Val  R2: 0.5284
Test R2: 0.5145
