In [1]:
# !pip install -q pyhere p_tqdm glum

In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

from task_modeling_utils import *
from prediction_utils import *

In [4]:
# Collect the feature-summary files to model: every entry of the summary
# directory except housekeeping entries and except files whose name matches
# both patterns below.
point_pattern = re.compile("20k-points")
wa_pattern = re.compile("cm-False")

data_dir = here("data")
directory = here("data", "random_features", "summary")

files = [
    entry
    for entry in os.listdir(directory)
    if entry not in ('.gitkeep', '.ipynb_checkpoints')
    and not (point_pattern.search(entry) and wa_pattern.search(entry))
]
len(files)

44

In [5]:
# One task per (file, hot_encode) combination; for each file the True variant
# comes first, then the False variant.
paramlist = [
    (file_name, encode_flag)
    for file_name in files
    for encode_flag in [True, False]
]
len(paramlist)

88

In [6]:
# from dask import delayed
# @delayed
def model(params):
    """Fit and score a ridge regression for one (feature file, hot_encode) task.

    Parameters
    ----------
    params : tuple
        One element of ``paramlist``: ``(file, hot_encode)``. ``file`` is the
        name of a feather file inside ``directory``; ``hot_encode`` is a bool
        selecting whether ``district`` is one-hot encoded and tuned with its
        own penalty via ``kfold_rr_multi_lambda_tuning``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame holding the run configuration parsed from the file
        name, the sample sizes, the selected penalty value(s) and the
        validation, train, test and district-demeaned validation scores.
    """
    #########################################     SET PARAMS    #########################################
    # paramlist entries are (file, hot_encode) tuples, so unpack the tuple
    # itself; indexing into each element would take the first character of the
    # file name and fail on the boolean.
    file, hot_encode = params
    # The run configuration is encoded in the "_"-separated fields of the file name.
    f            = file.split(sep="_")
    satellite    = f[0]
    bands        = f[1].replace("bands-", "")
    country_code = f[2]
    points       = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    mns          = f[6].replace("mn-", "").split(sep="-")
    limit_months = str2bool(f[7].replace("lm-", ""))
    crop_mask    = str2bool(f[8].replace("cm-", ""))
    weighted_avg = str2bool(f[9].replace("wa-", ""))
    month_range  = list(range(int(mns[0]), int(mns[1])+1))

    #########################################     READ DATA    #########################################
    fn = f"{directory}/{file}"
    features = pd.read_feather(fn).drop(columns=['crop_perc'], errors='ignore')

    id_cols = ['district', 'year', 'yield_mt']

    # Keep district and log-yield per row for the de-meaned scores below;
    # `district` disappears from `features` once it is one-hot encoded.
    crop_yield = features[id_cols].copy()
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

    ########################################     HOT ENCODE    ###########################################
    if hot_encode:
        n_cols_before = features.shape[1]
        features = pd.get_dummies(features, columns=["district"], drop_first=False)
        # `district` is replaced by one indicator column per level, so the
        # number of indicator columns is the net column gain plus one.
        n_district_cols  = features.shape[1] - n_cols_before + 1
        non_feature_cols = ['year', 'yield_mt']
    else:
        non_feature_cols = id_cols

    #########################################     K-FOLD SPLIT    #########################################
    x_all = features.drop(non_feature_cols, axis = 1)
    y_all = np.log10(features.yield_mt.to_numpy() + 1)
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

    #########################################     K-FOLD CV    ###########################################
    ### SETUP
    kfold  = KFold(n_splits=5)
    alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
    if hot_encode:
        # Two column groups, each with its own penalty: the original feature
        # columns and the district indicators. This relies on get_dummies
        # placing the indicator columns after all remaining columns.
        n_cols   = x_train.shape[1]
        split_at = n_cols - n_district_cols
        best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
            X=x_train, y=y_train,
            grid=alphas.get('alpha'),
            start=[0, split_at],
            end=[split_at, n_cols],
            static_lam=1e-16,
            verbose=False,
            show_linalg_warning=False,
            fit_model_after_tuning=True
        )
    else:
        search = GridSearchCV(Ridge(), alphas, scoring = 'r2', cv = kfold).fit(x_train, y_train)
        best_model   = search.best_estimator_
        best_scores  = search.best_score_
        best_lambdas = best_model.alpha
    ### PREDICT WITH BEST HYPERPARAMETER(S)
    val_predictions   = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
    train_predictions = best_model.predict(x_train)
    test_predictions  = best_model.predict(x_test)

    #########################################     DE-MEAN R2    #########################################
    # Subtract each district's mean (over the training rows) from both the
    # observed log-yield and the clipped cross-validated prediction.
    train_split = crop_yield.loc[x_train.index, ['district', 'log_yield']].copy()
    train_split['cv_prediction'] = np.maximum(val_predictions, 0)
    by_district = train_split.groupby('district')
    demean_cv_yield      = train_split['log_yield'] - by_district['log_yield'].transform('mean')
    demean_cv_prediction = train_split['cv_prediction'] - by_district['cv_prediction'].transform('mean')

    # Each correlation is computed once and reused for both r and r**2.
    val_r       = pearsonr(val_predictions, y_train)[0]
    train_r     = pearsonr(train_predictions, y_train)[0]
    test_r      = pearsonr(test_predictions, y_test)[0]
    demean_cv_r = pearsonr(demean_cv_yield, demean_cv_prediction)[0]

    #########################################     SAVE RESULTS    #########################################
    d = {
        'country'     : country_code,
        'satellite'   : satellite,
        'bands'       : bands,
        'num_features': num_features,
        'points'      : points,
        'month_range' : f'{min(month_range)}-{max(month_range)}',

        'limit_months': limit_months,
        'crop_mask'   : crop_mask,
        'weighted_avg': weighted_avg,
        'hot_encode': hot_encode,

        'total_n': len(x_all),
        'train_n': len(x_train),
        'test_n' : len(x_test),

        # Wrapped in a list so that a list of penalties/scores (hot-encode
        # branch) lands in a single cell of the one-row frame.
        'best_reg_param': [best_lambdas],
        'mean_of_val_R2': [best_scores],
        'val_R2': r2_score(y_train, val_predictions),
        'val_r' : val_r,
        'val_r2': val_r ** 2,

        'train_R2': r2_score(y_train, train_predictions),
        'train_r' : train_r,
        'train_r2': train_r ** 2,

        'test_R2': r2_score(y_test, test_predictions),
        'test_r' : test_r,
        'test_r2': test_r ** 2,

        'demean_cv_R2': r2_score(demean_cv_yield, demean_cv_prediction),
        'demean_cv_r':  demean_cv_r,
        'demean_cv_r2': demean_cv_r ** 2,
    }
    return pd.DataFrame(data=d, index=[0])

In [5]:
# %%time    
# ##### With progress bar
# workers = os.cpu_count()
# if __name__ == "__main__":
#     output = []
#     for result in p_tqdm.p_umap(model, paramlist, num_cpus=workers):
#         output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name), index=False)

In [6]:
# import dask_gateway
# from distributed.diagnostics.plugin import UploadFile, UploadDirectory, PipInstall

# cluster = dask_gateway.GatewayCluster()
# client = cluster.get_client()
# client.register_worker_plugin(PipInstall(['glum', 'pyhere']))
# client.register_worker_plugin(UploadFile(here("code", "3_task_modeling", "task_modeling_utils.py")))
# # client.register_worker_plugin(UploadDirectory(here("data", "random_features", "summary")))
# cluster.scale(44)
# print(cluster.dashboard_link)

In [119]:
# for fn in files:
#     client.register_worker_plugin(UploadFile(here('data', 'random_features', 'summary', fn)))

In [120]:
# futures = []
# for fn in files:
#     df = pd.read_feather(here('data', 'random_features', 'summary', fn))
#     future = client.scatter(df)
#     futures.append(future)

In [121]:
# import dask.dataframe as dd
# ddf = dd.from_delayed(futures, meta=df)

In [122]:
# ddf.map_partitions(model, True, meta=(None, 'f8'))

In [123]:
# ddf.map_partitions(model, paramlist)

In [150]:
# results_map = client.map(model, paramlist) 

In [164]:
# sent = client.submit(model, paramlist) 
# result = client.gather(sent)

In [7]:
# result.persist()

In [8]:
# result.compute()

In [9]:
# from dask.distributed import Client
# with Client(n_workers=88) as client:
#     sent = client.submit(model, paramlist)
#     result = client.gather(sent)

In [6]:
# from dask.distributed import Client
# workers = os.cpu_count()
# if __name__ == "__main__":
#     output = []
#     with Client(n_workers=88) as client:
#         for result in p_tqdm.p_umap(model, paramlist):
#             output.append(result).compute()
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name), index=False)

In [6]:
# %%time    
# #### No progress bar
# multiprocessing.set_start_method('spawn')
# workers = os.cpu_count()
# if __name__ == "__main__":
#     with multiprocessing.Pool(processes=workers) as pool:
#         output = []
#         for result in pool.imap_unordered(model, paramlist, chunksize=2):
#             output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name))

In [10]:
### TESTING
# Interactive, single-configuration data preparation: one hard-coded feature
# file is loaded, the NDVI columns of the climate summary are joined on, all
# predictors are standardized and the rows are split 80/20 into train/test.
file         = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather'
hot_encode   = True

#########################################     SET PARAMS    #########################################
# file         = params[0]
# hot_encode   = params[1]
# The run configuration is encoded in the "_"-separated fields of the file name.
f            = file.split(sep="_")
satellite    = f[0]
bands        = f[1].replace("bands-", "")
country_code = f[2]
points       = f[3].replace("k-points", "")
num_features = f[4].replace("-features", "")
yrs          = f[5].replace("yr-", "").split(sep="-")
mns          = f[6].replace("mn-", "").split(sep="-")
limit_months = str2bool(f[7].replace("lm-", ""))
crop_mask    = str2bool(f[8].replace("cm-", ""))
weighted_avg = str2bool(f[9].replace("wa-", ""))
years        = range(int(yrs[0]), int(yrs[1])+1)
month_range  = list(range(int(mns[0]), int(mns[1])+1))

#########################################     READ DATA    #########################################
fn = f"{directory}/{file}"
features = pd.read_feather(fn)
features.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))

# Identifier/target columns; used as the join key below and later as the
# columns excluded from the predictor matrix.
drop_cols = ['district', 'year', 'yield_mt']

#########################################    JOIN CLIMATE VARS    #########################################  
# Keep only the climate columns whose name contains 'ndvi', plus the key columns.
ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]

# NOTE(review): the join key includes yield_mt, so a row only picks up NDVI
# values when yield_mt is exactly equal in both tables -- confirm intended.
features = features.set_index(drop_cols).join(climate_df.set_index(drop_cols)).reset_index()
# Discard rows from years later than the last year present in the climate table.
features = features[features.year <= max(climate_df.year)]

# Snapshot of the identifiers plus log10(yield + 1); it keeps the index of the
# filtered frame above.
crop_yield = features.copy().loc[:, tuple(drop_cols)]
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################     HOT ENCODE    ###########################################
if hot_encode:
    # features['district'] = features.district.astype('category')
    # Mutates drop_cols in place: from here on it is ['year', 'yield_mt'].
    # This must stay after the join, which uses all three columns as its key.
    drop_cols.remove("district")
    features = pd.get_dummies(features, columns=["district"], drop_first=False)
else:
    pass

#########################################    STANDARDIZE FEATURES    #########################################    
# The columns in drop_cols are moved into the index so they are not scaled;
# every remaining column (including the district indicators when hot_encode is
# set) is standardized.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test rows influence the scaling -- consider fitting on the training rows only.
features = features.set_index(drop_cols) 
features_scaled = StandardScaler().fit_transform(features.values)
# Rebuilding the frame from the bare array discards the original column names;
# the predictors are labelled by position and the labels are cast to str.
# reset_index() also creates a fresh 0..n-1 row index, which matches
# crop_yield's index only if the year filter above removed no rows.
features = pd.DataFrame(features_scaled, index=features.index).reset_index()
features.columns = features.columns.astype(str)          

# Replace the target by log10(yield + 1).
features.yield_mt = np.log10(features.yield_mt.to_numpy() + 1)

#########################################     K-FOLD SPLIT    #########################################
# 80/20 random split with a fixed seed.
x_all = features.drop(drop_cols, axis = 1) 
y_all = features.yield_mt
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

In [11]:
# Display the training predictor matrix produced by the split.
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6074,6075,6076,6077,6078,6079,6080,6081,6082,6083
34,0.122405,-0.602627,-0.962090,-0.373393,0.087622,-0.395953,0.665469,0.815541,0.836532,0.860886,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
627,-0.745102,-1.319972,-1.137262,-0.828373,-0.100150,-0.273584,0.508537,-0.248048,-0.594027,-0.255724,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,8.426150,-0.118678,-0.118678
592,-1.108298,-1.835012,-1.133221,-0.726423,-0.530001,-1.330731,-1.639542,-2.489729,-2.327456,-2.254350,...,-0.118678,-0.118678,-0.118678,8.426150,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
530,0.832510,0.480655,0.677306,0.382840,0.190721,-0.118106,0.180867,0.882195,0.817981,0.662049,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
443,0.200511,1.014885,1.385173,0.913613,0.745622,0.396749,-0.745719,0.204117,0.234500,0.226061,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,1.507697,1.086205,1.644558,1.474270,-2.304456,-0.379790,0.718619,0.913073,0.959412,0.971583,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
359,-0.855198,1.012120,0.686110,0.021326,0.500773,0.143984,-1.111080,0.031450,0.429969,-0.844280,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
192,0.520318,0.075699,0.494057,0.007725,0.658465,0.811826,0.411325,-0.106081,0.321837,0.033495,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678
629,-2.786142,-0.523502,-0.856285,-0.245399,-0.068380,-0.054709,-2.941933,-0.661904,-0.925709,-0.882008,...,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,-0.118678,8.426150,-0.118678,-0.118678


In [13]:
# Tune a separate ridge penalty for each of three consecutive column groups of
# x_train and report validation / test R2 for the tuned estimator.
# The fitted estimator is bound to `tuned_model` rather than `model`, so the
# `model(params)` function defined earlier in the notebook is not overwritten.
# NOTE(review): 72 and 12 are presumably the number of district indicator
# columns and of NDVI columns at the end of x_train -- confirm against the data.
best_lam, res, tuned_model = kfold_rr_multi_lambda_tuning(
    x_train, y_train, 
    grid=np.logspace(-8, 8, base = 10, num = 17), 
    start=[0, x_train.shape[1]-(72+12), x_train.shape[1]-72],
    end=[x_train.shape[1]-(72+12), x_train.shape[1]-72, x_train.shape[1]], 
    static_lam=1,
    verbose=True,
    show_linalg_warning=False,
    fit_model_after_tuning=True
)
kfold = KFold(n_splits=5)
val_predictions = cross_val_predict(tuned_model, X = x_train, y = y_train, cv = kfold) 
print(f"""Final Val R2: {r2_score(y_train, val_predictions):0.4f}
Final Test R2: {r2_score(y_test, tuned_model.predict(x_test)):0.4f}""")

1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 1: 10.0
	Val R2 1: 0.7478
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 2: 0.01
	Val R2 2: 0.7572
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 3: 0.01
	Val R2 3: 0.8292
Total time: 26.35 minutes
Final Val R2: 0.8299
Final Test R2: 0.8006


In [14]:
# Number of CPUs reported by the OS on this machine.
os.cpu_count()

16

In [13]:
%%time

lambdas=np.logspace(-8, 8, base = 10, num = 17)

kfold_results = kfold_solve_custom_split_col(
    X=x_train,
    y=y_train,
    locations=x_train.index,
    split_col=x_train.reset_index().index,
    lambdas=lambdas,
    static_lam_val=1e-8,
    static_lam_idxs=list(range(0,x_train.shape[1]-72)),
    intercept=True,
    num_folds=5,
    random_state=0,
    return_preds=True,
    return_model=False,
    svd_solve=False,
    allow_linalg_warning_instances=True,
    fit_model_after_tuning=False,
)
best_alpha_1_idx = interpret_kfold_results(kfold_results, "r2_score")[0][0][0]
best_alpha_1 = lambdas[best_alpha_1_idx]
preds = np.maximum(get_pred_truth_locs(kfold_results)[0].flatten(), 0)
truth = get_pred_truth_locs(kfold_results)[1].flatten()
# print(
# f"""Best alpha 1: {best_alpha_1}
# Val R2: {r2_score(truth, preds):0.4f}\n"""
# )

kfold_results = kfold_solve_custom_split_col(
    X=x_train,
    y=y_train,
    locations=x_train.index,
    split_col=x_train.reset_index().index,
    lambdas=lambdas,
    static_lam_val=best_alpha_1,
    static_lam_idxs=list(range(x_train.shape[1]-72, x_train.shape[1])),
    intercept=True,
    num_folds=5,
    random_state=0,
    return_preds=True,
    return_model=False,
    svd_solve=False,
    allow_linalg_warning_instances=True,
    fit_model_after_tuning=False,
)
best_alpha_2_idx = interpret_kfold_results(kfold_results, "r2_score")[0][0][0]
best_alpha_2 = lambdas[best_alpha_2_idx]
preds = np.maximum(get_pred_truth_locs(kfold_results)[0].flatten(), 0)
truth = get_pred_truth_locs(kfold_results)[1].flatten()


model, intercept_term = custom_ridge(
    X=x_train,
    y=y_train,
    lam=best_alpha_1, 
    intercept=True,
    static_lam_val=best_alpha_2,
    static_lam_idxs=list(range(x_train.shape[1]-72, x_train.shape[1])))
pred_test = np.asarray(x_test).dot(model) + intercept_term 
pred_test = np.maximum(pred_test, 0)

# print(
# f"""Best alpha 2: {best_alpha_2}
# Final Val R2: {r2_score(truth, preds):0.4f}
# Final test R2: {r2_score(y_test, pred_test):0.4f}\n"""
# )

we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection


  ary = asanyarray(ary)


we will allow this model upon model selection
we will allow this model upon model selection
we will allow this model upon model selection


  ary = asanyarray(ary)


CPU times: user 10min 44s, sys: 8min 48s, total: 19min 32s
Wall time: 6min 29s


In [15]:
# Report best_alpha_2 together with the R2 of the cross-validated predictions
# (truth vs preds) and of the test-row predictions (y_test vs pred_test).
print(
f"""Best alpha 2: {best_alpha_2}
Final Val R2: {r2_score(truth, preds):0.4f}
Final test R2: {r2_score(y_test, pred_test):0.4f}\n"""
)

Best alpha 2: 1000.0
Fianl Val R2: 0.7900
Final test R2: 0.0057



In [18]:
# make_train_pred_scatterplot(task = "Validation", y_test = y_test, preds_test = pred_test)

In [19]:
# make_train_pred_scatterplot(task = "Validation", y_test = truth, preds_test = preds)

In [26]:
%%time
### TESTING
# Interactive, single-configuration run of the modeling steps with verbose
# tuning output: parse the configuration from one hard-coded file name, load
# the features, one-hot encode district, tune the ridge penalties on an 80/20
# split and print overall and district-demeaned R2 for validation and test.
file         = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather'
hot_encode   = True

#########################################     SET PARAMS    #########################################
# file         = params[0]
# hot_encode   = params[1]
# The run configuration is encoded in the "_"-separated fields of the file name.
f            = file.split(sep="_")
satellite    = f[0]
bands        = f[1].replace("bands-", "")
country_code = f[2]
points       = f[3].replace("k-points", "")
num_features = f[4].replace("-features", "")
yrs          = f[5].replace("yr-", "").split(sep="-")
mns          = f[6].replace("mn-", "").split(sep="-")
limit_months = str2bool(f[7].replace("lm-", ""))
crop_mask    = str2bool(f[8].replace("cm-", ""))
weighted_avg = str2bool(f[9].replace("wa-", ""))
years        = range(int(yrs[0]), int(yrs[1])+1)
month_range  = list(range(int(mns[0]), int(mns[1])+1))

#########################################     READ DATA    #########################################
fn = f"{directory}/{file}"
features = pd.read_feather(fn)
features.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

# NOTE(review): climate_df is loaded here but not referenced again in this cell.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))

# Identifier/target columns that are excluded from the predictor matrix.
drop_cols = ['district', 'year', 'yield_mt']

# Snapshot of the identifiers plus log10(yield + 1), taken before `district`
# is replaced by indicator columns; used for the per-district de-meaning below.
crop_yield = features.copy().loc[:, tuple(drop_cols)]
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

########################################     HOT ENCODE    ###########################################
if hot_encode:
    # Mutates drop_cols in place: from here on it is ['year', 'yield_mt'], so
    # the district indicator columns stay in the predictor matrix.
    drop_cols.remove("district")
    features = pd.get_dummies(features, columns=["district"], drop_first=False)
else:
    pass

#########################################     K-FOLD SPLIT    #########################################
# Target is log10(yield + 1); 80/20 random split with a fixed seed.
x_all = features.drop(drop_cols, axis = 1) 
y_all = np.log10(features.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV    ###########################################
### SETUP
ridge  = Ridge()  
kfold  = KFold(n_splits=5)
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
if hot_encode:
    # Two column groups, each tuned with its own penalty: all columns except
    # the last 72, and the last 72.
    # NOTE(review): 72 is presumably the number of district indicator columns
    # produced by get_dummies for this file -- confirm against the data.
    best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
        X=x_train, y=y_train, 
        grid=alphas.get('alpha'), 
        start=[0, x_train.shape[1]-72],
        end=[x_train.shape[1]-72, x_train.shape[1]], 
        static_lam=1e-16,
        verbose=True,
        show_linalg_warning=False,
        fit_model_after_tuning=True
    )
else:
    # Single penalty chosen by 5-fold grid search on R2.
    search = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold).fit(x_train, y_train)
    best_model   = search.best_estimator_
    best_scores  = search.best_score_
    best_lambdas = best_model.alpha
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions   = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)   
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################    
# Predictions for every row from best_model, clipped at 0.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Training rows: attach identifiers and the clipped cross-validated prediction,
# then subtract each district's mean (over the training rows) from both the
# observed log-yield and the prediction.
train_split = pd.DataFrame(np.repeat('train', len(x_train)), columns = ['split'], index = x_train.index)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = train_split["log_yield"]-train_split.groupby('district')['log_yield'].transform('mean')
train_split["demean_cv_prediction"] = train_split["cv_prediction"]-train_split.groupby('district')['cv_prediction'].transform('mean')

# Test rows: same layout, with the cross-validation columns filled with NaN.
test_split = pd.DataFrame(np.repeat('test', len(x_test)), columns = ['split'], index = x_test.index)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

predictions = pd.concat([train_split, test_split])

#########################################     SAVE RESULTS    #########################################
# Summary of the run; `d` is turned into a DataFrame by a later cell.
d = {
    'country'     : country_code,
    'satellite'   : satellite,
    'bands'       : bands,
    'num_features': num_features,
    'points'      : points, 
    'month_range' : f'{min(month_range)}-{max(month_range)}',

    'limit_months': limit_months,
    'crop_mask'   : crop_mask,
    'weighted_avg': weighted_avg,
    'hot_encode': hot_encode,

    'total_n': len(x_all),
    'train_n': len(x_train),
    'test_n' : len(x_test),

    # Wrapped in one-element lists so that a list of penalties/scores ends up
    # in a single cell of the resulting frame.
    'best_reg_param': [best_lambdas],
    'mean_of_val_R2': [best_scores],
    # *_R2 is r2_score; *_r is the Pearson correlation and *_r2 its square.
    'val_R2': r2_score(y_train, val_predictions),
    'val_r' : pearsonr(val_predictions, y_train)[0],
    'val_r2': pearsonr(val_predictions, y_train)[0] ** 2,

    'train_R2': r2_score(y_train, train_predictions),
    'train_r' : pearsonr(train_predictions, y_train)[0],
    'train_r2': pearsonr(train_predictions, y_train)[0] ** 2,

    'test_R2': r2_score(y_test, test_predictions),
    'test_r' : pearsonr(test_predictions, y_test)[0],
    'test_r2': pearsonr(test_predictions, y_test)[0] ** 2,

    'demean_cv_R2': r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction),
    'demean_cv_r':  pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0],
    'demean_cv_r2': pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0] ** 2,
}

# District-demeaned test scores. These use the clipped `prediction` column,
# whereas the plain test R2 printed below uses the unclipped test_predictions.
test_split["demean_test_yield"] = test_split["log_yield"]-test_split.groupby('district')['log_yield'].transform('mean')
test_split["demean_test_prediction"] = test_split["prediction"]-test_split.groupby('district')['prediction'].transform('mean')

print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}\nTest R2: {r2_score(y_test, test_predictions):0.2f}',
     f'\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.2f}',
     f'\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.2f}')

1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 1: 0.1
	Val R2 1: 0.7755
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 2: 0.001
	Val R2 2: 0.7837
Total time: 24.49 minutes
Val  R2: 0.78
Test R2: 0.72 

Demean Val  R2: -0.02 
Demean Test R2: -0.02
CPU times: user 42min 44s, sys: 21min 2s, total: 1h 3min 47s
Wall time: 25min 5s


In [27]:
# Preview the first 18 columns of the one-row results summary.
pd.DataFrame(d).iloc[:, :18]

Unnamed: 0,country,satellite,bands,num_features,points,month_range,limit_months,crop_mask,weighted_avg,hot_encode,total_n,train_n,test_n,best_reg_param,mean_of_val_R2,val_R2,val_r,val_r2
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,True,False,False,True,648,518,130,"[0.1, 0.001]","[0.7754502460328604, 0.7837355325757185]",0.784073,0.885803,0.784646


In [28]:
# ### TESTING
# file         = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather'
# hot_encode   = True

# #########################################     SET PARAMS    #########################################
# # file         = params[0]
# # hot_encode   = params[1]
# f            = file.split(sep="_")
# satellite    = f[0]
# bands        = f[1].replace("bands-", "")
# country_code = f[2]
# points       = f[3].replace("k-points", "")
# num_features = f[4].replace("-features", "")
# yrs          = f[5].replace("yr-", "").split(sep="-")
# mns          = f[6].replace("mn-", "").split(sep="-")
# limit_months = str2bool(f[7].replace("lm-", ""))
# crop_mask    = str2bool(f[8].replace("cm-", ""))
# weighted_avg = str2bool(f[9].replace("wa-", ""))
# years        = range(int(yrs[0]), int(yrs[1])+1)
# month_range  = list(range(int(mns[0]), int(mns[1])+1))

# #########################################     READ DATA    #########################################
# fn = f"{directory}/{file}"
# features = pd.read_feather(fn)
# features.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

# climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))

# drop_cols = ['district', 'year', 'yield_mt']

# #########################################    JOIN CLIMATE VARS    #########################################  
# ndvi_cols = climate_df.columns[climate_df.columns.to_series().str.contains('ndvi')]
# keep_cols = [*ndvi_cols, *drop_cols]
# climate_df = climate_df.loc[:, keep_cols]

# features = features.set_index(drop_cols).join(climate_df.set_index(drop_cols)).reset_index()
# features = features[features.year <= max(climate_df.year)]

# crop_yield = features.copy().loc[:, tuple(drop_cols)]
# crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

# #########################################    STANDARDIZE FEATURES    #########################################    
# features = features.set_index(drop_cols) 
# features_scaled = StandardScaler().fit_transform(features.values)
# features = pd.DataFrame(features_scaled, index=features.index).reset_index()
# features.columns = features.columns.astype(str)          

# features.yield_mt = np.log10(features.yield_mt.to_numpy() + 1)

# #########################################     HOT ENCODE    ###########################################
# if hot_encode:
#     drop_cols.remove("district")
#     features = pd.get_dummies(features, columns=["district"], drop_first=False)
#     # features = mfe(
#     #     df = features,
#     #     var_cols=features.set_index(['district', 'year']).columns,
#     #     group_cols='district'
#     # )
# else:
#     pass

# #########################################     K-FOLD SPLIT    #########################################
# x_all = features.drop(drop_cols, axis = 1) 
# y_all = features.yield_mt
# x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)
# 
# #########################################     K-FOLD CV    ###########################################
# ### SETUP
# alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
# kfold  = KFold()
# ridge  = Ridge()   
# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
# ridge_reg.fit(x_train, y_train)
# best_model = ridge_reg.best_estimator_
# ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# val_predictions = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)   
# train_predictions = best_model.predict(x_train)
# test_predictions  = best_model.predict(x_test)

# #########################################     DE-MEAN R2    #########################################    
# crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# train_split = pd.DataFrame(np.repeat('train', len(x_train)), columns = ['split'], index = x_train.index)
# train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
# train_split['cv_prediction'] = np.maximum(val_predictions, 0)
# train_split["demean_cv_yield"] = train_split["log_yield"]-train_split.groupby('district')['log_yield'].transform('mean')
# train_split["demean_cv_prediction"] = train_split["cv_prediction"]-train_split.groupby('district')['cv_prediction'].transform('mean')

# test_split = pd.DataFrame(np.repeat('test', len(x_test)), columns = ['split'], index = x_test.index)
# test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
# test_split['cv_prediction'] = np.repeat(np.nan, len(x_test))
# test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
# test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

# predictions = pd.concat([train_split, test_split])

# #########################################     SAVE MODELS   #########################################  
# #     model_fn_suffix = file.replace('_summary.feather', '')
# #     k_model_fn  = f'kfold-cv_rr-model_{model_fn_suffix}_he-{hot_encode}.pkl'

# #     with open(here('models', k_model_fn),'wb') as f:
# #         pickle.dump(best_model, f)

# #########################################     SAVE RESULTS    #########################################
# d = {
#     'country'     : country_code,
#     'satellite'   : satellite,
#     'bands'       : bands,
#     'num_features': num_features,
#     'points'      : points, 
#     'month_range' : f'{min(month_range)}-{max(month_range)}',

#     'limit_months': limit_months,
#     'crop_mask'   : crop_mask,
#     'weighted_avg': weighted_avg,
#     'hot_encode': hot_encode,

#     'total_n': len(x_all),
#     'train_n': len(x_train),
#     'test_n' : len(x_test),

#     'best_reg_param': list(ridge_reg.best_params_.values())[0],
#     'mean_of_val_R2': ridge_reg.best_score_,
#     'val_R2': r2_score(y_train, val_predictions),
#     'val_r' : pearsonr(val_predictions, y_train)[0],
#     'val_r2': pearsonr(val_predictions, y_train)[0] ** 2,

#     'train_R2': r2_score(y_train, train_predictions),
#     'train_r' : pearsonr(train_predictions, y_train)[0],
#     'train_r2': pearsonr(train_predictions, y_train)[0] ** 2,

#     'test_R2': r2_score(y_test, test_predictions),
#     'test_r' : pearsonr(test_predictions, y_test)[0],
#     'test_r2': pearsonr(test_predictions, y_test)[0] ** 2,

#     'demean_cv_R2': r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction),
#     'demean_cv_r':  pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0],
#     'demean_cv_r2': pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0] ** 2,
# }

# test_split["demean_test_yield"] = test_split["log_yield"]-test_split.groupby('district')['log_yield'].transform('mean')
# test_split["demean_test_prediction"] = test_split["prediction"]-test_split.groupby('district')['prediction'].transform('mean')

# print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}',
#       f'\nTest R2: {r2_score(y_test, test_predictions):0.2f}',
#      f'\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.2f}',
#      f'\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.2f}')