In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

In [2]:
def str2bool(string):
    """Interpret a text flag as a boolean.

    The comparison is case-insensitive: the result is True only when the
    lower-cased input equals "yes", "true", "t" or "1"; anything else
    (including the empty string) gives False.
    """
    truthy_spellings = ("yes", "true", "t", "1")
    normalised = string.lower()
    return any(normalised == spelling for spelling in truthy_spellings)

# Filename fragments used when selecting input files: a file whose name contains
# BOTH "20k-points" and "cm-False" is left out of the run.
# NOTE(review): `wa_pattern` matches the "cm-" field of the filename, not "wa-";
# the variable name looks stale - confirm which field was intended.
point_pattern = re.compile("20k-points")
wa_pattern = re.compile("cm-False")

In [3]:
# Project-relative locations resolved through pyhere.
data_dir = here("data")
directory = here("data", "random_features", "summary")

# Keep every directory entry except the two housekeeping names and any file
# whose name matches both `point_pattern` and `wa_pattern` at the same time.
files = [
    name
    for name in os.listdir(directory)
    if name not in ('.gitkeep', '.ipynb_checkpoints')
    and not (point_pattern.search(name) and wa_pattern.search(name))
]
len(files)

44

In [4]:
# Every feature file is modelled twice: with and without one-hot encoding.
# Ordering is file-major (True before False), i.e. the same as itertools.product.
paramlist = [(file, hot_encode) for file in files for hot_encode in (True, False)]
len(paramlist)

88

In [5]:
def model(params):
    """Fit k-fold and leave-one-year-out ridge regressions for one feature file.

    Parameters
    ----------
    params : tuple
        ``(file, hot_encode)``.  ``file`` is the name of a feather file inside
        the module-level ``directory``; the run settings are parsed from its
        underscore-separated name fields.  ``hot_encode`` switches one-hot
        encoding of the ``district`` column on or off.

    Returns
    -------
    pandas.DataFrame
        A single-row frame holding the parsed settings, sample counts, selected
        ridge ``alpha`` values and R2 / Pearson r scores for the k-fold model,
        the leave-one-year-out folds, and the district-demeaned comparisons.

    Side effects
    ------------
    Pickles two fitted models under ``here('models', ...)`` and prints
    ``'done'``.

    Notes
    -----
    Relies on the notebook-level names ``directory``, ``str2bool`` and ``here``.
    """
    #########################################     SET PARAMS    #########################################
    file, hot_encode = params[0], params[1]
    fields       = file.split(sep="_")
    satellite    = fields[0]
    bands        = fields[1].replace("bands-", "")
    country_code = fields[2]
    points       = fields[3].replace("k-points", "")
    num_features = fields[4].replace("-features", "")
    # fields[5] (the "yr-" range) is not needed by anything below.
    mns          = fields[6].replace("mn-", "").split(sep="-")
    limit_months = str2bool(fields[7].replace("lm-", ""))
    crop_mask    = str2bool(fields[8].replace("cm-", ""))
    weighted_avg = str2bool(fields[9].replace("wa-", ""))
    month_range  = list(range(int(mns[0]), int(mns[1])+1))

    # 17 log-spaced candidate penalties: 1e-8, 1e-7, ..., 1e8.
    alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    kfold  = KFold()
    logo   = LeaveOneGroupOut()
    ridge  = Ridge()

    def _pearson_r(a, b):
        """Pearson correlation coefficient of two equally long samples."""
        return pearsonr(a, b)[0]

    #########################################     READ DATA    #########################################
    features = pd.read_feather(f"{directory}/{file}")

    # Columns that are identifiers / the target rather than predictors.
    # NOTE(review): "crop_perc" is only listed when weighted_avg is False -
    # presumably weighted files no longer carry that column; confirm.
    drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]
    if weighted_avg:
        drop_cols.remove("crop_perc")

    # Untransformed copy of the identifier/target columns. It is taken before
    # one-hot encoding so that `district` stays available as a label.
    crop_yield = features.loc[:, list(drop_cols)].copy()

    #########################################     HOT ENCODE    ###########################################
    if hot_encode:
        # The district dummies become predictors, so `district` is no longer dropped.
        drop_cols.remove("district")
        features = pd.get_dummies(features, columns=["district"], drop_first=False)

    features['yield_mt'] = np.log10(features.yield_mt.to_numpy() + 1)

    #########################################     K-FOLD SPLIT    #########################################
    x_all = features.drop(drop_cols, axis = 1)
    y_all = features.yield_mt
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

    #########################################     K-FOLD CV    ###########################################
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
    kfold_ridge_reg.fit(x_train, y_train)
    kfold_best_model = kfold_ridge_reg.best_estimator_
    ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)
    y_pred_train_k = kfold_best_model.predict(x_train)
    y_pred_test_k  = kfold_best_model.predict(x_test)

    #########################################     LOGO ITERATOR   ###########################################
    # Each year is held out once. One long-format frame is built per fold and
    # split, which keeps numeric dtypes intact (no object columns).
    val_folds, train_folds, test_folds = [], [], []
    year_values = features['year'].to_numpy()

    for year in features.year.unique():
        #########################################     LOGO SPLIT   ###########################################
        held_out = year_values == year   # boolean mask; rows of `features` and `crop_yield` line up

        x_train_g = features.loc[~held_out].drop(drop_cols, axis=1)
        y_train_g = features.loc[~held_out, 'yield_mt'].to_numpy()
        g_train_g = year_values[~held_out]
        d_train_g = crop_yield.loc[~held_out, 'district'].to_numpy()

        x_test_g = features.loc[held_out].drop(drop_cols, axis=1)
        y_test_g = features.loc[held_out, 'yield_mt'].to_numpy()
        g_test_g = year_values[held_out]
        d_test_g = crop_yield.loc[held_out, 'district'].to_numpy()

        #########################################     LOGO CV   ###########################################
        ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
        logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
        logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
        logo_best_model = logo_ridge_reg.best_estimator_
        ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
        logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo)
        logo_train_pred = logo_best_model.predict(x_train_g)
        logo_test_pred  = logo_best_model.predict(x_test_g)

        #########################################     LOGO RESULTS   ###########################################
        val_folds.append(pd.DataFrame({
            'year': g_train_g, 'district': d_train_g, 'split': 'val',
            'observed': y_train_g, 'predicted': logo_val_predictions}))
        train_folds.append(pd.DataFrame({
            'year': g_train_g, 'district': d_train_g, 'split': 'train',
            'observed': y_train_g, 'predicted': logo_train_pred}))
        test_folds.append(pd.DataFrame({
            'year': g_test_g, 'district': d_test_g, 'split': 'test',
            'observed': y_test_g, 'predicted': logo_test_pred}))

    #########################################     COMBINE RESULTS   ###########################################
    val_df   = pd.concat(val_folds,   ignore_index=True)
    train_df = pd.concat(train_folds, ignore_index=True)
    test_df  = pd.concat(test_folds,  ignore_index=True)

    # A (year, district) pair shows up in the val/train frames once per fold
    # in which it was part of the training data; the summaries average those.
    group_cols = ['year', 'district', 'split']
    value_cols = ['observed', 'predicted']
    val_summary   =   val_df.groupby(group_cols, as_index=False)[value_cols].mean()
    train_summary = train_df.groupby(group_cols, as_index=False)[value_cols].mean()

    #########################################     DE-MEAN R2    #########################################
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)
    crop_yield["district_yield_mean"] = crop_yield.groupby('district')['log_yield'].transform('mean')
    crop_yield["demean_yield"] = crop_yield["log_yield"] - crop_yield["district_yield_mean"]

    # K-fold predictions are clipped at zero before demeaning (the LOGO ones below are not).
    crop_yield["kfold_prediction"] = np.maximum(kfold_best_model.predict(x_all), 0)
    crop_yield["kfold_district_prediction_mean"] = crop_yield.groupby('district')['kfold_prediction'].transform('mean')
    crop_yield["kfold_demean_prediction"] = crop_yield["kfold_prediction"] - crop_yield["kfold_district_prediction_mean"]

    # Attach the held-out-year prediction of every (year, district) row.
    join_cols = ['year', 'district']
    crop_yield = crop_yield.set_index(join_cols).join(test_df.set_index(join_cols)).reset_index()
    crop_yield["logo_district_prediction_mean"] = crop_yield.groupby('district')['predicted'].transform('mean')
    crop_yield["logo_demean_prediction"] = crop_yield["predicted"] - crop_yield["logo_district_prediction_mean"]

    #########################################     SAVE MODELS   #########################################
    model_fn_suffix = file.replace('_summary.feather', '')
    k_model_fn  = f'kfold-cv_rr-model_{model_fn_suffix}_he-{hot_encode}.pkl'
    logo_model_fn = f'logo-cv_rr-model_{model_fn_suffix}_he-{hot_encode}.pkl'

    with open(here('models', k_model_fn),'wb') as model_file:
        pickle.dump(kfold_best_model, model_file)

    # NOTE: `logo_best_model` / `logo_ridge_reg` are whatever the LAST loop
    # iteration left behind, i.e. the model fitted with the last year of
    # `features.year.unique()` held out - not an aggregate over folds.
    with open(here('models', logo_model_fn),'wb') as model_file:
        pickle.dump(logo_best_model, model_file)

    #########################################     SAVE RESULTS    #########################################
    # Each correlation is computed once and reused for its squared variant.
    kfold_val_r    = _pearson_r(kfold_val_predictions, y_train)
    kfold_train_r  = _pearson_r(y_pred_train_k, y_train)
    kfold_test_r   = _pearson_r(y_pred_test_k, y_test)
    logo_val_r     = _pearson_r(val_df.predicted, val_df.observed)
    logo_train_r   = _pearson_r(train_df.predicted, train_df.observed)
    logo_test_r    = _pearson_r(test_df.predicted, test_df.observed)
    kfold_demean_r = _pearson_r(crop_yield["demean_yield"], crop_yield["kfold_demean_prediction"])
    logo_demean_r  = _pearson_r(crop_yield["demean_yield"], crop_yield["logo_demean_prediction"])

    d = {
        'country'     : country_code,
        'satellite'   : satellite,
        'bands'       : bands,
        'num_features': num_features,
        'points'      : points,
        'month_range' : f'{min(month_range)}-{max(month_range)}',

        'limit_months': limit_months,
        'crop_mask'   : crop_mask,
        'weighted_avg': weighted_avg,
        'hot_encode'  : hot_encode,

        'kfold_total_n': len(x_all),
        'kfold_train_n': len(x_train),
        'kfold_test_n' : len(x_test),

        'kfold_best_reg_param': kfold_ridge_reg.best_params_['alpha'],
        'kfold_mean_of_val_R2': kfold_ridge_reg.best_score_,
        'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
        'kfold_val_r' : kfold_val_r,
        'kfold_val_r2': kfold_val_r ** 2,

        'kfold_train_R2': r2_score(y_train, y_pred_train_k),
        'kfold_train_r' : kfold_train_r,
        'kfold_train_r2': kfold_train_r ** 2,

        'kfold_test_R2': r2_score(y_test, y_pred_test_k),
        'kfold_test_r' : kfold_test_r,
        'kfold_test_r2': kfold_test_r ** 2,

        'logo_total_n': len(features),
        'logo_train_n': len(train_df),   # rows summed over all folds
        'logo_test_n' : len(test_df),

        'logo_best_reg_param': logo_ridge_reg.best_params_['alpha'],   # last fold only
        'logo_summary_val_R2': r2_score(val_summary.observed, val_summary.predicted),
        'logo_summary_val_r' : _pearson_r(val_summary.observed, val_summary.predicted),

        'logo_val_R2' : r2_score(val_df.observed, val_df.predicted),
        'logo_val_r'  : logo_val_r,
        'logo_val_r2' : logo_val_r ** 2,

        'logo_summary_train_R2': r2_score(train_summary.observed, train_summary.predicted),
        'logo_summary_train_r' : _pearson_r(train_summary.observed, train_summary.predicted),

        'logo_train_R2': r2_score(train_df.observed, train_df.predicted),
        'logo_train_r' : logo_train_r,
        'logo_train_r2': logo_train_r ** 2,

        'logo_test_R2': r2_score(test_df.observed, test_df.predicted),
        'logo_test_r' : logo_test_r,
        'logo_test_r2': logo_test_r ** 2,

        'kfold_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["kfold_demean_prediction"]),
        'kfold_demean_r':  kfold_demean_r,
        'kfold_demean_r2': kfold_demean_r ** 2,

        'logo_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["logo_demean_prediction"]),
        'logo_demean_r':  logo_demean_r,
        'logo_demean_r2': logo_demean_r ** 2,

    }
    print('done')
    return pd.DataFrame(data=d, index=[0])

In [6]:
%%time    
##### With progress bar
workers = os.cpu_count()
if __name__ == "__main__":
    output = []
    for result in p_tqdm.p_umap(model, paramlist, num_cpus=workers):
        output.append(result)
    results = pd.concat(output).reset_index(drop=True)
    today = date.today().strftime("%Y-%m-%d")
    file_name = f'results_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")           
    results.to_csv(here("data","results", file_name), index=False)

  0%|          | 0/88 [00:00<?, ?it/s]

Saving results as: results_2022-11-23.csv


CPU times: user 1.71 s, sys: 937 ms, total: 2.65 s
Wall time: 2h 50min 27s


In [7]:
# Show the combined table: one row per (file, hot_encode) combination that was modelled.
results

Unnamed: 0,country,satellite,bands,num_features,points,month_range,limit_months,crop_mask,weighted_avg,hot_encode,...,logo_train_r2,logo_test_R2,logo_test_r,logo_test_r2,kfold_demean_R2,kfold_demean_r,kfold_demean_r2,logo_demean_R2,logo_demean_r,logo_demean_r2
0,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,False,False,False,...,0.440318,-0.361305,0.235193,0.055316,0.449785,0.696348,0.484900,-2.384992,-0.099402,0.009881
1,ZMB,sentinel-2-l2a,2-3-4-8,1000,15,4-9,True,False,False,False,...,0.687445,0.404243,0.650719,0.423435,0.364483,0.617485,0.381288,-0.647625,0.098249,0.009653
2,ZMB,sentinel-2-l2a,2-3-4-8,1000,15,4-9,True,True,True,False,...,0.468789,0.099644,0.449377,0.201940,0.368260,0.645440,0.416593,-1.009621,0.056786,0.003225
3,ZMB,sentinel-2-l2a,2-3-4-8,1000,15,4-9,True,False,False,True,...,0.829760,0.389867,0.654661,0.428581,0.477287,0.691128,0.477657,-1.215675,0.057958,0.003359
4,ZMB,sentinel-2-l2a,2-3-4,1000,15,4-9,True,False,True,False,...,0.230123,0.040867,0.232004,0.053826,0.185483,0.562430,0.316328,-0.220105,0.122598,0.015030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,4-9,True,True,False,True,...,0.678487,0.187797,0.532123,0.283155,0.436000,0.660669,0.436483,-0.867956,-0.020912,0.000437
84,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,True,True,...,0.435721,0.273096,0.524260,0.274848,0.347582,0.591030,0.349316,-0.259267,-0.028118,0.000791
85,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,False,False,...,0.417061,0.256944,0.509197,0.259281,0.313864,0.567517,0.322075,-0.262830,-0.044156,0.001950
86,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,True,False,...,0.425456,0.264026,0.515752,0.266000,0.308500,0.562415,0.316311,-0.257251,-0.037664,0.001419


In [None]:
# %%time    
# #### No progress bar
# multiprocessing.set_start_method('spawn')
# workers = os.cpu_count()
# if __name__ == "__main__":
#     with multiprocessing.Pool(processes=workers) as pool:
#         output = []
#         for result in pool.imap_unordered(model, paramlist, chunksize=2):
#             output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name))

In [31]:
# #########################################     SET PARAMS    #########################################
# file         = paramlist[0][0]
# hot_encode   = paramlist[0][1]



# f            = file.split(sep="_")
# satellite    = f[0]
# bands        = f[1].replace("bands-", "")
# country_code = f[2]
# points       = f[3].replace("k-points", "")
# num_features = f[4].replace("-features", "")
# yrs          = f[5].replace("yr-", "").split(sep="-")
# mns          = f[6].replace("mn-", "").split(sep="-")
# limit_months = str2bool(f[7].replace("lm-", ""))
# crop_mask    = str2bool(f[8].replace("cm-", ""))
# weighted_avg = str2bool(f[9].replace("wa-", ""))
# years        = range(int(yrs[0]), int(yrs[1])+1)
# month_range  = list(range(int(mns[0]), int(mns[1])+1))

# alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
# kfold  = KFold()
# logo   = LeaveOneGroupOut()
# ridge  = Ridge()    

# #########################################     READ DATA    #########################################
# fn = f"{directory}/{file}"
# features = pd.read_feather(fn)

# drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]

# if weighted_avg:
#     drop_cols.remove("crop_perc")
# else:
#     pass

# crop_yield = features.copy().loc[:, tuple(drop_cols)]

# #########################################     HOT ENCODE    ###########################################
# if hot_encode:
#     drop_cols.remove("district")
#     features = pd.get_dummies(features, columns=["district"], drop_first=False)
# else:
#     pass

# features['yield_mt'] = np.log10(features.yield_mt.to_numpy() + 1)

# #########################################     K-FOLD SPLIT    #########################################
# x_all = features.drop(drop_cols, axis = 1) 
# y_all = features.yield_mt
# x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

# #########################################     K-FOLD CV    ###########################################

# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
# kfold_ridge_reg.fit(x_train, y_train)
# kfold_best_model = kfold_ridge_reg.best_estimator_
# ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)   
# y_pred_train_k = kfold_best_model.predict(x_train)
# y_pred_test_k  = kfold_best_model.predict(x_test)

# # #########################################     LOGO SPLIT   ###########################################
# #     x_train_g = features[features.year < max(features.year)].drop(drop_cols, axis=1)
# #     y_train_g = features[features.year < max(features.year)].yield_mt
# #     g_train_g = features[features.year < max(features.year)].year.ravel()

# #     x_test_g = features[features.year == max(features.year)].drop(drop_cols, axis=1)
# #     y_test_g = features[features.year == max(features.year)].yield_mt
# #     g_test_g = features[features.year == max(features.year)].year

# # #########################################     LOGO CV    ###########################################
# #     ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# #     logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
# #     logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
# #     logo_best_model = logo_ridge_reg.best_estimator_
# #     ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# #     logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo)   
# #     logo_train_pred = logo_best_model.predict(x_train_g)
# #     logo_test_pred  = logo_best_model.predict(x_test_g)

# #########################################     LOGO ITERATOR   ###########################################
# logo_val_results = []
# logo_train_results = []
# logo_test_results = []

# for year in features.year.unique():
# #########################################     LOGO SPLIT   ###########################################
#     x_train_g = features[features.year != year].drop(drop_cols, axis=1)
#     y_train_g = features[features.year != year].yield_mt.ravel()
#     g_train_g = features[features.year != year].year.ravel()
#     d_train_g = crop_yield[crop_yield.year != year].district.ravel()

#     x_test_g = features[features.year == year].drop(drop_cols, axis=1)
#     y_test_g = features[features.year == year].yield_mt.ravel()
#     g_test_g = features[features.year == year].year.ravel()
#     d_test_g = crop_yield[crop_yield.year == year].district.ravel()

# #########################################     LOGO CV   ###########################################
#     ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
#     logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
#     logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
#     logo_best_model = logo_ridge_reg.best_estimator_
#     ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
#     logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo) 
#     logo_train_pred = logo_best_model.predict(x_train_g)
#     logo_test_pred  = logo_best_model.predict(x_test_g)

# #########################################     LOGO RESULTS   ###########################################
#     val_results = {'year': g_train_g, 'district': d_train_g, 'split': 'val', 
#                    'observed': y_train_g, 'predicted': logo_val_predictions}

#     train_results = {'year': g_train_g, 'district': d_train_g,'split': 'train', 
#                      'observed': y_train_g, 'predicted': logo_train_pred}

#     test_results = {'year': g_test_g, 'district': d_test_g, 'split': 'test', 
#                     'observed': y_test_g, 'predicted': logo_test_pred}

#     logo_val_results.append(val_results)
#     logo_train_results.append(train_results)
#     logo_test_results.append(test_results)

# #########################################     EXPLODE RESULTS   ###########################################
# explode_cols = ['year', 'district', 'observed', 'predicted']
# val_df   = pd.DataFrame(logo_val_results  ).explode(explode_cols) 
# train_df = pd.DataFrame(logo_train_results).explode(explode_cols) 
# test_df  = pd.DataFrame(logo_test_results ).explode(explode_cols)

# group_cols = ['year', 'district', 'split']
# val_summary   =   val_df.groupby(group_cols, as_index=False).mean()
# train_summary = train_df.groupby(group_cols, as_index=False).mean()

# #########################################     DE-MEAN R2    #########################################    
# crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)
# crop_yield["district_yield_mean"] = crop_yield.groupby('district')['log_yield'].transform('mean')
# crop_yield["demean_yield"] = crop_yield["log_yield"] - crop_yield["district_yield_mean"]

# crop_yield["kfold_prediction"] = np.maximum(kfold_best_model.predict(x_all), 0)
# crop_yield["kfold_district_prediction_mean"] = crop_yield.groupby('district')['kfold_prediction'].transform('mean')
# crop_yield["kfold_demean_prediction"] = crop_yield["kfold_prediction"] - crop_yield["kfold_district_prediction_mean"]

# join_cols = ['year', 'district']
# crop_yield = crop_yield.set_index(join_cols).join(test_df.set_index(join_cols)).reset_index()
# crop_yield["logo_district_prediction_mean"] = crop_yield.groupby('district')['predicted'].transform('mean')
# crop_yield["logo_demean_prediction"] = crop_yield["predicted"] - crop_yield["logo_district_prediction_mean"]

# # crop_yield["logo_prediction"] = np.maximum(logo_best_model.predict(x_all), 0)
# # crop_yield["logo_district_prediction_mean"] = crop_yield.groupby('district')['logo_prediction'].transform('mean')
# # crop_yield["logo_demean_prediction"] = crop_yield["logo_prediction"] - crop_yield["logo_district_prediction_mean"]

# #########################################     SAVE MODELS   #########################################  
# # model_fn_suffix = file.replace('_summary.feather', '')
# # k_model_fn  = f'kfold-cv_rr-model_{model_fn_suffix}_he-{hot_encode}.pkl'
# # logo_model_fn = f'logo-cv_rr-model_{model_fn_suffix}_he-{hot_encode}.pkl'

# # with open(here('models', k_model_fn),'wb') as f:
# #     pickle.dump(kfold_best_model, f)

# # with open(here('models', logo_model_fn),'wb') as f:
# #     pickle.dump(logo_best_model, f)

# #########################################     SAVE RESULTS    #########################################
# d = {
#     'country'     : country_code,
#     'satellite'   : satellite,
#     'bands'       : bands,
#     'num_features': num_features,
#     'points'      : points, 
#     'month_range' : f'{min(month_range)}-{max(month_range)}',

#     'limit_months': limit_months,
#     'crop_mask'   : crop_mask,
#     'weighted_avg': weighted_avg,
#     'hot_encode'  : hot_encode,

#     'kfold_total_n': len(x_all),
#     'kfold_train_n': len(x_train),
#     'kfold_test_n' : len(x_test),

#     'kfold_best_reg_param': list(kfold_ridge_reg.best_params_.values())[0],
#     'kfold_mean_of_val_R2': kfold_ridge_reg.best_score_,
#     'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
#     'kfold_val_r' : pearsonr(kfold_val_predictions, y_train)[0],
#     'kfold_val_r2': pearsonr(kfold_val_predictions, y_train)[0] ** 2,

#     'kfold_train_R2': r2_score(y_train, y_pred_train_k),
#     'kfold_train_r' : pearsonr(y_pred_train_k, y_train)[0],
#     'kfold_train_r2': pearsonr(y_pred_train_k, y_train)[0] ** 2,

#     'kfold_test_R2': r2_score(y_test, y_pred_test_k),
#     'kfold_test_r' : pearsonr(y_pred_test_k, y_test)[0],
#     'kfold_test_r2': pearsonr(y_pred_test_k, y_test)[0] ** 2,

#     'logo_total_n': len(features),
#     'logo_train_n': len(train_df),
#     'logo_test_n' : len(test_df),    

#     'logo_best_reg_param': list(logo_ridge_reg.best_params_.values())[0],      
#     'logo_summary_val_R2': r2_score(val_summary.observed, val_summary.predicted),
#     'logo_summary_val_r' : pearsonr(val_summary.observed, val_summary.predicted)[0],

#     'logo_val_R2' : r2_score(val_df.observed, val_df.predicted),
#     'logo_val_r'  : pearsonr(val_df.predicted, val_df.observed)[0],
#     'logo_val_r2' : pearsonr(val_df.predicted, val_df.observed)[0] ** 2,

#     'logo_summary_train_R2': r2_score(train_summary.observed, train_summary.predicted),
#     'logo_summary_train_r' : pearsonr(train_summary.observed, train_summary.predicted)[0],

#     'logo_train_R2': r2_score(train_df.observed, train_df.predicted),
#     'logo_train_r' : pearsonr(train_df.predicted, train_df.observed)[0],
#     'logo_train_r2': pearsonr(train_df.predicted, train_df.observed)[0] ** 2,

#     'logo_test_R2': r2_score(test_df.observed, test_df.predicted),
#     'logo_test_r' : pearsonr(test_df.predicted, test_df.observed)[0],
#     'logo_test_r2': pearsonr(test_df.predicted, test_df.observed)[0] ** 2,

#     'kfold_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["kfold_demean_prediction"]),
#     'kfold_demean_r':  pearsonr(crop_yield["demean_yield"], crop_yield["kfold_demean_prediction"])[0],
#     'kfold_demean_r2': pearsonr(crop_yield["demean_yield"], crop_yield["kfold_demean_prediction"])[0] ** 2,

#     'logo_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["logo_demean_prediction"]),
#     'logo_demean_r':  pearsonr(crop_yield["demean_yield"], crop_yield["logo_demean_prediction"])[0],
#     'logo_demean_r2': pearsonr(crop_yield["demean_yield"], crop_yield["logo_demean_prediction"])[0] ** 2,

# }

In [8]:
# x = pd.DataFrame(d, index = [0])
# x