In [1]:
## import warnings
import time
import math
import os
import glob
import multiprocessing
import numpy as np
import pandas as pd
import geopandas
import pyarrow
import itertools
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr
from pyhere import here
from datetime import date

In [2]:
def str2bool(string):
    return string.lower() in ("yes", "true", "t", "1")

In [3]:
data_dir = here("data")
directory = here("data", "random_features", "summary_2")
today = date.today().strftime("%Y-%m-%d")
files = os.listdir(directory)
files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]
len(files)

56

In [5]:
paramlist = list(itertools.product(files, [True, False]))
len(paramlist)

112

In [6]:
def model(params):
    #########################################     SET PARAMS    #########################################
    file         = params[0]
    hot_encode   = params[1]
    f            = file.split(sep="_")
    satellite    = f[0]
    bands        = f[1].replace("bands-", "")
    country_code = f[2]
    points       = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    yrs          = f[5].replace("yr-", "").split(sep="-")
    mns          = f[6].replace("mn-", "").split(sep="-")
    limit_months = str2bool(f[7].replace("lm-", ""))
    crop_mask    = str2bool(f[8].replace("cm-", ""))
    weighted_avg = str2bool(f[9].replace("wa-", ""))
    years        = range(int(yrs[0]), int(yrs[1])+1)
    month_range  = list(range(int(mns[0]), int(mns[1])+1))
    
    #########################################     READ DATA    #########################################
    fn = f"{directory}/{file}"
    features = pd.read_feather(fn)
     
    drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]
            
    if weighted_avg:
        drop_cols.remove("crop_perc")
    else:
        pass

    crop_yield = features.copy().loc[:, tuple(drop_cols)]
    
    if hot_encode:
        drop_cols.remove("district")
        features = pd.get_dummies(features, columns=["district"], drop_first=False)
    else:
        pass
    
    #########################################     SPLIT DATA    #########################################
    x_all = features.drop(drop_cols, axis = 1) 
    y_all = np.log10(features.yield_mt.to_numpy() + 1)
    g_all = features.year.ravel()
    
    x_train, x_test,\
    y_train, y_test,\
    g_train, g_test = train_test_split(x_all, y_all, g_all, test_size = 0.2, random_state = 0)

    #########################################     K-FOLD CV    ###########################################
    # ridge_kfold_cv = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
    # ridge_kfold_cv.fit(x_train, y_train)
    kfold = KFold()
    ridge = Ridge()
    parameters = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    ridge_kfold_reg = GridSearchCV(ridge, parameters, scoring = 'r2', cv = kfold)
    ridge_kfold_reg.fit(x_train, y_train)
    best_kfold_model = ridge_kfold_reg.best_estimator_
    ### CV PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    kfold_val_predictions = cross_val_predict(best_kfold_model, X = x_train, y = y_train, cv = kfold)   
    ### TRAIN BEST MODEL AND PREDICT
    best_kfold_model.fit(x_train, y_train)
    y_pred_train  = np.maximum(best_kfold_model.predict(x_train), 0)
    y_pred_test   = np.maximum(best_kfold_model.predict(x_test), 0)

    #########################################     LOGO CV    ###########################################
    logo = LeaveOneGroupOut()
    ridge = Ridge()
    parameters = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    ridge_logo_reg = GridSearchCV(ridge, parameters, scoring = 'r2', cv = logo)
    ridge_logo_reg.fit(x_all, y_all, groups = g_all)
    best_logo_model = ridge_logo_reg.best_estimator_
    ### CV PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    logo_val_predictions = cross_val_predict(best_logo_model, X = x_all, y = y_all, groups = g_all,  cv = logo)   
    
    #########################################     DE-MEAN R2    #########################################    
    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)
    crop_yield["prediction"] = np.maximum(best_kfold_model.predict(x_all), 0)
    # crop_yield["residual"] = crop_yield["log_yield"] - crop_yield["prediction"]
    crop_yield["district_yield_mean"] = crop_yield.groupby('district')['log_yield'].transform('mean')
    crop_yield["district_prediction_mean"] = crop_yield.groupby('district')['prediction'].transform('mean')
    crop_yield["demean_yield"] = crop_yield["log_yield"] - crop_yield["district_yield_mean"]
    crop_yield["demean_prediction"] = crop_yield["prediction"] - crop_yield["district_prediction_mean"]
    
    #########################################     SAVE RESULTS    #########################################
    d = {
        'country': country_code,
        'satellite': satellite,
        'bands': bands,
        'num_features': num_features,
        'points': points, 
        'month_range': f'{min(month_range)}-{max(month_range)}',
        
        'limit_months': limit_months,
        'crop_mask': crop_mask,
        'weighted_avg': weighted_avg,
        'hot_encode': hot_encode,
        
        'total_n': len(x_all),
        'train_n': len(x_train),
        'test_n': len(x_test),
        
        'kfold_best_reg_param': list(ridge_kfold_reg.best_params_.values())[0],
        'kfold_mean_of_val_R2s': ridge_kfold_reg.best_score_,
        'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
        'kfold_val_r' : pearsonr(kfold_val_predictions, y_train)[0],
        'kfold_val_r2' : pearsonr(kfold_val_predictions, y_train)[0] ** 2,
        
        'kfold_train_R2': r2_score(y_train, y_pred_train),
        'kfold_train_r': pearsonr(y_pred_train, y_train)[0],
        'kfold_train_r2': pearsonr(y_pred_train, y_train)[0] ** 2,
        
        'kfold_test_R2': r2_score(y_test, y_pred_test),
        'kfold_test_r': pearsonr(y_pred_test, y_test)[0],
        'kfold_test_r2': pearsonr(y_pred_test, y_test)[0] ** 2,
        
        'kfold_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["demean_prediction"]),
        'kfold_demean_r': pearsonr(crop_yield["demean_yield"], crop_yield["demean_prediction"])[0],
        'kfold_demean_r2': pearsonr(crop_yield["demean_yield"], crop_yield["demean_prediction"])[0] ** 2,
        
        'logo_best_reg_param': list(ridge_logo_reg.best_params_.values())[0],      
        'logo_mean_of_val_R2s' : ridge_logo_reg.best_score_,
        'logo_val_R2' : r2_score(y_all, logo_val_predictions),
        'logo_val_r' : pearsonr(logo_val_predictions, y_all)[0],
        'logo_val_r2' : pearsonr(logo_val_predictions, y_all)[0] ** 2
    }
    return pd.DataFrame(data=d, index=[0])

In [7]:
%%time
if __name__ == "__main__":
    with multiprocessing.Pool(processes=os.cpu_count()) as pool:
        output = []
        for result in pool.imap_unordered(model, paramlist, chunksize=2):
            output.append(result)
    results = pd.concat(output).reset_index(drop=True)
    file_name = f'results_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")           
    results.to_csv(here("data","results", file_name))

Saving results as: results_2022-10-26.csv


CPU times: user 659 ms, sys: 647 ms, total: 1.31 s
Wall time: 49min


In [8]:
results

Unnamed: 0,country,satellite,bands,num_features,points,month_range,limit_months,crop_mask,weighted_avg,hot_encode,...,kfold_test_r,kfold_test_r2,kfold_demean_R2,kfold_demean_r,kfold_demean_r2,logo_best_reg_param,logo_mean_of_val_R2s,logo_val_R2,logo_val_r,logo_val_r2
0,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,True,False,True,...,0.871884,0.760182,0.363550,0.606426,0.367753,1000000.0,-0.049517,0.002579,0.062462,0.003901
1,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,True,False,False,...,0.667601,0.445692,0.079113,0.458222,0.209967,1000000.0,-0.049527,0.002570,0.062418,0.003896
2,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,True,...,0.865033,0.748282,0.348934,0.595136,0.354187,100.0,-0.031967,0.080194,0.404990,0.164017
3,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,False,...,0.635609,0.403998,0.051520,0.442394,0.195713,1000000.0,-0.052396,-0.000340,0.045694,0.002088
4,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,False,True,True,...,0.886642,0.786133,0.369623,0.610693,0.372946,1.0,0.505366,0.545170,0.754467,0.569221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,False,False,...,0.667718,0.445847,0.313864,0.567517,0.322075,10000.0,0.175605,0.251579,0.503990,0.254006
108,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,True,True,...,0.704725,0.496638,0.347582,0.591030,0.349316,10000.0,0.179609,0.254007,0.506394,0.256435
109,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,True,False,...,0.663095,0.439694,0.308500,0.562415,0.316311,10000.0,0.178909,0.253486,0.505916,0.255951
110,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,False,True,True,...,0.704725,0.496638,0.347582,0.591030,0.349316,10000.0,0.179609,0.254007,0.506394,0.256435


In [9]:
# %%time
# output = []
# for file in files[0:1]:
    
#     #########################################     SET PARAMS    #########################################
#     f            = file.split(sep="_")
#     satellite    = f[0]
#     bands        = f[1].replace("bands-", "")
#     country_code = f[2]
#     points       = f[3].replace("k-points", "")
#     num_features = f[4].replace("-features", "")
#     yrs          = f[5].replace("yr-", "").split(sep="-")
#     mns          = f[6].replace("mn-", "").split(sep="-")
#     limit_months = str2bool(f[7].replace("lm-", ""))
#     crop_mask    = str2bool(f[8].replace("cm-", ""))
#     weighted_avg = str2bool(f[9].replace("wa-", ""))
#     hot_encode   = str2bool(f[10].replace("he-", ""))
#     years        = range(int(yrs[0]), int(yrs[1])+1)
#     month_range  = list(range(int(mns[0]), int(mns[1])+1))
    
#     #########################################     READ DATA    #########################################
#     fn = f"{directory}/{file}"
#     features = pd.read_feather(fn)
    
#     drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]
            
#     crop_yield = features.copy().loc[:, tuple(drop_cols)]
    
#     if weighted_avg:
#         drop_cols.remove("crop_perc")
#     else:
#         pass

#     if hot_encode:
#         drop_cols.remove("district")
#         features = pd.get_dummies(features, columns=["district"], drop_first=False)
#     else:
#         pass
    
#     #########################################     SPLIT DATA    #########################################
#     x_all = features.drop(drop_cols, axis = 1) 
#     y_all = np.log10(features.yield_mt.to_numpy() + 1)
#     g_all = features.year.ravel()
    
#     x_train, x_test,\
#     y_train, y_test,\
#     g_train, g_test = train_test_split(
#         x_all, y_all, g_all, test_size = 0.2, random_state = 0
#     )

#     #########################################     K-FOLD CV    ###########################################
#     # ridge_kfold_cv = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
#     # ridge_kfold_cv.fit(x_train, y_train)
#     kfold = KFold()
#     ridge = Ridge()
#     parameters = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
#     ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
#     ridge_kfold_reg = GridSearchCV(ridge, parameters, scoring = 'r2', cv = kfold)
#     ridge_kfold_reg.fit(x_train, y_train)
#     best_kfold_model = ridge_kfold_reg.best_estimator_
#     ### CV PREDICT - PREDICTING WITH BEST HYPERPARAMETER
#     kfold_val_predictions = cross_val_predict(best_kfold_model, X = x_train, y = y_train, cv = kfold)   
#     ### TRAIN BEST MODEL AND PREDICT
#     best_kfold_model.fit(x_train, y_train)
#     y_pred_train  = np.maximum(best_kfold_model.predict(x_train), 0)
#     y_pred_test   = np.maximum(best_kfold_model.predict(x_test), 0)

#     #########################################     LOGO CV    ###########################################
#     logo = LeaveOneGroupOut()
#     ridge = Ridge()
#     parameters = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
#     ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
#     ridge_logo_reg = GridSearchCV(ridge, parameters, scoring = 'r2', cv = logo)
#     ridge_logo_reg.fit(x_all, y_all, groups = g_all)
#     best_logo_model = ridge_logo_reg.best_estimator_
#     ### CV PREDICT - PREDICTING WITH BEST HYPERPARAMETER
#     logo_val_predictions = cross_val_predict(best_logo_model, X = x_all, y = y_all, groups = g_all,  cv = logo)   
    
#     #########################################     DE-MEAN R2    #########################################    
#     crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)
#     crop_yield["prediction"] = np.maximum(best_kfold_model.predict(x_all), 0)
#     # crop_yield["residual"] = crop_yield["log_yield"] - crop_yield["prediction"]
#     crop_yield["district_yield_mean"] = crop_yield.groupby('district')['log_yield'].transform('mean')
#     crop_yield["district_prediction_mean"] = crop_yield.groupby('district')['prediction'].transform('mean')
#     crop_yield["demean_yield"] = crop_yield["log_yield"] - crop_yield["district_yield_mean"]
#     crop_yield["demean_prediction"] = crop_yield["prediction"] - crop_yield["district_prediction_mean"]
    
#     #########################################     SAVE RESULTS    #########################################
#     d = {
#         'country': country_code,
#         'satellite': satellite,
#         'bands': bands,
#         'num_features': num_features,
#         'points': points, 
#         'month_range': f'{min(month_range)}-{max(month_range)}',
        
#         'limit_months': limit_months,
#         'crop_mask': crop_mask,
#         'weighted_avg': weighted_avg,
#         'hot_encode': hot_encode,
        
#         'total_n': len(x_all),
#         'train_n': len(x_train),
#         'test_n': len(x_test),
        
#         'kfold_best_reg_param': list(ridge_kfold_reg.best_params_.values())[0],
#         'kfold_mean_of_val_R2s': ridge_kfold_reg.best_score_,
#         'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
#         'kfold_val_r' : pearsonr(kfold_val_predictions, y_train)[0],
#         'kfold_val_r2' : pearsonr(kfold_val_predictions, y_train)[0] ** 2,
        
#         'kfold_train_R2': r2_score(y_train, y_pred_train),
#         'kfold_train_r': pearsonr(y_pred_train, y_train)[0],
#         'kfold_train_r2': pearsonr(y_pred_train, y_train)[0] ** 2,
        
#         'kfold_test_R2': r2_score(y_test, y_pred_test),
#         'kfold_test_r': pearsonr(y_pred_test, y_test)[0],
#         'kfold_test_r2': pearsonr(y_pred_test, y_test)[0] ** 2,
        
#         'kfold_demean_R2': r2_score(crop_yield["demean_yield"], crop_yield["demean_prediction"]),
#         'kfold_demean_r': pearsonr(crop_yield["demean_yield"], crop_yield["demean_prediction"])[0],
#         'kfold_demean_r2': pearsonr(crop_yield["demean_yield"], crop_yield["demean_prediction"])[0] ** 2,
        
#         'logo_best_reg_param': list(ridge_logo_reg.best_params_.values())[0],      
#         'logo_mean_of_val_R2s' : ridge_logo_reg.best_score_,
#         'logo_val_R2' : r2_score(y_all, logo_val_predictions),
#         'logo_val_r' : pearsonr(logo_val_predictions, y_all)[0],
#         'logo_val_r2' : pearsonr(logo_val_predictions, y_all)[0] ** 2
#     }
#     df = pd.DataFrame(data=d, index=[0])
#     output.append(df)

# results = pd.concat(output).reset_index(drop=True)
# file_name = f'results_{today}.csv'
# print(f"Saving results as: {file_name}\n\n")           
# # results.to_csv(here("data","results", file_name))
# results