# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re
from collections import Counter

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

from task_modeling_utils import *
from prediction_utils import *

In [5]:
np.random.seed(0)

In [15]:
file_pattern = str(here('data', 'results', '2_sensor_results_*_*.csv'))
files = glob.glob(pathname=file_pattern)
results = merge_files(files)
files

['/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_6_2023-03-20.csv',
 '/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_2_2023-03-18.csv',
 '/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_4_2023-03-19.csv',
 '/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_5_2023-03-21.csv',
 '/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_3_2023-03-18.csv',
 '/Users/vermilirockfish/Documents/Github/crop-modeling/code/3_task_modeling/../../data/results/2_sensor_results_1_2023-03-17.csv']

In [16]:
# top = results.test_R2.sort_values().index[-1]
# results.iloc[top:top+1, 1:20]

top = results.val_R2.sort_values().index[-1]
results.iloc[top:top+1, 24:-1]

Unnamed: 0,val_R2,val_r,val_r2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r
151,0.846564,0.920591,0.847488,0.967641,0.984107,0.968467,0.66043,0.837338,0.701135,0.310614,0.569358


In [2]:
directory = here("data", "random_features", "summary")
files = os.listdir(directory)
files = list(f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints'))
paramlist = list(itertools.combinations(files, 2))
paramlist = list(itertools.product(paramlist, [True, False]))
paramlist = list(tuple(merge(paramlist[i])) for i in range(len(paramlist)))
paramlist = sorted(paramlist, key=lambda tup: tup[2])
paramlist = paramlist[3:4]
len(paramlist)
paramlist = (i for i in paramlist)
paramlist

<generator object <genexpr> at 0x7f491535c270>

In [None]:
model_2_sensor(paramlist)

In [3]:
from mpi4py.futures import MPIPoolExecutor
i = 1

if __name__ == "__main__":
    max_workers = 1 # int(os.environ.get("SLURM_NTASKS", 4)) - 1

    executor = MPIPoolExecutor(max_workers=max_workers)
    output = executor.starmap(model_2_sensor, paramlist)
    results = pd.DataFrame(output)
    executor.shutdown()

    today = date.today().strftime("%Y-%m-%d")
    file_name = f'2_sensor_results_{i}_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")
    results.to_csv(here("data","results", file_name), index=False)

[mpiexec@pod-login1.podcluster] match_arg (utils/args/args.c:166): unrecognized argument pmi_args
[mpiexec@pod-login1.podcluster] HYDU_parse_array (utils/args/args.c:181): argument matching returned error
[mpiexec@pod-login1.podcluster] parse_args (ui/mpich/utils.c:1639): error parsing input array
[mpiexec@pod-login1.podcluster] HYD_uii_mpx_get_parameters (ui/mpich/utils.c:1691): unable to parse user arguments
[mpiexec@pod-login1.podcluster] main (ui/mpich/mpiexec.c:127): error parsing parameters


In [6]:
results

Unnamed: 0,country,year_range,satellite_1,bands_1,num_features_1,points_1,month_range_1,limit_months_1,crop_mask_1,weighted_avg_1,...,val_r2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r,demean_cv_r2
0,ZMB,2013-2021,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,True,False,False,...,0.784041,0.943938,0.972433,0.945625,0.725691,0.870657,0.758044,0.031688,0.40105,0.160841


In [5]:
# %%time     
# ##### With progress bar
# workers = os.cpu_count()
# if __name__ == "__main__":
#     output = []
#     for result in p_tqdm.p_map(model_2_sensor, paramlist):
#         output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'2_sensor_results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name), index=False)

In [16]:
%%time
## TESTING  
f1 = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather'
f2 = 'sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-4-9_lm-True_cm-True_wa-False_summary.feather'
hot_encode = False
n_splits = 5

#########################################     SET PARAMS    #########################################    
# f1         = params[0]
# f2         = params[1]
# hot_encode = params[2]

satellite1, bands1, country_code, points1, yrs1, mns1,\
num_features1, limit_months1, crop_mask1, weighted_avg1 = split_fn(f1)

satellite2, bands2, country_code, points2, yrs2, mns2,\
num_features2, limit_months2, crop_mask2, weighted_avg2 = split_fn(f2)

print(f"\nBegin with paramters:\n\t{f1}\n\t{f2}\n\tOne-hot encoding: {hot_encode}\n")

#########################################     READ DATA    #########################################
features_1 = pd.read_feather(here('data', 'random_features', 'summary', f1))
features_2 = pd.read_feather(here('data', 'random_features', 'summary', f2))

#########################################     CLEAN DATA    #########################################  
min_year = max(min(features_1.year), min(features_2.year))
max_year = min(max(features_1.year), max(features_2.year))

features_1 = features_1[features_1.year >= min_year]
features_2 = features_2[features_2.year >= min_year]

features_1 = features_1[features_1.year <= max_year]
features_2 = features_2[features_2.year <= max_year]

features_1.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
features_2.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

#########################################     JOIN FEATURES    #########################################  
drop_cols = ['district', 'year', 'yield_mt']

features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

features = features_1.join(features_2).reset_index()
features = features[~features.isna().any(axis = 1)]

#########################################     CLEAN AND COPY    ######################################### 
yrs = f'{min(features.year)}-{max(features.year)}'
n_fts_1 = features_1.shape[1]
n_districts = len(features.district.unique())

crop_yield = features.copy().loc[:, tuple(drop_cols)]
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

del features_1, features_2; gc.collect()

#########################################    HOT ENCODE    ######################################### 
if hot_encode:
    drop_cols.remove('district')
    features = pd.get_dummies(features, columns = ["district"], drop_first = False)

#########################################    STANDARDIZE FEATURES    #########################################    
features = features.set_index(drop_cols) 
features_scaled = StandardScaler().fit_transform(features.values)
features = pd.DataFrame(features_scaled, index=features.index).reset_index()
features.columns = features.columns.astype(str)

#########################################     K-FOLD SPLIT    #########################################
x_all = features.drop(drop_cols, axis = 1) 
y_all = np.log10(features.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

del features; gc.collect()



Begin with paramters:
	landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather
	sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-4-9_lm-True_cm-True_wa-False_summary.feather
	One-hot encoding: False

CPU times: user 1.7 s, sys: 1.17 s, total: 2.87 s
Wall time: 1.62 s


0

In [17]:
%%time
#########################################     K-FOLD CV    ###########################################
### SETUP
tic = time.time()
kfold  = KFold(n_splits=n_splits)
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
### LAMBDA INDICIES
start = [0, n_fts_1]
end   = [n_fts_1, x_train.shape[1]] 
if hot_encode:
    start.append(x_train.shape[1]-n_districts)
    end.append(x_train.shape[1]-n_districts)
    end.sort()
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train, 
    grid=alphas.get('alpha'), 
    n_splits=n_splits,
    start=start,
    end=end, 
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True
)
### PREDICT WITH BEST HYPERPARAMETER(S)
val_predictions   = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)   
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)
print(f"""
Finish:
    {f1}
    {f2}
    One-hot encoding: {hot_encode}
    Final Val R2:  {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}
    Total time: {(time.time()-tic)/60:0.2f} minutes
""")

#########################################     DE-MEAN R2    #########################################    
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

train_split = pd.DataFrame(np.repeat('train', len(x_train)), columns = ['split'], index = x_train.index)
train_split = train_split.join(crop_yield.copy()[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = train_split["log_yield"]-train_split.groupby('district')['log_yield'].transform('mean')
train_split["demean_cv_prediction"] = train_split["cv_prediction"]-train_split.groupby('district')['cv_prediction'].transform('mean')

test_split = pd.DataFrame(np.repeat('test', len(x_test)), columns = ['split'], index = x_test.index)
test_split = test_split.join(crop_yield.copy()[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_yield"] = np.repeat(np.nan, len(x_test))
test_split["demean_cv_prediction"] = np.repeat(np.nan, len(x_test))

#########################################     SAVE RESULTS    #########################################
d = {
    'country': country_code[0],
    'year_range': yrs,

    'satellite_1'   : satellite1[0],
    'bands_1'       : bands1,
    'num_features_1': num_features1,
    'points_1'      : points1, 
    'month_range_1' : mns1,
    'limit_months_1': limit_months1,
    'crop_mask_1'   : crop_mask1,
    'weighted_avg_1': weighted_avg1,

    'satellite_2'   : satellite2[0],
    'bands_2'       : bands2,
    'num_features_2': num_features2,
    'points_2'      : points2, 
    'month_range_2' : mns2,
    'limit_months_2': limit_months2,
    'crop_mask_2'   : crop_mask2,
    'weighted_avg_2': weighted_avg2,

    'hot_encode': hot_encode,

    'total_n': len(x_all),
    'train_n': len(x_train),
    'test_n' : len(x_test),

    'best_reg_param': [best_lambdas],
    'mean_of_val_R2': [best_scores],
    'val_R2': r2_score(y_train, val_predictions),
    'val_r' : pearsonr(val_predictions, y_train)[0],
    'val_r2': pearsonr(val_predictions, y_train)[0] ** 2,

    'train_R2': r2_score(y_train, train_predictions),
    'train_r' : pearsonr(train_predictions, y_train)[0],
    'train_r2': pearsonr(train_predictions, y_train)[0] ** 2,

    'test_R2': r2_score(y_test, test_predictions),
    'test_r' : pearsonr(test_predictions, y_test)[0],
    'test_r2': pearsonr(test_predictions, y_test)[0] ** 2,

    'demean_cv_R2': r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction),
    'demean_cv_r':  pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0],
    'demean_cv_r2': pearsonr(train_split.demean_cv_yield, train_split.demean_cv_prediction)[0] ** 2,
}
df = pd.DataFrame(data=d, index=[0])
# return pd.DataFrame(data=d, index=[0])

1e-08 1e-07 1e-06 9.999999999999999e-06 9.999999999999999e-05 0.001 0.01 0.09999999999999999 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 1: 10.0
	Val R2 1: 0.6917

1e-08 1e-07 1e-06 9.999999999999999e-06 9.999999999999999e-05 0.001 0.01 0.09999999999999999 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 2: 1.0
	Val R2 2: 0.6917


Finish:
landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False_wa-False_summary.feather
sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-4-9_lm-True_cm-True_wa-False_summary.feather
One-hot encoding: False
Final Val R2:  0.7119 
Final Test R2: 0.6647
Total time: 71.85 minutes

CPU times: user 6h 11min 51s, sys: 3h 51min 19s, total: 10h 3min 11s
Wall time: 1h 11min 51s
