# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
# Load the lab_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats executed cells with Black -
# confirm the extension is installed in the environment used to run this notebook.
%load_ext lab_black

In [2]:
## import warnings
import gc
import glob
import itertools
import math
import multiprocessing
import os
import pickle
import re
import time
from collections import Counter
from datetime import date

import numpy as np
import pandas as pd
import pyarrow
import p_tqdm
from pyhere import here

# Set before geopandas is imported so the setting is in place at import time.
os.environ["USE_PYGEOS"] = "0"
import geopandas

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import (
    train_test_split,
    KFold,
    LeaveOneGroupOut,
    cross_val_score,
    GridSearchCV,
    cross_val_predict,
)
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

from task_modeling_utils import *
from prediction_utils import *

In [3]:
# Collect every per-run results file written by the two-sensor modeling jobs
# and merge them into a single frame.
#
# glob gives no ordering guarantee (it depends on the file system), so the
# paths are sorted: the order handed to merge_files - and therefore any
# positional lookup into `results` - is then the same on every run and machine.
file_pattern = str(here("data", "results", "2_sensor_results_*_*.csv"))
files = sorted(glob.glob(pathname=file_pattern))
results = merge_files(files)
files

['C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_1_2023-03-17.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_2_2023-03-18.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_3_2023-03-18.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_4_2023-03-19.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_5_2023-03-21.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_6_2023-03-20.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\code\\3_task_modeling\\..\\..\\data\\results\\2_sensor_results_7_2023-03-23.csv',
 'C:\\Users\\Cullen\\Desktop\\GitHub\\crop-modeling\\co

In [4]:
# Show the score columns (column 24 up to, but not including, the last one)
# of the run with the highest validation R2.
#
# The row is located by *position* because it is passed to .iloc. An index
# label (e.g. from sort_values().index[-1]) only equals the position when the
# frame has a default RangeIndex, and sort_values() places NaN last, so a run
# with a missing val_R2 would be reported as the best one. np.nanargmax skips
# NaN and raises ValueError if every val_R2 is NaN.
#
# To rank by the held-out score instead, use `results.test_R2` below.
top = int(np.nanargmax(results.val_R2.to_numpy(dtype=float)))
results.iloc[top : top + 1, 24:-1]
# results.iloc[top : top + 1, 1:20]

Unnamed: 0,val_R2,val_r,val_r2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_cv_R2,demean_cv_r
1396,0.846564,0.920591,0.847488,0.967641,0.984107,0.968467,0.66043,0.837338,0.701135,0.310614,0.569358


In [5]:
# f1 = 'landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-1-12_lm-False_cm-True_wa-False_summary.feather'
# f2 = 'sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather'

In [6]:
# directory = here("data", "random_features", "summary")
# files = os.listdir(directory)
# files = list(f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints'))
# paramlist = list(itertools.combinations(files, 2))
# paramlist = list(itertools.product(paramlist, [True, False]))
# paramlist = list(tuple(merge(paramlist[i])) for i in range(len(paramlist)))
# paramlist = sorted(paramlist, key=lambda tup: tup[2])
# paramlist = paramlist[3:4]
# len(paramlist)
# paramlist = (i for i in paramlist)
# paramlist

In [7]:
# model_2_sensor(paramlist)

In [8]:
# from mpi4py.futures import MPIPoolExecutor
# i = 1

# if __name__ == "__main__":
#     max_workers = 1 # int(os.environ.get("SLURM_NTASKS", 4)) - 1

#     executor = MPIPoolExecutor(max_workers=max_workers)
#     output = executor.starmap(model_2_sensor, paramlist)
#     results = pd.DataFrame(output)
#     executor.shutdown()

#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'2_sensor_results_{i}_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")
#     results.to_csv(here("data","results", file_name), index=False)

In [9]:
# results

In [10]:
# %%time
# ##### With progress bar
# workers = os.cpu_count()
# if __name__ == "__main__":
#     output = []
#     for result in p_tqdm.p_map(model_2_sensor, paramlist):
#         output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'2_sensor_results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")
#     results.to_csv(here("data","results", file_name), index=False)

In [11]:
%%time
## TESTING
# Summary feature files to combine, one per sensor.
f1 = "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-1-12_lm-False_cm-True_wa-False_summary.feather"
f2 = "sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather"

# Run switches used throughout this cell.
he = True  # one-hot encode the district column
include_climate = True  # join the climate summary columns onto the features
n_splits = 5  # number of K-fold splits

# Substrings selecting which climate columns are kept.
# Other options left here for quick switching:
#   ["tmp", "ndvi", "pre"]
#   ["tmp", "ndvi"]
variable_groups = ["ndvi"]

#########################################     SET PARAMS    #########################################
# Decode each file name into its descriptive fields with split_fn.
# country_code is unpacked from both names; the value taken from f2 is the one
# that remains bound afterwards.
(
    satellite1,
    bands1,
    country_code,
    points1,
    yrs1,
    mns1,
    num_features1,
    limit_months1,
    crop_mask1,
    weighted_avg1,
) = split_fn(f1)

(
    satellite2,
    bands2,
    country_code,
    points2,
    yrs2,
    mns2,
    num_features2,
    limit_months2,
    crop_mask2,
    weighted_avg2,
) = split_fn(f2)

print(
    f"\nBegin with paramters:\n\t{f1}\n\t{f2}\n\tOne-hot encoding: {he}\n",
    flush=True,
)

#########################################     READ DATA    #########################################
# Both sensors' summary feature tables are read from the same directory.
features_1, features_2 = (
    pd.read_feather(here("data", "random_features", "summary", feather_name))
    for feather_name in (f1, f2)
)

# The climate table is only needed when climate variables are part of the model.
if include_climate:
    climate_df = pd.read_csv(here("data", "climate", "climate_summary.csv"))

#########################################     CLEAN DATA    #########################################
# Keep only the years covered by *both* sensors: the overlap starts at the
# later of the two first years and ends at the earlier of the two last years.
min_year = max(min(features_1.year), min(features_2.year))
max_year = min(max(features_1.year), max(features_2.year))

# Filter and drop in one expression that returns a new frame. Calling
# drop(..., inplace=True) on a boolean-filtered frame mutates a derived object
# and is the pattern pandas flags with SettingWithCopyWarning.
# 'crop_perc' is removed when present; errors="ignore" tolerates its absence.
features_1 = features_1[
    (features_1.year >= min_year) & (features_1.year <= max_year)
].drop(columns=["crop_perc"], errors="ignore")
features_2 = features_2[
    (features_2.year >= min_year) & (features_2.year <= max_year)
].drop(columns=["crop_perc"], errors="ignore")

#########################################     JOIN FEATURES    #########################################
# Key columns shared by both tables; reused further down as the set of
# non-feature columns.
drop_cols = ["district", "year", "yield_mt"]

# Index both tables on the keys and prefix their columns so the two sensors'
# features remain distinguishable after the join.
features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

# Join on the shared index, then discard every row that has a missing value
# in any column.
features = features_1.join(features_2).reset_index()
rows_with_gaps = features.isna().any(axis=1)
features = features[~rows_with_gaps]

#########################################    JOIN CLIMATE VARS    #########################################
if include_climate:
    # Select the climate columns whose name contains any of the requested
    # variable-group substrings, in the order the groups are listed.
    column_names = climate_df.columns.to_series()
    climate_cols = [
        col
        for var in variable_groups
        for col in climate_df.columns[column_names.str.contains(var)]
    ]
    keep_cols = [*drop_cols, *climate_cols]

    climate_df = climate_df.loc[:, keep_cols]

    # Join the selected climate columns onto the features via the key columns.
    features = (
        features.set_index(drop_cols).join(climate_df.set_index(drop_cols)).reset_index()
    )
    # Discard feature rows from years later than the last year in the climate table.
    features = features[features.year <= max(climate_df.year)]

#########################################    STANDARDIZE FEATURES    #########################################
# The key columns are moved into the index so that only feature columns are scaled.
# NOTE(review): the scaler is fit on every row here, before the train/test
# split made later in this cell - confirm this is intended.
features = features.set_index(drop_cols)
scaler = StandardScaler()
features = pd.DataFrame(
    scaler.fit_transform(features.values), index=features.index
).reset_index()
# The scaled frame is rebuilt from a bare array, so feature columns are now
# labelled by position; cast all labels to str.
features.columns = features.columns.astype(str)

#########################################     CLEAN AND COPY    #########################################
# Bookkeeping values reused when the results are saved and when the
# regularization groups are built.
yrs = f"{min(features.year)}-{max(features.year)}"
n_fts_1, n_fts_2 = features_1.shape[1], features_2.shape[1]
n_districts = len(features.district.unique())

if include_climate:
    # Number of climate columns once the key columns are excluded.
    n_climate_cols = climate_df.shape[1] - len(drop_cols)

    # One consecutive id (1, 2, ...) for every started block of 12 climate columns.
    n_climate_groups = [
        block_start // 12 + 1 for block_start in range(0, n_climate_cols, 12)
    ]

# Keep the key columns plus the log-transformed yield for the prediction tables.
crop_yield = features.loc[:, list(drop_cols)].copy()
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

# The per-sensor frames are no longer needed; release them.
del features_1, features_2
gc.collect()

#########################################    HOT ENCODE    #########################################
# When enabled, the district column is replaced by one indicator column per
# district (no level dropped). 'district' then no longer exists as a column,
# so it is taken out of the list of columns to drop later.
if he:
    features = pd.get_dummies(features, columns=["district"], drop_first=False)
    drop_cols.remove("district")

#########################################     TRAIN / TEST SPLIT    #########################################
# Predictors are everything except the remaining key columns; the target is
# log10(yield + 1).
x_all = features.drop(columns=drop_cols)
y_all = np.log10(features["yield_mt"].to_numpy() + 1)

# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=0,
)

# The combined frame has been split into x/y; release it.
del features
gc.collect()

#########################################     K-FOLD CV    ###########################################
### SETUP
tic = time.time()
kfold = KFold(n_splits=n_splits)
# Candidate regularization strengths: 1e-8, 1e-7, ..., 1e8.
alphas = {"alpha": np.logspace(-8, 8, base=10, num=17)}

### LAMBDA INDICES
# Each regularization group is a contiguous column range [start[k], end[k]).
# Columns are laid out as: sensor-1 features | sensor-2 features |
# climate columns (optional, in blocks of 12) | district dummies (optional).
#
# The boundaries are derived from the actual block sizes and de-duplicated, so
# no group is ever empty (e.g. climate without one-hot encoding, where the
# climate block ends exactly at the last column) and a climate block whose
# width is not a multiple of 12 ends at its real last column.
n_columns = x_train.shape[1]
sensor_end = n_fts_1 + n_fts_2
boundaries = {0, n_fts_1, sensor_end, n_columns}

if include_climate:
    climate_end = sensor_end + n_climate_cols
    # Interior cut points every 12 columns, plus the end of the climate block.
    boundaries.update(range(sensor_end + 12, climate_end, 12))
    boundaries.add(climate_end)

boundaries = sorted(boundaries)
start = boundaries[:-1]
end = boundaries[1:]

print(f'Group indicies {start}\n\t\t  {end}', end='\n\n')

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
# One penalty is tuned per column group delimited by `start` / `end`.
tuning_kwargs = dict(
    X=x_train,
    y=y_train,
    grid=alphas.get("alpha"),
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(**tuning_kwargs)

### PREDICT WITH BEST HYPERPARAMETER(S)
# Out-of-fold predictions on the training rows, then in-sample and held-out predictions.
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

print(f"""
Finish:
{f1}
{f2}
One-hot encoding: {he}
Final Val R2:  {r2_score(y_train, val_predictions):0.4f} 
Final Test R2: {r2_score(y_test, test_predictions):0.4f}
Total time: {(time.time()-tic)/60:0.2f} minutes
""", flush=True)

#########################################     DE-MEAN R2    #########################################
def _district_demeaned(frame, column):
    """Return `column` of `frame` minus its mean within each district."""
    return frame[column] - frame.groupby("district")[column].transform("mean")


# Predictions for every row, floored at zero.
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# --- training rows: cross-validated predictions, de-meaned per district ---
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["split"], index=x_train.index
)
train_split = train_split.join(crop_yield.loc[crop_yield.index.isin(x_train.index)])
train_split["cv_prediction"] = np.maximum(val_predictions, 0)
train_split["demean_cv_yield"] = _district_demeaned(train_split, "log_yield")
train_split["demean_cv_prediction"] = _district_demeaned(train_split, "cv_prediction")
# Test-only columns are left empty for training rows.
train_split["demean_test_yield"] = np.nan
train_split["demean_test_prediction"] = np.nan

# --- test rows: model predictions, de-meaned per district ---
test_split = pd.DataFrame(
    np.repeat("test", len(x_test)), columns=["split"], index=x_test.index
)
test_split = test_split.join(crop_yield.loc[crop_yield.index.isin(x_test.index)])
# CV-only columns are left empty for test rows.
test_split["cv_prediction"] = np.nan
test_split["demean_cv_yield"] = np.nan
test_split["demean_cv_prediction"] = np.nan
test_split["demean_test_yield"] = _district_demeaned(test_split, "log_yield")
test_split["demean_test_prediction"] = _district_demeaned(test_split, "prediction")

#########################################     SAVE RESULTS    #########################################
# Pearson correlations are computed once each and reused for both the r and
# r**2 entries. Naming: *_R2 is the coefficient of determination (r2_score),
# *_r the Pearson correlation, *_r2 its square.
val_r = pearsonr(val_predictions, y_train)[0]
train_r = pearsonr(train_predictions, y_train)[0]
test_r = pearsonr(test_predictions, y_test)[0]
demean_cv_r = pearsonr(
    train_split.demean_cv_yield, train_split.demean_cv_prediction
)[0]
demean_test_r = pearsonr(
    test_split.demean_test_yield, test_split.demean_test_prediction
)[0]

# One-row record describing this run; key order is the column order of the saved CSV.
d = {
    "country": country_code[0],
    "year_range": yrs,
    # sensor 1 description
    "satellite_1": satellite1[0],
    "bands_1": bands1,
    "num_features_1": num_features1,
    "points_1": points1,
    "month_range_1": mns1,
    "limit_months_1": limit_months1,
    "crop_mask_1": crop_mask1,
    "weighted_avg_1": weighted_avg1,
    # sensor 2 description
    "satellite_2": satellite2[0],
    "bands_2": bands2,
    "num_features_2": num_features2,
    "points_2": points2,
    "month_range_2": mns2,
    "limit_months_2": limit_months2,
    "crop_mask_2": crop_mask2,
    "weighted_avg_2": weighted_avg2,
    # run configuration and sample sizes
    "hot_encode": he,
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    # tuning outcome, each wrapped in a one-element list
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    # scores
    "val_R2": r2_score(y_train, val_predictions),
    "val_r": val_r,
    "val_r2": val_r ** 2,
    "train_R2": r2_score(y_train, train_predictions),
    "train_r": train_r,
    "train_r2": train_r ** 2,
    "test_R2": r2_score(y_test, test_predictions),
    "test_r": test_r,
    "test_r2": test_r ** 2,
    # district-demeaned scores
    "demean_cv_R2": r2_score(
        train_split.demean_cv_yield, train_split.demean_cv_prediction
    ),
    "demean_cv_r": demean_cv_r,
    "demean_cv_r2": demean_cv_r ** 2,
    "demean_test_R2": r2_score(
        test_split.demean_test_yield, test_split.demean_test_prediction
    ),
    "demean_test_r": demean_test_r,
    "demean_test_r2": demean_test_r ** 2,
}

today = date.today().strftime("%Y-%m-%d")

# File names carry today's date; runs with climate variables also carry the
# joined variable-group names.
if not include_climate:
    file_name = f"2_sensor_{today}.csv"
    pd_file_name = f"2_sensor_predictions_{today}.csv"
else:
    file_name = f"2_sensor_{'_'.join(variable_groups)}_{today}.csv"
    pd_file_name = f"2_sensor_{'_'.join(variable_groups)}_predictions_{today}.csv"

# Run summary built from the record `d`.
data = pd.DataFrame(d)
data.to_csv(here("data", "results", file_name), index=False)

# Per-row predictions: training rows followed by test rows.
prediction_df = pd.concat([train_split, test_split])
prediction_df.to_csv(here("data", "results", pd_file_name), index=False)


Begin with paramters:
	landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-1-12_lm-False_cm-True_wa-False_summary.feather
	sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather
	One-hot encoding: True

Group indicies [0, 12288, 24288, 24300]
		  [12288, 24288, 24300, 24369]

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 1: 100.0
	Val R2 1: 0.7178

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 2: 10.0
	Val R2 2: 0.7463

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 3: 0.1
	Val R2 3: 0.7527

1e-08 1e-07 1e-06 1e-05 1e-04 1e-03 1e-02 1e-01 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 1e+06 1e+07 1e+08 
	Best λ 4: 1e-08
	Val R2 4: 0.8428


Finish:
landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-point