# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
%load_ext lab_black

In [2]:
import os
import glob
from pyhere import here
import random

import itertools

from task_modeling_utils import *

In [8]:
# One reproducible random_state per outer split.
n_splits = 10
random.seed(42)
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

# Candidate feature-summary files, skipping repository housekeeping entries.
directory = here("data", "random_features", "summary")
files = [f for f in os.listdir(directory) if f not in {".gitkeep", ".ipynb_checkpoints"}]
# Optional filters, currently disabled:
# files = [f for f in files if not (f.startswith("landsat-8") and "lm-False" in f)]
# files = [f for f in files if not (f.startswith("sentinel") and "lm-True" in f)]
files = [f for f in files if "cm-True" in f and "wa-False" in f]

# Every unordered pair of files, except pairs where both names contain "landsat-c2".
combinations = [
    pair
    for pair in itertools.combinations(files, 2)
    if not all("landsat-c2" in name for name in pair)
]

# One keyword-argument set per (file pair, split).
kwarg_list = [
    dict(
        f1=f1,
        f2=f2,
        he=True,
        anomaly=False,
        split=split,
        random_state=random_state,
        include_climate=False,
        variable_groups=None,
        n_splits=5,
        return_oos_predictions=False,
    )
    for f1, f2 in combinations
    for split, random_state in enumerate(random_seeds)
]
len(kwarg_list)

900

In [None]:
# Single run with climate variables (NDVI group) that also returns out-of-sample predictions.
single_run_kwargs = dict(
    f1="landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-4-9_lm-True_cm-True_wa-False_summary.feather",
    f2="sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather",
    he=True,
    anomaly=True,
    split=0,
    random_state=116739,
    include_climate=True,
    variable_groups=["ndvi"],
    n_splits=5,
    return_oos_predictions=True,
)
results, oos_preds = model_2_sensor(**single_run_kwargs)

In [None]:
# Load the saved out-of-sample predictions and keep a single run:
# NDVI climate variables, split 0, random_state 670487.
oos = pd.read_csv(
    here(
        "data",
        "results",
        "2_sensor_oos_predictions_n-splits-10_2023-05-22_rcf_climate.csv",
    )
)
# Take an explicit copy: new columns are assigned onto `oos` in a later cell, and
# doing that on a filtered slice raises pandas' SettingWithCopyWarning.
oos = oos.loc[
    (oos.variables == "ndvi") & (oos.split == 0) & (oos.random_state == 670487)
].copy()
oos

In [None]:
# Grouping used for de-meaning. Alternatives tried:
#   ["district", "data_fold"], ["district", "val_fold"],
#   ["district", "val_fold", "split", "random_state"]
group = ["district"]

# Both the observed values and the predictions are centred on the group mean of
# the *observed* log yield (not on the mean of the predictions).
group_mean_log_yield = oos.groupby(group)["log_yield"].transform("mean")
oos["demean_log_yield"] = oos["log_yield"] - group_mean_log_yield
oos["demean_oos_prediction"] = oos["oos_prediction"] - group_mean_log_yield
oos

In [None]:
# Independent copies of the rows belonging to each data fold.
test = oos[oos.data_fold == "test"].copy()
train = oos[oos.data_fold == "train"].copy()

In [None]:
# (train R2, test R2) on the de-meaned observed values vs de-meaned predictions.
tuple(
    r2_score(part.demean_log_yield, part.demean_oos_prediction)
    for part in (train, test)
)

In [None]:
# R2 of the raw (not de-meaned) predictions on the rows whose data_fold is "test".
r2_score(test.log_yield, test.oos_prediction)

In [None]:
# Load the top-model results for the climate run and keep the NDVI-only rows.
results = pd.read_csv(
    here("data", "results", "2_sensor_top-mod_n-splits-10_2023-05-22_rcf_climate.csv")
)
results = results[results.variables == "ndvi"]
# Only the last expression of a cell is echoed, so a bare mean in the middle of
# the cell would be computed and discarded; print it explicitly instead.
print(f"Mean val_R2: {results.val_R2.mean():0.4f}")
results

In [None]:
# List every top-model results file written for the 10-split runs.
file_pattern = str(here("data", "results", "2_sensor_top-mod_n-splits-10_*_*.csv"))
files = glob.glob(file_pattern)
files

In [None]:
# Combine all matching top-model result files into one table via merge_files.
file_pattern = str(here("data", "results", "2_sensor_top-mod_n-splits-10_*_*.csv"))
files = glob.glob(file_pattern)
results = merge_files(files)
results

In [None]:
# Inspect which columns the merged results table exposes.
results.columns

In [None]:
# Best validation R2 across all runs. Series.max() skips NaN, whereas the builtin
# max() compares element by element and gives order-dependent results with NaN.
results.val_R2.max()

In [None]:
# Columns identifying one model configuration: shared descriptors, then the same
# eight per-sensor descriptors for sensor 1 and sensor 2, then the encoding flag.
groupby_cols = [
    "country",
    "year_range",
    *(
        f"{field}_{sensor}"
        for sensor in (1, 2)
        for field in (
            "satellite",
            "bands",
            "num_features",
            "points",
            "month_range",
            "limit_months",
            "crop_mask",
            "weighted_avg",
        )
    ),
    "hot_encode",
]

In [None]:
# Average each score over splits for every configuration, best validation R2 first.
results_summary = (
    results.groupby(groupby_cols, as_index=False)
    .agg(
        {
            metric: "mean"
            for metric in (
                "val_R2",
                "test_R2",
                "demean_cv_R2",
                "demean_cv_r2",
                "demean_test_R2",
            )
        }
    )
    .sort_values("val_R2", ascending=False)
)
# Show only the top-ranked configuration.
results_summary.head(1)

In [None]:
# Feature-summary files for the Landsat (f1) and Sentinel-2 (f2) inputs.
f1 = (
    "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features"
    "_yr-2009-2021_mn-4-9_lm-True_cm-True_wa-False_summary.feather"
)
f2 = (
    "sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features"
    "_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather"
)

In [None]:
# Alternative Landsat file (weighted-average variant), kept for reference:
# f1 = "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-4-9_lm-True_cm-True_wa-True_summary.feather"
# Display the f2 feature summary table.
summary_path = here("data", "random_features", "summary", f2)
pd.read_feather(summary_path)

In [None]:
# Generate n random seeds.
# Re-seed first, as the other seed-generating cells in this notebook do; without
# it the seeds depend on how many random draws earlier cells have already made.
n_splits = 10
random.seed(42)
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

# Candidate feature-summary files, skipping repository housekeeping entries.
directory = here("data", "random_features", "summary")
files = [
    f for f in os.listdir(directory) if f not in (".gitkeep", ".ipynb_checkpoints")
]
# Drop landsat-8 files with lm-False and sentinel files with lm-True; keep cm-True only.
files = [f for f in files if not (f.startswith("landsat-8") and "lm-False" in f)]
files = [f for f in files if not (f.startswith("sentinel") and "lm-True" in f)]
files = [f for f in files if "cm-True" in f]
# files = [f for f in files if "wa-False" in f]

# Every unordered pair of files, except pairs where both names contain "landsat-c2".
combinations = list(itertools.combinations(files, 2))
combinations = [
    t for t in combinations if not ("landsat-c2" in t[0] and "landsat-c2" in t[1])
]

# One keyword-argument set per (file pair, split).
kwarg_list = [
    {
        "f1": f1,
        "f2": f2,
        "he": False,
        "split": split,
        "random_state": random_state,
        "include_climate": False,
        "variable_groups": None,
        "n_splits": 5,
    }
    for f1, f2 in combinations
    for split, random_state in enumerate(random_seeds)
]

# Split the work list into batches of 60 using the `chunks` helper.
chunked_kwarg_list = list(chunks(kwarg_list, 60))

In [None]:
# Debug check: enumerate restarts at 0 for the slice, so this prints 0 (not 9)
# when chunk index 9 exists, and prints nothing otherwise.
for i, chunk in enumerate(chunked_kwarg_list[9:10]):
    print(i)

In [None]:
# Show the chunks from index 12 onward.
chunked_kwarg_list[12:]

In [None]:
# Merge every 2-sensor results file, then display the file names in sorted order.
file_pattern = str(here("data", "results", "2_sensor_results_*_*.csv"))
files = glob.glob(file_pattern)
results = merge_files(files)
sorted(files)

In [None]:
# Show the run with the highest validation R2 (columns from position 10 onward).
#
# Series.argmax() returns the integer *position* of the maximum and skips NaN.
# The previous form, sort_values().index[-1], returned an index *label* that was
# then used positionally in .iloc, and sort_values() places NaN last, so any NaN
# score would have been selected as the "top" row.
# Alternative ranking by test score: top = results.test_R2.argmax()
top = results.val_R2.argmax()
results.iloc[top : top + 1, 10:]
# results.iloc[top : top + 1, 1:20]

In [None]:
# Feature-summary files for the Landsat (f1) and Sentinel-2 (f2) inputs (mn-1-12, lm-False).
f1 = (
    "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features"
    "_yr-2009-2021_mn-1-12_lm-False_cm-True_wa-False_summary.feather"
)
f2 = (
    "sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features"
    "_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather"
)

In [None]:
# Reset the RNG so the seed list is reproducible.
random.seed(42)

# Number of random splits to perform, with one seed per split.
n_splits = 100
random_seeds = [random.randint(0, 1_000_000) for _ in range(n_splits)]

# One positional-argument tuple per split: (f1, f2, "True", split, random_state).
# NOTE(review): the third element is the string "True", not a boolean; confirm
# the consumer of paramlist expects a string.
paramlist = [(f1, f2, "True", *indexed_seed) for indexed_seed in enumerate(random_seeds)]

In [13]:
%%time
## TESTING  
# Manual single-run configuration; the names match the keyword arguments passed
# to model_2_sensor elsewhere in this notebook.
f1 = (
    "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features"
    "_yr-2009-2021_mn-4-9_lm-True_cm-True_wa-False_summary.feather"
)
f2 = (
    "sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features"
    "_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather"
)
he, anomaly = True, True
split, random_state = 0, 670487
n_splits = 5
include_climate = False
# Alternatives tried: ['tmp', 'ndvi'] and ['ndvi'].
variable_groups = None
return_oos_predictions = True
  
  
  
#########################################     SET PARAMS    #########################################
# Unpack the ten values split_fn returns for each feature file name.
(
    satellite1, bands1, country_code, points1, yrs1, mns1,
    num_features1, limit_months1, crop_mask1, weighted_avg1,
) = split_fn(f1)

# country_code is overwritten here with the value parsed from f2.
(
    satellite2, bands2, country_code, points2, yrs2, mns2,
    num_features2, limit_months2, crop_mask2, weighted_avg2,
) = split_fn(f2)

# Label for the climate-variable selection; "rcf" when no groups are given.
variable_groups_str = "rcf" if variable_groups is None else "_".join(variable_groups)

print(f"""
Begin with paramters:
    F1: {f1}
    F2: {f2}
    One-hot encoding: {he}
    Anomaly: {anomaly}
    Split: {split}
    Random state: {random_state}
    N-splits: {n_splits}
    Include climate: {include_climate}
    Climate vars: {variable_groups_str}
""", flush=True)

#########################################     READ DATA    #########################################
# Load both feature summaries from data/random_features/summary.
features_1, features_2 = (
    pd.read_feather(here('data', 'random_features', 'summary', file_name))
    for file_name in (f1, f2)
)
# The climate table is only needed when climate variables are part of the model.
if include_climate:
    climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))

#########################################     CLEAN DATA    #########################################
# Restrict both tables to the years they have in common (inclusive bounds).
min_year = max(min(features_1.year), min(features_2.year))
max_year = min(max(features_1.year), max(features_2.year))

# Keep the shared year range and drop the crop_perc column when it is present.
features_1 = features_1[features_1.year.between(min_year, max_year)].drop(
    columns=['crop_perc'], errors='ignore'
)
features_2 = features_2[features_2.year.between(min_year, max_year)].drop(
    columns=['crop_perc'], errors='ignore'
)

#########################################     JOIN FEATURES    #########################################
# Key columns shared by both tables; they become the join index.
drop_cols = ['district', 'year', 'yield_mt']

# Prefix the feature columns so the two sensors stay distinguishable after the join.
features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

# Left-join f2 onto f1, then discard every row with a missing value.
features = features_1.join(features_2).reset_index().dropna(axis=0, how="any")

# Modelling target: log10(yield_mt + 1).
features['log_yield'] = np.log10(features['yield_mt'] + 1)

#########################################    JOIN CLIMATE VARS    #########################################
if include_climate:
    # Collect the climate columns whose name matches any requested variable group.
    matched_cols = []
    for var in variable_groups:
        is_match = climate_df.columns.to_series().str.contains(var)
        matched_cols.extend(climate_df.columns[is_match].tolist())

    keep_cols = [*drop_cols, *matched_cols]
    climate_df = climate_df.loc[:, keep_cols]

    # Join on the shared key columns, then drop years beyond the climate record.
    indexed_features = features.set_index(drop_cols)
    indexed_climate = climate_df.set_index(drop_cols)
    features = indexed_features.join(indexed_climate).reset_index()
    features = features[features.year <= max(climate_df.year)]

# From here on the target column is also treated as a non-feature column.
drop_cols += ['log_yield']

#########################################     CALCULATE ANOMALY   #########################################
if anomaly:
    # Replace every remaining column (yield columns included) by its deviation
    # from the mean of that column within the same district.
    features = features.set_index(['year', 'district'])
    var_cols = features.columns
    district_means = features.groupby(['district'], as_index=True)[var_cols].transform('mean')
    features = (features[var_cols] - district_means).reset_index(drop=False)

#########################################     CLEAN AND COPY    #########################################
# Bookkeeping used later: year-range label, feature counts per sensor, district count.
yrs = f"{min(features.year)}-{max(features.year)}"
n_fts_1 = features_1.shape[1]
n_fts_2 = features_2.shape[1]
n_districts = len(features.district.unique())

if include_climate:
    # Count only the climate variables. drop_cols already contains 'log_yield'
    # at this point, a column climate_df never has, so subtracting
    # len(drop_cols) from the column count would come up one short.
    n_climate_cols = sum(col not in drop_cols for col in climate_df.columns)

    # 1-based group numbers, one per started block of 12 climate columns.
    n_climate_groups = [
        group_number
        for group_number, _block_start in enumerate(range(0, n_climate_cols, 12), start=1)
    ]

# Keep the non-feature columns so they can be re-attached to the predictions.
crop_yield = features.copy().loc[:, tuple(drop_cols)]

# The per-sensor tables are no longer needed; release them.
del features_1, features_2
gc.collect()

#########################################    HOT ENCODE    #########################################
if he:
    # get_dummies replaces 'district' with float indicator columns, so it must
    # no longer be listed among the columns to drop.
    drop_cols.remove("district")
    features = pd.get_dummies(features, columns=["district"], drop_first=False, dtype=float)

#########################################     TRAIN/TEST SPLIT    #########################################
# Predictors are everything except the non-feature columns; the target is log_yield.
x_all = features.drop(columns=drop_cols)
y_all = features["log_yield"]

# 80/20 split controlled by random_state.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=random_state,
)

# The combined table is no longer needed; release it.
del features
gc.collect()

#########################################    STANDARDIZE FEATURES    #########################################
# Fit the scaler on the training rows only, then apply it to both partitions,
# keeping the original column names and row index.
scaler = StandardScaler().fit(x_train)
x_train, x_test = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (x_train, x_test)
)


Begin with paramters:
    F1: landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-4-9_lm-True_cm-True_wa-False_summary.feather
    F2: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-False_summary.feather
    One-hot encoding: True
    Anomaly: True
    Split: 0
    Random state: 670487
    N-splits: 5
    Include climate: False
    Climate vars: rcf

CPU times: user 2.58 s, sys: 2.28 s, total: 4.87 s
Wall time: 2.88 s


In [None]:

#########################################     K-FOLD CV    ###########################################
### SETUP
# Timer; the elapsed minutes are reported in the "Finish" summary printed later in this cell.
tic = time.time()
# K-fold splitter; no shuffle argument is passed.
kfold = KFold(n_splits=n_splits)
# Candidate regularisation strengths: 10**-1, 10**0 and 10**1.
alphas = {"alpha": np.logspace(-1, 1, base=10, num=3)}

### LAMBDA INDICES
# start[k] / end[k] are passed to kfold_rr_multi_lambda_tuning below; presumably
# they delimit the column range of x_train that shares one penalty - TODO confirm
# against that helper. The first two groups are the f1 and f2 feature columns.
start = [0, n_fts_1]
end = [n_fts_1, x_train.shape[1]]

if include_climate:
    # Boundary between the satellite features and the first climate column.
    start.append(n_fts_1 + n_fts_2)
    end.append(n_fts_1 + n_fts_2)

    # One boundary after every block of 12 climate columns.
    # NOTE(review): when he is False and the climate columns are the last columns
    # of x_train, the final boundary equals x_train.shape[1], which would leave a
    # zero-width last group - confirm the tuner tolerates that.
    for n in n_climate_groups:
        x = n * 12
        y = n_fts_1 + n_fts_2 + x
        start.append(y)
        end.append(y)

if not include_climate and he:
    # Boundary before the district indicator columns, which get_dummies appends last.
    start.append(x_train.shape[1] - n_districts)
    end.append(x_train.shape[1] - n_districts)

# x_train.shape[1] was placed second in `end`; sorting moves it to the last
# position so each end lines up with its start.
end.sort()

print(f'Group indicies {start}\n\t\t  {end}', end='\n\n')

### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER(S)
# Tune the penalties over the candidate grid for the column groups given by
# start/end, and get back a model fitted with the selected values.
best_lambdas, best_scores, best_model = kfold_rr_multi_lambda_tuning(
    X=x_train,
    y=y_train,
    grid=alphas["alpha"],
    n_splits=n_splits,
    start=start,
    end=end,
    static_lam=1,
    verbose=2,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)
### PREDICT WITH BEST HYPERPARAMETER(S)
# Cross-validated predictions for the training rows, plus direct predictions
# from the fitted model for the training and test rows.
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions, test_predictions = (
    best_model.predict(part) for part in (x_train, x_test)
)

if not anomaly:
    # Outside anomaly mode predictions are floored at zero.
    val_predictions, train_predictions, test_predictions = (
        np.maximum(predicted, 0)
        for predicted in (val_predictions, train_predictions, test_predictions)
    )

print(f"""
Finish:
    F1: {f1}
    F2: {f2}
    One-hot encoding: {he}
    Anomaly: {anomaly}
    Split: {split}
    Random state: {random_state}
    N-splits: {n_splits}
    Include climate: {include_climate}
    Climate vars: {variable_groups_str}
    Final Val R2:  {r2_score(y_train, val_predictions):0.4f} 
    Final Test R2: {r2_score(y_test, test_predictions):0.4f}
    Total time: {(time.time()-tic)/60:0.2f} minutes
""", flush=True)

#########################################     DE-MEAN TRAIN R2    #########################################
# Build one 1-based fold number per training row: fold k contributes as many
# labels as it has validation rows. The splitter is iterated once; rebuilding
# list(kfold.split(...)) inside the loop repeated the whole split for every fold.
# NOTE(review): the labels are assigned positionally, which assumes the folds'
# validation rows are contiguous and in row order - confirm for this splitter.
fold_list = []
for fold_number, (_train_idx, val_idx) in enumerate(kfold.split(y_train), start=1):
    fold_list.extend([fold_number] * len(val_idx))

# Training rows tagged with their data fold, re-joined to the non-feature columns.
train_split = pd.DataFrame(
    np.repeat("train", len(x_train)), columns=["data_fold"], index=x_train.index
)
train_split = train_split.join(
    crop_yield.copy()[crop_yield.index.isin(x_train.index)]
)
# Cross-validated predictions and the fold that produced each of them.
train_split["oos_prediction"] = val_predictions
train_split["val_fold"] = fold_list
train_split = demean_by_group(train_split, predicted="oos_prediction", group=["district"])

#########################################     DE-MEAN TEST R2    #########################################
# Test rows tagged with their data fold, re-joined to the non-feature columns.
test_rows = crop_yield.loc[crop_yield.index.isin(x_test.index)].copy()
test_split = pd.DataFrame(
    {"data_fold": ["test"] * len(x_test)}, index=x_test.index
).join(test_rows)
test_split["oos_prediction"] = test_predictions
# The test partition gets the fold number after the last validation fold.
test_split["val_fold"] = n_splits + 1
test_split = demean_by_group(test_split, predicted="oos_prediction", group=["district"])

#########################################     OUT OF SAMPLE PREDICTIONS    #########################################
# Stack train and test rows and tag every row with the run's identifying settings.
oos_preds = pd.concat([train_split, test_split]).assign(
    split=split,
    random_state=random_state,
    variables=variable_groups_str,
    anomaly=anomaly,
)

#########################################     SCORES    #########################################
# R2 for the cross-validated, training and test predictions.
val_R2 = r2_score(y_train, val_predictions)
train_R2 = r2_score(y_train, train_predictions)
test_R2 = r2_score(y_test, test_predictions)

# First element of the pearsonr result for the same three comparisons.
val_r = pearsonr(val_predictions, y_train)[0]
train_r = pearsonr(train_predictions, y_train)[0]
test_r = pearsonr(test_predictions, y_test)[0]

# De-meaned scores are only computed outside anomaly mode; otherwise they stay NaN.
demean_cv_R2 = demean_cv_r = demean_test_R2 = demean_test_r = np.nan
if not anomaly:
    demean_cv_R2 = r2_score(train_split.demean_log_yield, train_split.demean_oos_prediction)
    demean_cv_r = pearsonr(train_split.demean_log_yield, train_split.demean_oos_prediction)[0]
    demean_test_R2 = r2_score(test_split.demean_log_yield, test_split.demean_oos_prediction)
    demean_test_r = pearsonr(test_split.demean_log_yield, test_split.demean_oos_prediction)[0]

#########################################     SAVE RESULTS    #########################################
# One result record for this run: run settings, the descriptors parsed from
# both file names, sample sizes, tuned penalties and every score.
# Naming convention: *_R2 comes from r2_score, *_r is the first element of
# pearsonr, and *_r2 is that value squared.
d = {
    # --- run settings ---
    "split": split,
    "random_state": random_state,
    "variables": variable_groups_str,
    "anomaly": anomaly,
    # --- descriptors parsed from the file names by split_fn ---
    # country_code and satellite1/2 are indexed with [0]; the other values are
    # stored as returned.
    "country": country_code[0],
    "year_range": yrs,
    "satellite_1": satellite1[0],
    "bands_1": bands1,
    "num_features_1": num_features1,
    "points_1": points1,
    "month_range_1": mns1,
    "limit_months_1": limit_months1,
    "crop_mask_1": crop_mask1,
    "weighted_avg_1": weighted_avg1,
    "satellite_2": satellite2[0],
    "bands_2": bands2,
    "num_features_2": num_features2,
    "points_2": points2,
    "month_range_2": mns2,
    "limit_months_2": limit_months2,
    "crop_mask_2": crop_mask2,
    "weighted_avg_2": weighted_avg2,
    "hot_encode": he,
    # --- sample sizes ---
    "total_n": len(x_all),
    "train_n": len(x_train),
    "test_n": len(x_test),
    # --- tuning output, each wrapped in a one-element list ---
    # presumably so each occupies a single cell when the record becomes a table row - TODO confirm
    "best_reg_param": [best_lambdas],
    "mean_of_val_R2": [best_scores],
    # --- scores ---
    "val_R2": val_R2,
    "val_r": val_r,
    "val_r2": val_r ** 2,
    "train_R2": train_R2,
    "train_r": train_r,
    "train_r2": train_r ** 2,
    "test_R2": test_R2,
    "test_r": test_r,
    "test_r2": test_r ** 2,
    # The de-meaned scores are NaN when anomaly is True.
    "demean_cv_R2": demean_cv_R2,
    "demean_cv_r": demean_cv_r,
    "demean_cv_r2": demean_cv_r ** 2,
    "demean_test_R2": demean_test_R2,
    "demean_test_r": demean_test_r,
    "demean_test_r2": demean_test_r ** 2,
}
# Return statements left commented out because this cell runs at notebook top level:
# if return_oos_predictions:
#     return d, oos_preds
# else:
#     return d