# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

In [2]:
def split_fn(file_name):
    f            = file_name.split(sep="_")
    satellite    = f[0],
    bands        = f[1].replace("bands-", "")
    country_code = f[2],
    points       = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    yrs          = f[5].replace("yr-", "")
    mns          = f[6].replace("mn-", "")
    limit_months = f[7].replace("lm-", "")
    crop_mask    = f[8].replace("cm-", "")
    weighted_avg = f[9].replace("wa-", "")
    
    return satellite, bands, country_code, points, yrs, mns, num_features, limit_months, crop_mask, weighted_avg

def merge(x, bases = (tuple, list)):
    for e in x:
        if type(e) in bases:
            for e in merge(e, bases):
                yield e
        else:
            yield e

In [3]:
files = os.listdir(here("data", "random_features", 'summary'))
files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]
# files = files[0:8]
paramlist = list(itertools.product(files, files))
paramlist = [tuple(set(paramlist[i])) for i in range(len(paramlist))]
paramlist = [x for x in paramlist if len(x) > 1] 
for i in range(len(paramlist)):
    paramlist[i] = tuple(merge(paramlist[i]))

In [4]:
point_pattern = re.compile("20k-points")
wa_pattern = re.compile("cm-False")

paramlist = [t for t in paramlist if not (bool(point_pattern.search(t[0])) & bool(wa_pattern.search(t[0])))]
paramlist = [t for t in paramlist if not (bool(point_pattern.search(t[1])) & bool(wa_pattern.search(t[1])))]
len(paramlist)

1892

In [5]:
paramlist[0]

('sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-features_yr-2016-2022_mn-4-9_lm-True_cm-True_wa-True_summary.feather',
 'sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-False_wa-False_summary.feather')

In [6]:
# for params in paramlist[0:1]:
def model_2_sensors(params):
#########################################     SET PARAMS    #########################################    
    f1         = params[0]
    f2         = params[1]

    satellite1, bands1, country_code, points1, yrs1, mns1,\
    num_features1, limit_months1, crop_mask1, weighted_avg1 = split_fn(f1)
    
    satellite2, bands2, country_code, points2, yrs2, mns2,\
    num_features2, limit_months2, crop_mask2, weighted_avg2 = split_fn(f2)
    alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    
#########################################     READ DATA    #########################################
    features_1 = pd.read_feather(here('data', 'random_features', 'summary', f1))
    features_2 = pd.read_feather(here('data', 'random_features', 'summary', f2))
    
#########################################     CLEAN DATA    #########################################  
    min_year = max(min(features_1.year), min(features_2.year))
    max_year = min(max(features_1.year), max(features_2.year))
    
    features_1 = features_1[features_1.year >= min_year]
    features_2 = features_2[features_2.year >= min_year]
    
    features_1 = features_1[features_1.year <= max_year]
    features_2 = features_2[features_2.year <= max_year]
    
    features_1.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
    features_2.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
    
    index_cols = ['district', 'year', 'yield_mt']
    
    features_1 = features_1.set_index(index_cols).add_prefix("f1_")
    features_2 = features_2.set_index(index_cols).add_prefix("f2_")
    
#########################################     JOIN DATA    #########################################  
    features = features_1.join(features_2).reset_index()
    features = features[~features.isna().any(axis = 1)]

#########################################     CALCULATE ANOMALY   #########################################
    features['yield_mt'] = np.log10(features['yield_mt'] + 1)
    features.set_index(['year', 'district'], inplace=True)
    var_cols = features.columns
    features = features[var_cols] - features.groupby(['district'], as_index=True)[var_cols].transform('mean')
    features.reset_index(drop=False, inplace=True)

#########################################     K-FOLD SPLIT    #########################################    
    x_all = features.drop(index_cols, axis=1)
    x_all = StandardScaler().fit_transform(x_all)
    x_all = pd.DataFrame(x_all)
    y_all = np.log10(features.yield_mt.to_numpy() + 1)
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV    ###########################################
    kfold = KFold()
    ridge = Ridge()
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
    kfold_ridge_reg.fit(x_train, y_train)
    kfold_best_model = kfold_ridge_reg.best_estimator_
    ### VALIDATION PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)   
    ### TRAIN AND TEST PREDICT
    y_pred_train_k = kfold_best_model.predict(x_train)
    y_pred_test_k  = kfold_best_model.predict(x_test)

#########################################     LOGO SPLIT   ###########################################
    x_train_g = features[features.year < max(features.year)].drop(index_cols, axis=1)
    x_train_g = StandardScaler().fit_transform(x_train_g)
    x_train_g = pd.DataFrame(x_train_g)
    y_train_g = features[features.year < max(features.year)].yield_mt
    g_train_g = features[features.year < max(features.year)].year.ravel()

    x_test_g = features[features.year == max(features.year)].drop(index_cols, axis=1)
    x_test_g = StandardScaler().fit_transform(x_test_g)
    x_test_g = pd.DataFrame(x_test_g)
    y_test_g = features[features.year == max(features.year)].yield_mt
    g_test_g = features[features.year == max(features.year)].year

#########################################     LOGO CV    ###########################################
    logo = LeaveOneGroupOut()
    ridge = Ridge()
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
    logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
    logo_best_model = logo_ridge_reg.best_estimator_
    ### VALIDATION PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo)   
    ### TRAIN AND TEST PREDICT
    logo_train_pred = logo_best_model.predict(x_train_g)
    logo_test_pred  = logo_best_model.predict(x_test_g)

#########################################     SAVE RESULTS    #########################################
    d = {
        'country': country_code,
        
        'satellite_1': satellite1[0],
        'bands_1': bands1,
        'num_features_1': num_features1,
        'points_1': points1, 
        'month_range_1': mns1,
        'limit_months_1': limit_months1,
        'crop_mask_1': crop_mask1,
        'weighted_avg_1': weighted_avg1,
        
        'satellite_2': satellite2[0],
        'bands_2': bands2,
        'num_features_2': num_features2,
        'points_2': points2, 
        'month_range_2': mns2,
        'limit_months_2': limit_months2,
        'crop_mask_2': crop_mask2,
        'weighted_avg_2': weighted_avg2,

        'kfold_total_n': len(x_all),
        'kfold_train_n': len(x_train),
        'kfold_test_n': len(x_test),
        
        'kfold_best_reg_param': list(kfold_ridge_reg.best_params_.values())[0],
        'kfold_mean_of_val_R2s': kfold_ridge_reg.best_score_,
        'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
        'kfold_val_r' : pearsonr(kfold_val_predictions, y_train)[0],
        'kfold_val_r2' : pearsonr(kfold_val_predictions, y_train)[0] ** 2,
        
        'kfold_train_R2': r2_score(y_train, y_pred_train_k),
        'kfold_train_r': pearsonr(y_pred_train_k, y_train)[0],
        'kfold_train_r2': pearsonr(y_pred_train_k, y_train)[0] ** 2,
        
        'kfold_test_R2': r2_score(y_test, y_pred_test_k),
        'kfold_test_r': pearsonr(y_pred_test_k, y_test)[0],
        'kfold_test_r2': pearsonr(y_pred_test_k, y_test)[0] ** 2,
        
        'logo_total_n': len(x_all),
        'logo_train_n': len(x_train),
        'logo_test_n': len(x_test),    
        
        'logo_best_reg_param': list(logo_ridge_reg.best_params_.values())[0],      
        'logo_mean_of_val_R2s' : logo_ridge_reg.best_score_,
        'logo_val_R2' : r2_score(y_train_g, logo_val_predictions),
        'logo_val_r' : pearsonr(logo_val_predictions, y_train_g)[0],
        'logo_val_r2' : pearsonr(logo_val_predictions, y_train_g)[0] ** 2,
        
        'logo_train_R2': r2_score(y_train_g, logo_train_pred),
        'logo_train_r': pearsonr(logo_train_pred, y_train_g)[0],
        'logo_train_r2': pearsonr(logo_train_pred, y_train_g)[0] ** 2,
        
        'logo_test_R2': r2_score(y_test_g, logo_test_pred),
        'logo_test_r': pearsonr(logo_test_pred, y_test_g)[0],
        'logo_test_r2': pearsonr(logo_test_pred, y_test_g)[0] ** 2,
    }
    df = pd.DataFrame(data=d)
    return df

In [7]:
%%time     
##### With progress bar
workers = os.cpu_count()
if __name__ == "__main__":
    output = []
    for result in p_tqdm.p_map(model_2_sensors, paramlist):
        output.append(result)
    results = pd.concat(output).reset_index(drop=True)
    today = date.today().strftime("%Y-%m-%d")
    file_name = f'2_sensor_anomaly_results_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")           
    results.to_csv(here("data","results", file_name))

  0%|          | 0/1892 [00:00<?, ?it/s]

Saving results as: 2_sensor_anomaly_results_2022-11-15.csv


CPU times: user 18.8 s, sys: 2.57 s, total: 21.4 s
Wall time: 9h 24min 32s


In [8]:
results

Unnamed: 0,country,satellite_1,bands_1,num_features_1,points_1,month_range_1,limit_months_1,crop_mask_1,weighted_avg_1,satellite_2,...,logo_mean_of_val_R2s,logo_val_R2,logo_val_r,logo_val_r2,logo_train_R2,logo_train_r,logo_train_r2,logo_test_R2,logo_test_r,logo_test_r2
0,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,-0.287676,0.262484,0.539384,0.290935,0.930499,0.967624,0.936297,-0.050588,0.310775,0.096581
1,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,-0.528276,-0.129925,-0.492002,0.242066,0.043886,0.437188,0.191133,-0.038661,-0.008884,0.000079
2,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,True,False,True,sentinel-2-l2a,...,-0.484686,-0.158398,-0.515209,0.265441,0.036615,0.563672,0.317726,-1.020978,0.213480,0.045574
3,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,-0.408665,0.160481,0.446408,0.199280,0.873003,0.939166,0.882032,-0.262414,0.175847,0.030922
4,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,-0.499447,0.092069,0.331974,0.110207,0.753265,0.879979,0.774363,-0.268475,0.079334,0.006294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,ZMB,sentinel-2-l2a,2-3-4,1000,4,1-12,False,False,True,landsat-c2-l2,...,-0.010845,0.397930,0.639995,0.409594,0.809821,0.906941,0.822541,-1.212373,0.211317,0.044655
1888,ZMB,sentinel-2-l2a,2-3-4,1000,15,1-12,False,False,False,landsat-c2-l2,...,-0.497999,-0.171332,-0.551250,0.303876,0.007024,0.543661,0.295568,-1.033038,0.062578,0.003916
1889,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,4-9,True,True,False,sentinel-2-l2a,...,-0.408318,0.130289,0.374908,0.140556,0.720236,0.859134,0.738111,-1.212826,0.097841,0.009573
1890,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,20,1-12,False,True,False,landsat-c2-l2,...,-0.574162,-0.112908,-0.366819,0.134556,0.165278,0.626016,0.391896,-0.654291,0.028956,0.000838
