# Modeling Crop Yield
## Python modules

In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date

import numpy as np
import pandas as pd
import geopandas
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

In [2]:
# Reference tables shared by the rest of the notebook.

# District boundaries, indexed by the 'district' column.
country_shp = geopandas.read_file(
    here('data', 'geo_boundaries', 'gadm36_ZMB_2.shp')
).set_index('district')

# Yield table indexed by (district, year); only the yield column is kept.
crop_df = (
    pd.read_csv(here('data', 'crop_yield', 'cfs_maize_districts_zambia_2009_2022.csv'))
    .set_index(['district', 'year'])[['yield_mt']]
)

# Cropland-percentage tables, one per point-grid size (4k, 15k, 20k).
weights_4_fn = 'ZMB_cropland_percentage_4k-points.feather'
weights_15_fn = 'ZMB_cropland_percentage_15k-points.feather'
weights_20_fn = 'ZMB_cropland_percentage_20k-points.feather'

weights_4 = pd.read_feather(here("data", "land_cover", weights_4_fn))
weights_15 = pd.read_feather(here("data", "land_cover", weights_15_fn))
weights_20 = pd.read_feather(here("data", "land_cover", weights_20_fn))

# Coordinates are rounded to 5 decimals; the feature tables are rounded the
# same way before being joined on (lon, lat).
weights_4['lon'], weights_4['lat'] = weights_4['lon'].round(5), weights_4['lat'].round(5)
weights_15['lon'], weights_15['lat'] = weights_15['lon'].round(5), weights_15['lat'].round(5)
weights_20['lon'], weights_20['lat'] = weights_20['lon'].round(5), weights_20['lat'].round(5)

In [3]:
def get_merged_files(flist, **kwargs):
    """Read every feather file in ``flist`` and stack them into one frame.

    Extra keyword arguments are forwarded to ``pd.read_feather``. The frames
    are concatenated row-wise and the result gets a fresh 0..n-1 index.
    """
    frames = []
    for path in flist:
        frames.append(pd.read_feather(path, **kwargs))
    stacked = pd.concat(frames, axis=0)
    return stacked.reset_index(drop=True)

def merge_tuple(x, bases = (tuple, list)):
    """Lazily flatten arbitrarily nested sequences into one stream of items.

    Parameters
    ----------
    x : iterable
        The (possibly nested) iterable to flatten.
    bases : iterable of types
        Container types that are descended into. Anything that is not an
        instance of one of these (strings, numbers, ...) is yielded as is.

    Yields
    ------
    object
        The leaf items of ``x`` in depth-first, left-to-right order.

    Notes
    -----
    Membership is tested with ``isinstance``, so subclasses of the listed
    types (for example namedtuples) are flattened as well.
    """
    # isinstance needs a tuple of types; callers may pass any iterable.
    container_types = tuple(bases)
    for item in x:
        if isinstance(item, container_types):
            yield from merge_tuple(item, container_types)
        else:
            yield item

In [4]:
# Catalogue the random-feature files on disk: one row per distinct
# (satellite, bands, country, points, features) combination, plus the glob
# pattern that matches every file of that combination.
satellites = ["sentinel-2-l2a","landsat-8-c2-l2","landsat-c2-l2"]
file_group_columns = ['satellite', 'bands', 'country_code', 'points', 'num_features', 'pattern']

# Collect plain dicts and build the frame once; concatenating inside the loop
# re-copies the growing frame on every iteration.
file_group_records = []
for satellite in satellites:

    directory = here("data", "random_features", satellite)
    files = sorted(
        name for name in os.listdir(directory)
        if name not in ('.gitkeep', '.ipynb_checkpoints')
    )

    for file in files:
        # File names are '_'-separated:
        # <satellite>_bands-<bands>_<country>_<N>k-points_<M>-features_<rest>
        f = file.split(sep="_")
        file_group_records.append({
            'satellite'    : f[0],
            'bands'        : f[1].replace("bands-", ""),
            'country_code' : f[2],
            'points'       : int(f[3].replace("k-points", "")),
            'num_features' : f[4].replace("-features", ""),
            'pattern'      : '_'.join(f[:5]) + '_*',
        })

# Explicit columns keep the sort below working even when no files were found.
file_groups = (
    pd.DataFrame(file_group_records, columns=file_group_columns)
    .sort_values(by=['points'], ascending=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
file_groups

Unnamed: 0,satellite,bands,country_code,points,num_features,pattern
0,sentinel-2-l2a,2-3-4,ZMB,4,1000,sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-...
1,landsat-8-c2-l2,1-2-3-4-5-6-7,ZMB,15,1000,landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-po...
2,sentinel-2-l2a,2-3-4-8,ZMB,15,1000,sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_10...
3,sentinel-2-l2a,2-3-4,ZMB,15,1000,sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000...
4,landsat-c2-l2,r-g-b-nir-swir16-swir22,ZMB,20,1024,landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZM...
5,landsat-8-c2-l2,1-2-3-4-5-6-7,ZMB,20,1000,landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-po...
6,sentinel-2-l2a,2-3-4,ZMB,20,1000,sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000...


In [5]:
# Narrow the catalogue to the Sentinel-2, 15k-point, bands 2-3-4-8 group.
file_groups = file_groups.loc[
    lambda groups: (
        (groups.satellite == "sentinel-2-l2a")
        & (groups.points == 15)
        & (groups.bands == '2-3-4-8')
    )
]
file_groups

Unnamed: 0,satellite,bands,country_code,points,num_features,pattern
2,sentinel-2-l2a,2-3-4-8,ZMB,15,1000,sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_10...


In [6]:
# Cross every file pattern with every (limit_months, crop_mask) combination,
# flatten each entry to (pattern, limit_months, crop_mask), then keep the first
# entry with limit_months off and crop_mask on.
names = 'limit_months crop_mask'.split()
paramlist = [
    tuple(merge_tuple(combo))
    for combo in itertools.product(
        file_groups.pattern.to_list(),
        itertools.product([False, True], repeat=len(names)),
    )
]
paramlist = [t for t in paramlist if not t[1] and t[2]][0]
paramlist

('sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*', False, True)

In [7]:
# First element of the selected parameter tuple: the file glob pattern.
paramlist[0]

'sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*'

In [8]:
# Inline, step-by-step run of the loading stage for the selected parameter
# tuple (pattern, limit_months, crop_mask).
file, limit_months, crop_mask = paramlist[0], paramlist[1], paramlist[2]

# The pattern is '_'-separated; satellite, point count and feature count are
# parsed out of it.
f = file.split(sep="_")
satellite = f[0]
points = int(f[3].replace("k-points", ""))
num_features = int(f[4].replace("-features", ""))

path = str(here("data", "random_features", satellite, file))
files = glob.glob(pathname=path)

print('Opening')

features = get_merged_files(files)

# Latest calendar year present, taken before the year relabelling below.
year_end = max(features.year)

# First year used for each satellite; anything unlisted starts in 2015.
year_start = {"landsat-c2-l2": 2008, "landsat-8-c2-l2": 2013}.get(satellite, 2015)

if limit_months:
    month_range = range(4, 10)
else:
    month_range = range(1, 13)

month_start = 4 if (satellite == "landsat-8-c2-l2" and limit_months) else 10

# Keep rows from month_start of year_start onwards.
keep = np.asarray(
    (features.year > year_start)
    | ((features.year == year_start) & (features.month >= month_start))
)

features = features[keep]

# Months 10-12 are relabelled with the following year.
features['year'] = features['year'].where(
    ~features['month'].isin([10, 11, 12]),
    features['year'] + 1,
)
# Rows relabelled past the last observed year are dropped.
features = features[features.year <= year_end]

features['lon'], features['lat'] = features['lon'].round(5), features['lat'].round(5)

features = features[features.month.isin(month_range)]

# Pivot months into columns; the flattened names are '<feature>_<month>'.
features = features.set_index(['lon','lat', "year", 'month']).unstack()
features.columns = ['{}_{}'.format(*labels) for labels in features.columns]

Opening


In [11]:
# Scratch copy for inspection, with the (lon, lat, year) index moved back into columns.
a = features.copy().reset_index()

In [12]:
# Show the rows labelled 2016.
a[a.year == 2016]

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,21.98176,-14.01693,2016,,0.0,,,,0.0,0.0,...,,,,3.532144,3.849683,4.034249,4.161150,3.833862,,3.656577
7,21.98287,-15.95433,2016,,0.0,,,,0.0,,...,,,,5.227335,,5.875107,5.971619,6.791008,7.082443,6.455033
14,21.98372,-13.92686,2016,,0.0,,,,0.0,0.0,...,,,,4.575454,5.457288,5.407687,5.276629,5.851517,,5.352674
21,21.98510,-15.86427,2016,,0.0,,,,0.0,,...,,,,5.169197,,5.884258,5.573695,5.966973,6.100280,5.397137
28,21.98566,-13.83679,2016,,0.0,,,,0.0,0.0,...,,,,4.001415,4.761360,4.753737,4.706692,5.137783,,5.313816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105371,33.54503,-10.56450,2016,,,,,,0.0,0.0,...,,,,2.937199,3.020481,,3.550606,,,
105378,33.54695,-10.65435,2016,,,,,,0.0,0.0,...,,,,3.038828,3.151504,,3.970561,,,
105385,33.58948,-10.51863,2016,,,,,,0.0,0.0,...,,,,2.940908,3.235936,,3.876681,,,
105392,33.59140,-10.60847,2016,,,,,,0.0,0.0,...,,,,2.827517,2.792906,,3.642956,,,


In [13]:
# Attach the cropland percentage and the district name to every grid point.

# Infinite feature values are treated as missing; the index goes back to columns.
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.reset_index()

# Use the weights table that matches the point-grid size of the feature file.
if points == 4:
    weights = weights_4.copy()
elif points == 15:
    weights = weights_15.copy()
elif points == 20:
    weights = weights_20.copy()

features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])

# With the crop mask on, only points with a positive crop percentage stay.
if crop_mask:
    features = features[features.crop_perc > 0]

features = geopandas.GeoDataFrame(
    features,
    geometry = geopandas.points_from_xy(x = features.lon, y = features.lat),
    crs='EPSG:4326'
)

# Point-in-polygon join; points that fall in no district are dropped.
features = features.sjoin(country_shp, how = 'left', predicate = 'within')
features = features.drop(columns = ['geometry'])
features = features.rename(columns = {"index_right": "district"})
features = features.dropna(subset=['district']).reset_index(drop = True)

In [17]:
# Number of distinct districts with rows labelled 2016 at this point.
len(features[features.year == 2016].district.unique())

72

In [18]:
print('Imputing')

# Bookkeeping counts; none of these values is read again in the visible code
# (ln_ft / ln_na are overwritten below). Note that each features.dropna()
# builds a full filtered copy just to take its length.
num_cells = len(features) * len(month_range) * int(num_features)
ln_ft = len(features); ln_na = len(features.dropna())
# First pass: fill missing values with the mean of the same (year, district) group.
features.fillna(features.groupby(['year', 'district'], as_index=False).transform('mean'), inplace=True)

ln_ft = len(features); ln_na = len(features.dropna())
# Second pass: whatever is still missing is filled with the district mean over all years.
features.fillna(features.groupby(['district'], as_index=False).transform('mean'), inplace=True)

Imputing


In [19]:
# Number of distinct districts with rows labelled 2016 after the two fill passes.
len(features[features.year == 2016].district.unique())

72

In [23]:
# Display the feature table after the two fill passes.
features

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,crop_perc,district
0,22.04991,-15.09977,2016,0.000001,0.0,5.893386e-07,0.000000e+00,0.0,0.0,0.0,...,4.358994,4.089959,4.271688,4.246182,4.490125,5.122961,4.164947,3.906469,0.000152,Kalabo
1,22.04991,-15.09977,2017,0.000001,0.0,5.893386e-07,0.000000e+00,0.0,0.0,0.0,...,3.364898,3.231562,3.249797,3.097034,3.413488,5.010098,4.399866,4.079709,0.000152,Kalabo
2,22.04991,-15.09977,2018,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,2.912843,3.176933,3.221959,3.212218,3.395274,3.536010,4.040872,3.866383,0.000152,Kalabo
3,22.04991,-15.09977,2019,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,4.398332,4.801321,4.528609,4.880805,4.875570,3.211890,3.884175,4.118165,0.000152,Kalabo
4,22.04991,-15.09977,2020,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,3.304991,3.381151,3.405476,3.506648,3.967765,5.478981,4.227735,3.813649,0.000152,Kalabo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45012,33.63583,-10.56260,2018,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,2.696334,2.691303,2.941995,3.205833,3.413213,3.438436,3.784972,3.546447,0.001067,Isoka
45013,33.63583,-10.56260,2019,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,2.588395,2.720627,3.015472,3.048735,3.852941,3.595639,3.977738,3.187459,0.001067,Isoka
45014,33.63583,-10.56260,2020,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,2.852466,3.071886,3.032688,3.325195,3.146947,3.649997,4.993696,3.639261,0.001067,Isoka
45015,33.63583,-10.56260,2021,0.000000,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.0,...,2.524389,2.627626,3.038066,3.401843,3.490526,3.658384,3.551093,3.706571,0.001067,Isoka


In [26]:
# Row counts before dropping, with and without missing values (not read again
# in the visible code).
ln_ft = len(features); ln_na = len(features.dropna())
# Drop every row that still has a missing value after both fill passes.
features = features.dropna(axis=0)

In [27]:
# Number of distinct districts with rows labelled 2016 once incomplete rows are
# dropped (70 in the recorded run, down from 72 before the drop).
len(features[features.year == 2016].district.unique())

70

In [30]:
def impute_features(params):
    """Build, impute and save the full monthly feature table for one file group.

    Parameters
    ----------
    params : sequence
        ``(file, limit_months, crop_mask)``. ``file`` is a glob pattern of the
        form ``<satellite>_bands-<bands>_<country>_<N>k-points_<M>-features_*``.
        When ``limit_months`` is true only months 4-9 are kept, otherwise all
        twelve. When ``crop_mask`` is true only points with ``crop_perc > 0``
        are kept.

    Returns
    -------
    None
        The table is written as a feather file under
        ``data/random_features/full_files``.

    Raises
    ------
    ValueError
        If the point count parsed from ``file`` is not 4, 15 or 20, or if no
        file on disk matches the pattern.
    """
    file         = params[0]
    limit_months = params[1]
    crop_mask    = params[2]
    name_parts   = file.split(sep="_")
    satellite    = name_parts[0]
    points       = int(name_parts[3].replace("k-points", ""))

    # Resolve the cropland weights up front so an unsupported grid size fails
    # with a clear message before the slow load.
    if points == 4:
        weights = weights_4
    elif points == 15:
        weights = weights_15
    elif points == 20:
        weights = weights_20
    else:
        raise ValueError(f'No cropland weights available for {points}k points: {file}')

    path = str(here("data", "random_features", satellite, file))
    files = glob.glob(pathname=path)
    if not files:
        raise ValueError(f'No feature files match {path}')

    print('Opening')

    features = get_merged_files(files)

    # Latest calendar year present, taken before the year relabelling below.
    year_end = max(features.year)

    if satellite == "landsat-c2-l2":
        year_start = 2008
    elif satellite == "landsat-8-c2-l2":
        year_start = 2013
    else:
        year_start = 2015

    month_range = range(4, 10) if limit_months else range(1, 13)

    if satellite == "landsat-8-c2-l2" and limit_months:
        month_start = 4
    else:
        month_start = 10

    # Start-of-record filter; it must be evaluated on calendar years, i.e.
    # before the relabelling just below.
    in_window = (
        ((features.year == year_start) & (features.month >= month_start))
        | (features.year > year_start)
    )

    # Months 10-12 are relabelled with the following year (presumably so that a
    # year label covers one growing season -- confirm). The freshly merged
    # frame is edited in place, which avoids chained-assignment warnings.
    features['year'] = np.where(
        features['month'].isin([10, 11, 12]),
        features['year'] + 1,
        features['year']
    )
    features['lon'] = features['lon'].round(5)
    features['lat'] = features['lat'].round(5)

    # All row filters are per-row, so they are applied in one pass: inside the
    # start window, not relabelled past the last observed year, wanted month.
    keep = in_window & (features.year <= year_end) & features.month.isin(month_range)
    features = features[keep]

    # Pivot months into columns; the flattened names are '<feature>_<month>'.
    features = features.set_index(['lon','lat', "year", 'month']).unstack()
    features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

    # Infinite values are treated as missing.
    features.replace([np.inf, -np.inf], np.nan, inplace=True)
    features.reset_index(inplace = True)

    features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])

    if crop_mask:
        features = features[features.crop_perc > 0]

    features = geopandas.GeoDataFrame(
        features,
        geometry = geopandas.points_from_xy(x = features.lon, y = features.lat),
        crs='EPSG:4326'
    )

    # Point-in-polygon join; points that fall in no district are dropped.
    features = (
        features
        .sjoin(country_shp, how = 'left', predicate = 'within')
        .drop(['geometry'], axis = 1)
        .rename(columns = {"index_right": "district"})
        .dropna(subset=['district'])
        .reset_index(drop = True)
    )

    print('Imputing')

    # Pass 1: mean of the same (year, district) group.
    features.fillna(features.groupby(['year', 'district'], as_index=False).transform('mean'), inplace=True)
    # Pass 2: district mean over all years for whatever is still missing.
    features.fillna(features.groupby(['district'], as_index=False).transform('mean'), inplace=True)
    # Rows that still have gaps after both passes are dropped.
    features = features.dropna(axis=0)

    min_yr = min(features.year); max_yr = max(features.year)
    min_mn = min(month_range);   max_mn = max(month_range)

    # file[:-1] strips the trailing '*' of the glob pattern.
    out_name = f'{file[:-1]}yr-{min_yr}-{max_yr}_mn-{min_mn}-{max_mn}_lm-{limit_months}'+\
        f'_cm-{crop_mask}_full.feather'
    full_file = here('data', 'random_features', 'full_files', out_name)

    print('Saving')

    features.reset_index(drop=True).to_feather(full_file)

In [31]:
%%time
# Run the load -> impute -> save steps for the selected parameter tuple, timed by the cell magic.
impute_features(paramlist)

Opening
Imputing
Saving
CPU times: user 11min 34s, sys: 4min 14s, total: 15min 48s
Wall time: 13min 53s


## Load the "best" model

In [56]:
# Settings that identify the stored model: one-hot district encoding and
# weighted-average summary features are both on.
hot_encode, weighted_avg = True, True
file_suffix = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-True'
model_fn_suffix = '{}_wa-{}_he-{}'.format(file_suffix, weighted_avg, hot_encode)

# File names of the k-fold and leave-one-group-out models; only the k-fold
# model is loaded here.
k_model_fn = 'k-fold-cv_rr-model_' + model_fn_suffix + '.pkl'
logo_model_fn = 'logo-cv_rr-model_' + model_fn_suffix + '.pkl'

with open(here('models', k_model_fn), 'rb') as f:
    best_kfold_model = pickle.load(f)

## Make high resolution predictions

In [15]:
# Predict with the loaded k-fold model on every grid point of the full feature table.
high_res_f = f'{file_suffix}_full.feather'
high_res_fn = here('data', 'random_features', 'full_files', high_res_f)
high_res_features = pd.read_feather(high_res_fn)

# Identifier columns that are carried along but not fed to the model.
drop_cols = ['year', 'lon', 'lat', 'crop_perc', 'district']

if weighted_avg:
    # Rename the feature columns to their position ('0', '1', ...), leaving the
    # identifier columns untouched by parking them in the index meanwhile.
    high_res_features = high_res_features.set_index(drop_cols)
    high_res_features = high_res_features.rename(
        columns={name: position for position, name in enumerate(high_res_features.columns)}
    ).reset_index()
    high_res_features.columns = high_res_features.columns.astype(str)

if hot_encode:
    # 'district' becomes dummy columns, so it is no longer dropped by name.
    drop_cols.remove('district')
    high_res_features = pd.get_dummies(high_res_features, columns=["district"], drop_first=False)

high_res_predictions = high_res_features[drop_cols].copy()

high_res_x_all = high_res_features.drop(columns=drop_cols)
# Negative predictions are clipped to zero.
high_res_predictions['prediction'] = np.maximum(best_kfold_model.predict(high_res_x_all), 0)

high_res_f_pred = f'high-res-pred_k-fold-cv_{model_fn_suffix}.feather'
high_res_fn_pred = here('data', 'results', high_res_f_pred)
high_res_predictions.to_feather(str(high_res_fn_pred))

## Make summary predictions

In [85]:
# Score the district-level summary table with the stored k-fold model and add
# cross-validated predictions for the training split.
hot_encode, weighted_avg = True, True
file_suffix = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-True'
model_fn_suffix = f'{file_suffix}_wa-{weighted_avg}_he-{hot_encode}'

k_model_fn = f'k-fold-cv_rr-model_{model_fn_suffix}.pkl'
logo_model_fn = f'logo-cv_rr-model_{model_fn_suffix}.pkl'

with open(here('models', k_model_fn), 'rb') as f:
    best_kfold_model = pickle.load(f)

summary_f = f'{file_suffix}_wa-{weighted_avg}_summary.feather'
summary_fn = here('data', 'random_features', 'summary', summary_f)
summary_features = pd.read_feather(summary_fn)

# Identifier / target columns: kept in the output, never fed to the model.
drop_cols = ['district', 'year', 'yield_mt']
summary_predictions = summary_features[list(drop_cols)].copy()

if hot_encode:
    # 'district' becomes dummy columns, so it is no longer dropped by name.
    drop_cols.remove("district")
    summary_features = pd.get_dummies(summary_features, columns=["district"], drop_first=False)

summary_x_all = summary_features.drop(columns=drop_cols)
# The target is log10(yield + 1).
summary_y_all = np.log10(summary_features['yield_mt'].to_numpy() + 1)
summary_predictions['log_yield'] = summary_y_all
# Negative predictions are clipped to zero.
summary_predictions['prediction'] = np.maximum(best_kfold_model.predict(summary_x_all), 0)

x_train, x_test, y_train, y_test = train_test_split(
    summary_x_all,
    summary_y_all,
    test_size=0.2,
    random_state=0,
)

kfold = KFold()
# The right-hand side is evaluated first, so the model is cross-validated on
# x_train before the prediction column is added to it.
x_train['kfold_cv_predictions'] = np.maximum(
    cross_val_predict(best_kfold_model, X=x_train, y=y_train, cv=kfold),
    0,
)
x_train['split'] = 'train'
x_test['split'] = 'test'
# Test rows have no cross-validated prediction and get NaN from the concat.
train_test = pd.concat([x_train, x_test])[['split', 'kfold_cv_predictions']]
summary_predictions = summary_predictions.join(train_test)

summary_f_pred = f'summary-pred_k-fold-cv_{model_fn_suffix}.csv'
summary_fn_pred = here('data', 'results', summary_f_pred)
summary_predictions.to_csv(summary_fn_pred, index=False)

## Make high resolution predictions with two sensors

In [None]:
# High-resolution predictions from the two-sensor model: the full feature
# tables of both sensors are joined per grid point and year, then scored.

# Set the flags here so the cell does not rely on values left over from
# earlier cells; both are True there, and the model name below encodes he-True.
hot_encode = True
weighted_avg = True

file_suffix_1 = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-False'
file_suffix_2 = 'sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True'
model_fn_suffix = f'best-k-fold-2-sensor-params_he-{hot_encode}'

k_model_fn = f'k-fold-cv_rr-model_{model_fn_suffix}.pkl'
logo_model_fn = f'logo-cv_rr-model_{model_fn_suffix}.pkl'

# NOTE: pickle.load can execute arbitrary code; only load model files produced
# by this project.
with open(here('models', k_model_fn), 'rb') as f:
    best_kfold_model = pickle.load(f)

high_res_f_1 = f'{file_suffix_1}_full.feather'
high_res_fn_1 = here('data', 'random_features', 'full_files', high_res_f_1)
high_res_features_1 = pd.read_feather(high_res_fn_1)

high_res_f_2 = f'{file_suffix_2}_full.feather'
high_res_fn_2 = here('data', 'random_features', 'full_files', high_res_f_2)
high_res_features_2 = pd.read_feather(high_res_fn_2)

index_cols = ['district', 'year', 'crop_perc', 'lon', 'lat']

# Prefix the feature columns so the two sensors cannot collide after the join.
high_res_features_1 = high_res_features_1.set_index(index_cols).add_prefix("f1_")
high_res_features_2 = high_res_features_2.set_index(index_cols).add_prefix("f2_")

# Left join on the identifier columns, then keep only rows complete in both sensors.
high_res_features = high_res_features_1.join(high_res_features_2).reset_index()
high_res_features = high_res_features.dropna()

# Identifier columns that are carried along but not fed to the model.
drop_cols = ['year', 'lon', 'lat', 'crop_perc', 'district']

if weighted_avg:
    # Rename the feature columns to their position ('0', '1', ...).
    high_res_features = high_res_features.set_index(drop_cols)
    high_res_features.rename(columns={x:y for x,y in zip(high_res_features.columns,range(0,len(high_res_features.columns)))}, inplace=True)
    high_res_features = high_res_features.reset_index()
    high_res_features.columns = high_res_features.columns.astype(str)

if hot_encode:
    # 'district' becomes dummy columns, so it is no longer dropped by name.
    drop_cols.remove('district')
    high_res_features = pd.get_dummies(high_res_features, columns=["district"], drop_first=False)

high_res_predictions = high_res_features.copy()[drop_cols]

high_res_x_all = high_res_features.drop(drop_cols, axis = 1)
# Clip negative predictions to zero, as the single-sensor high-resolution and
# summary prediction cells do.
high_res_predictions['prediction'] = np.maximum(best_kfold_model.predict(high_res_x_all), 0)
# Feather needs a default index; rows were dropped above, so rebuild it.
high_res_predictions.reset_index(drop=True, inplace=True)

high_res_f_pred = f'high-res-pred_k-fold-cv_{model_fn_suffix}.feather'
high_res_fn_pred = here('data', 'results', high_res_f_pred)
high_res_predictions.to_feather(str(high_res_fn_pred))