# Modeling Crop Yield
## Python modules

In [7]:
import warnings
import time
import os
import glob

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

from pyhere import here

import math
import seaborn as sns

from pyhere import here

In [8]:
# Satellite collection to model. The value is embedded in the feature file
# names and selects `year_start` in the configuration cell further down.
satellite = "landsat-c2-l2"
# satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

In [9]:
# Band-combination tag embedded in the feature file names; keep exactly one
# assignment active.
# bands = "2-3-4"
# bands = "2-3-4-8"
# bands = "1-2-3-4-5-6-7"
bands = "r-g-b-nir-swir16-swir22"
# bands = "2-3-4-5-6-7-8-11-12"

In [10]:
# Point-count tag used in the "{points}k-points" part of the feature and
# cropland-weight file names.
# points = 15
points = 20

In [11]:
# True  -> staged imputation with (year, district) and then district group means.
# False -> column-mean imputation with scikit-learn's SimpleImputer.
impute_manual = True
# impute_manual = False

In [12]:
# NOTE(review): in this notebook the flag is only referenced by the
# commented-out month filter further down, so it currently has no effect.
include_2013 = True
# include_2013 = False

In [13]:
# ISO-style country code used in the boundary, feature and weight file names.
country_code = "ZMB"

# Number of random features per point; part of the feature file names.
# num_features = 1000
num_features = 1024

# First year of imagery depends on the satellite collection selected above.
if satellite == "landsat-c2-l2":
    year_start = 2008
elif satellite == "landsat-8-c2-l2":
    year_start = 2013  # Landsat 8 only
else:
    year_start = 2015  # Sentinel

year_end = 2021
year_end_crops = 2021

# Alternative absolute location, kept for reference (it was previously assigned
# and then immediately overwritten by the project-relative path below).
# data_dir = "/capstone/cropmosaiks/data"
data_dir = here("data")

feature_file_name = (f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features')
# ZMB_cropland_percentage_20k-points
weight_file_name = (f'{country_code}_cropland_percentage_{points}k-points')

# Marker size for point maps, keyed by the point-count tag. `points` is set as
# an int above, so it is normalised to str before the lookup; comparing the int
# directly against "4"/"15"/"24" never matched and always yielded the default.
marker_sz = {"4": 60, "15": 15, "24": 10}.get(str(points), 8)

In [14]:
# Load the level-2 administrative boundaries, record the sorted district names,
# then key the polygons by district so other tables can be joined on it.
country_shp = geopandas.read_file(
    f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp'
)
country_districts = country_shp['district'].sort_values().unique().tolist()
country_shp = country_shp.set_index('district')
country_shp.shape

(72, 1)

In [15]:
# District-level maize yields, limited to the years that are modeled.
crop_df_full = pd.read_csv(
    f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv'
)
crop_df_full = crop_df_full.loc[crop_df_full['year'] <= year_end_crops]
crop_districts = crop_df_full['district'].sort_values().unique().tolist()

# Keep only the modeling columns; `ln` is the number of district rows in 2016.
crop_df = crop_df_full[['district', 'year', 'yield_mt']]
ln = len(crop_df.loc[crop_df['year'] == 2016, 'district'])
crop_df = crop_df.set_index('district')
ln

74

In [16]:
# Districts that have crop-yield records but no polygon in the boundary file.
list(set(crop_districts) - set(country_districts))

['Mafinga', 'Ikelenge']

In [17]:
# Districts that have a boundary polygon but no crop-yield records.
list(set(country_districts) - set(crop_districts))

[]

In [18]:
# Both frames are indexed by district, so the join attaches each district's
# boundary columns to its yearly yield rows.
country_crop = geopandas.GeoDataFrame(
    crop_df.join(country_shp),
    crs=country_shp.crs,
)

In [19]:
# Per-point cropland weights; later cells read its lon, lat and crop_perc columns.
weights = pd.read_feather(f"{data_dir}/land_cover/{weight_file_name}.feather")
# weights

In [21]:
# Geo-enabled copy of the weights with one point per (lon, lat) in EPSG:4326.
weights_gdf = geopandas.GeoDataFrame(
    weights,
    crs='EPSG:4326',
    geometry=geopandas.points_from_xy(weights['lon'], weights['lat']),
)
# weights_gdf.plot(figsize = (10,10),
#                  cmap = 'inferno',
#                  markersize = marker_sz,
#                  alpha = .9,
#                  column = 'crop_perc')
# plt.axis('off')

In [22]:
# Treat a missing cropland percentage as zero cropland.
weights['crop_perc'] = weights['crop_perc'].fillna(0)
# #weights.crop_perc = weights.crop_perc + 0.0001

In [23]:
def get_merged_files(flist, **kwargs):
    """Read several feather files and stack them row-wise into one DataFrame.

    Parameters
    ----------
    flist : iterable of path-like
        Feather files to read, in the order they should be stacked.
    **kwargs
        Passed unchanged to ``pandas.read_feather`` for every file.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``flist`` is empty, e.g. because a glob pattern matched nothing.
    """
    paths = list(flist)
    if not paths:
        # pd.concat would fail with the opaque "No objects to concatenate";
        # point at the likely cause instead.
        raise ValueError(
            "No feather files to merge: the file list is empty "
            "(check the glob pattern and data directory)."
        )
    frames = [pd.read_feather(path, **kwargs) for path in paths]
    return pd.concat(frames, axis=0, ignore_index=True)

In [24]:
# Glob pattern matching the feature files for this configuration (the trailing
# "*" stands for the part of the name that differs between files).
file = f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features_*'
path = f"{here('data', 'random_features', satellite)}/{file}"
files = glob.glob(path)
files

['/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/landsat-c2-l2/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_2011.feather',
 '/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/landsat-c2-l2/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_2019.feather',
 '/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/landsat-c2-l2/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_2018.feather',
 '/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/landsat-c2-l2/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_2014.feather',
 '/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/landsat-c2-l2/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_2021.feather',
 '/home/computevmuser/crop-modeling/code/4_model_crop_y

In [25]:
# Read every matched feather file and stack them row-wise into one frame.
features = get_merged_files(files)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1018,1019,1020,1021,1022,1023,lon,lat,year,month
0,0.586715,0.056000,0.113268,2.971165,0.00013,1.397094,2.787793,0.023934,0.0,0.033516,...,0.086621,0.127456,0.029246,0.582002,3.334538,0.119787,23.584878,-16.674232,2011,1
1,0.017292,0.001049,0.001213,0.429904,0.00000,0.052236,0.566839,0.000000,0.0,0.000429,...,0.225703,0.238376,0.146510,0.822375,1.824303,0.305837,23.684878,-16.724232,2011,1
2,0.294023,0.000000,0.096162,2.964857,0.00000,1.353411,3.139675,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,1.066237,3.549354,0.001892,23.594878,-16.664232,2011,1
3,0.291976,0.000599,0.037567,3.419004,0.00000,1.459579,1.512373,0.000000,0.0,0.000000,...,0.038818,0.000000,0.000000,0.590984,3.005162,0.007689,24.794878,-15.614232,2011,1
4,0.560434,0.130134,0.057544,3.514282,0.00000,1.527490,1.755638,0.000000,0.0,0.004293,...,0.065718,0.000032,0.001308,0.519383,3.078929,0.034964,24.784878,-15.604232,2011,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216804,0.252617,0.000000,0.039377,3.405659,0.00000,1.580517,2.890502,0.000000,0.0,0.000000,...,0.000044,0.000000,0.000000,0.961802,3.520149,0.000243,27.844878,-16.784232,2009,12
2216805,0.267551,0.000000,0.085289,3.082278,0.00000,1.454876,3.068731,0.000000,0.0,0.000000,...,0.004166,0.000000,0.000000,0.961204,3.530401,0.005412,27.854878,-16.784232,2009,12
2216806,0.305801,0.002425,0.037220,1.813636,0.00000,0.748918,1.145494,0.011064,0.0,0.000000,...,0.356887,0.233187,0.140352,0.494821,2.309845,0.291549,27.864878,-16.784232,2009,12
2216807,0.396878,0.001825,0.041189,2.672703,0.00000,1.232169,1.515177,0.006089,0.0,0.000000,...,0.250736,0.119917,0.059930,0.381136,2.697180,0.130419,27.864878,-16.794232,2009,12


In [26]:
# Order rows chronologically and renumber them 0..n-1.
features = features.sort_values(['year', 'month'], ignore_index=True)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1018,1019,1020,1021,1022,1023,lon,lat,year,month
0,0.442860,0.000000,0.089895,3.363738,0.000000,1.685055,5.395831,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,1.468006,4.457739,0.000000,22.144878,-16.384232,2008,10
1,0.002744,0.000000,0.000097,0.111959,0.000443,0.005558,0.539194,0.000000,0.0,0.000000,...,0.113944,0.218043,0.071209,1.102861,1.829382,0.353255,22.124878,-16.384232,2008,10
2,0.452845,0.000000,0.163848,3.274206,0.000000,1.650905,5.638133,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,1.385134,4.584863,0.000000,22.134878,-16.384232,2008,10
3,0.000008,0.000000,0.004044,0.217342,0.000000,0.003674,0.988060,0.000000,0.0,0.000000,...,0.000000,0.185636,0.000151,1.120443,2.079663,0.153255,22.134878,-16.394232,2008,10
4,0.018467,0.000242,0.002535,0.535540,0.001643,0.048299,1.314301,0.000000,0.0,0.000000,...,0.000050,0.022284,0.000497,1.056919,2.251067,0.062361,22.104878,-16.324232,2008,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2216804,0.210573,0.000000,0.084464,3.095834,0.000000,1.585276,5.248778,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,1.520998,4.340216,0.000000,27.844878,-16.784232,2021,12
2216805,0.324603,0.000000,0.095184,3.185496,0.000000,1.652028,5.330277,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,1.428862,4.383515,0.000000,27.854878,-16.784232,2021,12
2216806,0.633870,0.016938,0.104171,3.081136,0.000000,1.618830,4.660454,0.005108,0.0,0.006480,...,0.029489,0.065852,0.003629,0.842736,4.073915,0.045046,27.864878,-16.784232,2021,12
2216807,0.704258,0.007378,0.122458,3.533937,0.000000,1.862260,5.338318,0.003826,0.0,0.003131,...,0.019022,0.042529,0.004982,0.963174,4.422058,0.024029,27.864878,-16.794232,2021,12


In [27]:
# if include_2013 & (satellite == "landsat-8-c2-l2") & (month_range == [4, 5, 6, 7, 8, 9]):
#     month_start = 4
# else:
#     month_start = 10
    
# keep = np.where(
#     ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start),
#     True, False
# )
# features = features[keep]

In [29]:
# carry months October, November, and December over to the following year's data
# these months represent the start of the growing season for the following year's maize yield
features['year'] = np.where(
    features['month'].isin([10, 11, 12]),
    features['year'] + 1, 
    features['year']
)

features = features[features['year'] <= year_end]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['year'] = np.where(


In [31]:
# Pivot months into columns: one row per (lon, lat, year) and one column per
# (feature, month) pair, flattened to "<feature>_<month>" names.
features = features.set_index(['lon', 'lat', 'year', 'month']).unstack('month')
features.columns = [f'{feature}_{month}' for feature, month in features.columns]

In [33]:
# Treat infinite feature values as missing, then move lon/lat/year out of the
# index and back into ordinary columns.
features.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
features.reset_index(inplace=True)
# features
# features

In [34]:
# Attach the cropland weight of each point by matching on its coordinates, and
# discard the geometry column that comes along with the weights table.
features = (
    features
    .join(weights.set_index(['lon', 'lat']), on=['lon', 'lat'])
    .drop(columns=['geometry'])
)
# features
# features

In [35]:
# Turn the feature table into a GeoDataFrame with a point per (lon, lat).
features = geopandas.GeoDataFrame(
    features,
    crs='EPSG:4326',
    geometry=geopandas.points_from_xy(features['lon'], features['lat']),
)

In [36]:
# Tag every point with the district polygon it falls within; points outside
# all polygons are kept (left join) with missing district information.
features = geopandas.sjoin(features, country_shp, how='left', predicate='within')
# features

In [37]:
# The spatial join stores the matched district (the index of `country_shp`) in
# a new column. Older GeoPandas releases always call it "index_right"; GeoPandas
# 1.0+ keeps the index's own name, which is "district" here. Accept either so
# the cell works across versions.
district_col = 'index_right' if 'index_right' in features.columns else 'district'

# Drop points that fell outside every district, normalise the column name and
# renumber the rows.
features = (
    features
    .dropna(subset=[district_col])
    .rename(columns={district_col: "district"})
    .reset_index(drop=True)
)
# The point geometry is no longer needed once the district is known.
features = features.drop(['geometry'], axis=1)
features

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,1023_5,1023_6,1023_7,1023_8,1023_9,1023_10,1023_11,1023_12,crop_perc,district
0,22.074878,-14.864232,2009,,,0.068686,0.250473,0.337390,0.485332,0.647679,...,0.001152,0.000046,0.000000,0.643495,0.005516,,,,0.12790,Kalabo
1,22.074878,-14.864232,2010,,,,0.169845,,,,...,,,,,,0.003225,0.004282,0.125096,0.12790,Kalabo
2,22.074878,-14.864232,2011,,,0.111051,0.159344,0.185883,,,...,0.000520,,,0.028948,0.014299,,,,0.12790,Kalabo
3,22.074878,-14.864232,2012,,,,0.095310,0.181125,0.366679,,...,0.001354,0.000000,,,,0.009329,,,0.12790,Kalabo
4,22.074878,-14.864232,2013,0.046617,,0.080899,0.098845,0.150621,0.399008,0.484192,...,0.000199,0.000000,0.018012,0.039917,0.011897,,,,0.12790,Kalabo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254769,33.524878,-10.324232,2017,,,,,0.408296,0.417698,0.389805,...,0.000000,0.000000,0.000000,0.000000,0.000273,0.000000,0.000000,,0.30577,Isoka
254770,33.524878,-10.324232,2018,,0.204087,0.071166,,0.413954,0.382255,0.342236,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000260,0.000000,0.000085,0.30577,Isoka
254771,33.524878,-10.324232,2019,,0.106647,0.096535,,,0.336427,0.314573,...,,0.000000,0.000000,0.001463,0.000000,0.000000,0.000000,,0.30577,Isoka
254772,33.524878,-10.324232,2020,,,,0.145304,0.416148,,0.365570,...,0.000081,,0.000000,0.000000,0.000087,0.001722,0.000000,0.000000,0.30577,Isoka


In [38]:
class bcolors:
    """ANSI escape sequences used to colorize printed status text."""
    BL = '\033[1;34m'  # bold blue
    GR = '\033[1;36m'  # bold cyan (SGR code 36; green would be 32)
    YL = '\033[1;33m'  # bold yellow
    RD = '\033[1;31m'  # bold red
    RESET = '\033[0m'  # reset all attributes

In [39]:
%%time
if impute_manual:
    num_cells = len(features) * 12 * num_features
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
          f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPre-Impute NaN cell %: {bcolors.RD}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 1: Filling NaN values by month, year, and district group average')
    features = (
        features
        .fillna(features
                .groupby(['year', 'district'], as_index=False)
                .transform('mean')
               )
    )
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by month and district across group average')
    features = (
        features
        .fillna(features
                .groupby(['district'], as_index=False)
                .transform('mean')
               )
    )
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features = features.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features)}{bcolors.RESET}')
else:
    features = features.set_index(['year', 'district'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features)
    features[:] = imputer.transform(features)
    features = features.reset_index()
features

Starting total row count: [1;34m254774[0m 
Pre-Impute NaN row count: [1;31m248124[0m 
Pre-Impute NaN row %: [1;31m97.39[0m 
Pre-Impute NaN cell %: [1;31m30.74[0m 

Step 1: Filling NaN values by month, year, and district group average
Post step 1 NaN row count: [1;33m223820[0m 
Post step 1 NaN row %: [1;33m87.85[0m 
Post step 1 NaN cell %: [1;33m20.56[0m 

Step 2: Filling NaN values by month and district across group average
Post step 2 NaN row count: [1;36m520[0m 
Post step 2 NaN row %: [1;36m0.20[0m 
Post step 2 NaN cell %: [1;36m0.02[0m 

Step 3: Drop remaining NaN values

Ending total row count: [1;34m254254[0m
CPU times: user 16min 8s, sys: 7min 45s, total: 23min 54s
Wall time: 23min 36s


Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,1023_5,1023_6,1023_7,1023_8,1023_9,1023_10,1023_11,1023_12,crop_perc,district
0,22.074878,-14.864232,2009,0.454439,0.502102,0.068686,0.250473,0.337390,0.485332,0.647679,...,0.001152,0.000046,0.000000,0.643495,0.005516,0.011729,0.005984,0.014699,0.12790,Kalabo
1,22.074878,-14.864232,2010,0.956612,0.368983,0.329758,0.169845,0.313996,0.287629,0.295129,...,0.023167,0.006424,0.007702,0.013589,0.013095,0.003225,0.004282,0.125096,0.12790,Kalabo
2,22.074878,-14.864232,2011,0.454439,0.424619,0.111051,0.159344,0.185883,0.287629,0.295129,...,0.000520,0.006424,0.007702,0.028948,0.014299,0.011729,0.005984,0.014699,0.12790,Kalabo
3,22.074878,-14.864232,2012,0.454439,0.502102,0.329758,0.095310,0.181125,0.366679,0.295129,...,0.001354,0.000000,0.007702,0.013589,0.013095,0.009329,0.018889,0.029986,0.12790,Kalabo
4,22.074878,-14.864232,2013,0.046617,0.304945,0.080899,0.098845,0.150621,0.399008,0.484192,...,0.000199,0.000000,0.018012,0.039917,0.011897,0.009112,0.005984,0.014699,0.12790,Kalabo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254769,33.524878,-10.324232,2017,0.260766,0.441030,0.551799,0.188113,0.408296,0.417698,0.389805,...,0.000000,0.000000,0.000000,0.000000,0.000273,0.000000,0.000000,0.040489,0.30577,Isoka
254770,33.524878,-10.324232,2018,0.260766,0.204087,0.071166,0.182352,0.413954,0.382255,0.342236,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000260,0.000000,0.000085,0.30577,Isoka
254771,33.524878,-10.324232,2019,0.260766,0.106647,0.096535,0.333516,0.550334,0.336427,0.314573,...,0.009803,0.000000,0.000000,0.001463,0.000000,0.000000,0.000000,0.040489,0.30577,Isoka
254772,33.524878,-10.324232,2020,0.260766,0.441030,0.789669,0.145304,0.416148,0.366756,0.365570,...,0.000081,0.003932,0.000000,0.000000,0.000087,0.001722,0.000000,0.000000,0.30577,Isoka


In [40]:
# Re-display the feature table after imputation.
features

Unnamed: 0,lon,lat,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,...,1023_5,1023_6,1023_7,1023_8,1023_9,1023_10,1023_11,1023_12,crop_perc,district
0,22.074878,-14.864232,2009,0.454439,0.502102,0.068686,0.250473,0.337390,0.485332,0.647679,...,0.001152,0.000046,0.000000,0.643495,0.005516,0.011729,0.005984,0.014699,0.12790,Kalabo
1,22.074878,-14.864232,2010,0.956612,0.368983,0.329758,0.169845,0.313996,0.287629,0.295129,...,0.023167,0.006424,0.007702,0.013589,0.013095,0.003225,0.004282,0.125096,0.12790,Kalabo
2,22.074878,-14.864232,2011,0.454439,0.424619,0.111051,0.159344,0.185883,0.287629,0.295129,...,0.000520,0.006424,0.007702,0.028948,0.014299,0.011729,0.005984,0.014699,0.12790,Kalabo
3,22.074878,-14.864232,2012,0.454439,0.502102,0.329758,0.095310,0.181125,0.366679,0.295129,...,0.001354,0.000000,0.007702,0.013589,0.013095,0.009329,0.018889,0.029986,0.12790,Kalabo
4,22.074878,-14.864232,2013,0.046617,0.304945,0.080899,0.098845,0.150621,0.399008,0.484192,...,0.000199,0.000000,0.018012,0.039917,0.011897,0.009112,0.005984,0.014699,0.12790,Kalabo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254769,33.524878,-10.324232,2017,0.260766,0.441030,0.551799,0.188113,0.408296,0.417698,0.389805,...,0.000000,0.000000,0.000000,0.000000,0.000273,0.000000,0.000000,0.040489,0.30577,Isoka
254770,33.524878,-10.324232,2018,0.260766,0.204087,0.071166,0.182352,0.413954,0.382255,0.342236,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000260,0.000000,0.000085,0.30577,Isoka
254771,33.524878,-10.324232,2019,0.260766,0.106647,0.096535,0.333516,0.550334,0.336427,0.314573,...,0.009803,0.000000,0.000000,0.001463,0.000000,0.000000,0.000000,0.040489,0.30577,Isoka
254772,33.524878,-10.324232,2020,0.260766,0.441030,0.789669,0.145304,0.416148,0.366756,0.365570,...,0.000081,0.003932,0.000000,0.000000,0.000087,0.001722,0.000000,0.000000,0.30577,Isoka


In [41]:
# Output path: the glob pattern without its trailing "*", followed by the
# first and last year present in the final feature table.
full_file = here(
    'data', 'random_features', 'full_files',
    file[:-1] + f'yr-{min(features.year)}-{max(features.year)}.feather',
)
full_file

PosixPath('/home/computevmuser/crop-modeling/code/4_model_crop_yield/../../data/random_features/full_files/landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021.feather')

In [42]:
# Persist the final feature table. The index is renumbered first because rows
# may have been dropped during imputation, leaving gaps in the row labels.
features.reset_index(drop=True).to_feather(full_file)

In [None]:
# features.filter(regex=fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district', axis=1)