# Modeling Crop Yield
## Python modules

In [1]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

from pyhere import here

import math
import seaborn as sns

from pyhere import here

In [2]:
satellite = "landsat-c2-l2"
# satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

In [3]:
#bands = "2-3-4"
# bands = "2-3-4-8"
# bands = "1-2-3-4-5-6-7"
bands = "r-g-b-nir-swir16-swir22"
# bands = "2-3-4-5-6-7-8-11-12"

In [4]:
# points = 15
points = 20

In [5]:
# crop_mask = True
crop_mask = False

In [6]:
#weighted_avg = True
weighted_avg = False

In [7]:
# month_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# month_range = [      3, 4, 5, 6, 7, 8, 9            ]
month_range = [         4, 5, 6, 7, 8, 9            ]
# month_range = [            5, 6, 7, 8, 9            ]
# month_range = [         4, 5, 6, 7, 8               ]
# month_range = [            5, 6, 7, 8               ]

In [8]:
impute_manual = True
# impute_manual = False

In [9]:
include_2013 = True
# include_2013 = False

In [10]:
country_code = "ZMB"

# num_features = 1000
num_features = 1024

if satellite == "landsat-c2-l2":
    year_start = 2008
elif satellite == "landsat-8-c2-l2":
    year_start = 2013 # Landsat 8 only
else:
    year_start = 2015 # Sentinel
    
year_end = 2018
year_end_crops = 2018

data_dir = "/capstone/cropmosaiks/data"  
data_dir = here("data")  

feature_file_name = (f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features')
# ZMB_cropland_percentage_20k-points
weight_file_name = (f'{country_code}_cropland_percentage_{points}k-points')

if points == "4":
    marker_sz = 60
elif points == "15":
    marker_sz = 15
elif points == "24":
    marker_sz = 10
else:
    marker_sz = 8

In [4]:
country_shp = geopandas.read_file(f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp')
country_shp = country_shp.rename(columns = {'NAME_2': 'district'})[['district', 'geometry']]
country_shp.district = country_shp.district.replace("MPongwe", 'Mpongwe', regex=True)
country_districts = country_shp.district.sort_values().unique().tolist()
country_shp = country_shp.set_index('district')
country_shp.shape
# country_shp.plot(figsize = (12,10), linewidth = 1, edgecolor = 'black' )
# country_shp.plot()

In [12]:
# crop_df_full = pd.read_csv(data_dir+'/crops/cfs_maize_districts_zambia_2009_2018.csv')
crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
crop_df_full = crop_df_full[crop_df_full.year <= year_end_crops]
# crop_df_full.district = crop_df_full.district.replace(
#     {"Itezhi-tezhi": 'Itezhi-Tezhi',
#      "Kapiri-Mposhi": 'Kapiri Mposhi',
#      "Kapiri mposhi": 'Kapiri Mposhi',
#      "Shang'ombo": 'Shangombo',
#      "Chienge": 'Chiengi'
#     }, regex=True)
crop_districts = crop_df_full.district.sort_values().unique().tolist()
crop_df = crop_df_full[['district', 'year', 'yield_mt']]
ln = len(crop_df[crop_df.year == 2016].district)
crop_df = crop_df.set_index('district')
ln
# crop_df_full
# crop_df

74

In [13]:
list(set(crop_districts) - set(country_districts))

['Mafinga', 'Ikelenge']

In [14]:
list(set(country_districts) - set(crop_districts))

[]

In [15]:
country_crop = geopandas.GeoDataFrame(crop_df.join(country_shp), crs = country_shp.crs)

In [16]:
weights = pd.read_feather(f"{data_dir}/land_cover/{weight_file_name}.feather")
# weights

In [3]:
weights_gdf = geopandas.GeoDataFrame(
    weights, 
    geometry = geopandas.points_from_xy(x = weights.lon, y = weights.lat), 
    crs='EPSG:4326'
)
weights_gdf.plot(figsize = (10,10),
                 cmap = 'inferno',
                 markersize = marker_sz,
                 alpha = .9,
                 column = 'crop_perc')
# plt.axis('off')

In [18]:
weights.crop_perc = weights.crop_perc.fillna(0)
# #weights.crop_perc = weights.crop_perc + 0.0001

In [None]:
def get_merged_files(flist, **kwargs):
    return pd.concat([pd.read_feather(f, **kwargs) for f in flist], axis=0)

# satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"
satellite = "landsat-c2-l2"

country_code = 'ZMB' # 'TZA' # 'NGA'

# points = 15
points = 20

num_features = 1024

# bands_short = "2-3-4"
# bands_short = "2-3-4-8"
# bands_short = "1-2-3-4-5-6-7"
bands_short = "r-g-b-nir-swir16-swir22"
# bands_short = "2-3-4-5-6-7-8-11-12"

files = (f'{satellite}_bands-{bands_short}_{country_code}_{points}'+
             f'k-points_{num_features}-features_*')
files

path = str(here('data', 'random_features', satellite))+'\\*.feather'
files = glob.glob(pathname=path)
features = get_merged_files(files)
features = features.sort_values(by = ['year', 'month']).reset_index()

In [1]:
# features = geopandas.GeoDataFrame()

# for yr in range(year_start, year_end + 1):
#     print(f"Opening: {feature_file_name}_{yr}.feather")
#     features_x = pd.read_feather(f"{data_dir}/random_features/{satellite}/{feature_file_name}_{yr}.feather")

#     # concatenate the feather files together, axis = 0 specifies to stack rows (rather than adding columns)
#     features = pd.concat([features, features_x], axis=0)
    
#     print("feature.shape", features.shape)
#     print("Appending:", yr)
#     print("")

In [20]:
if include_2013 & (satellite == "landsat-8-c2-l2") & (month_range == [4, 5, 6, 7, 8, 9]):
    month_start = 4
else:
    month_start = 10
    
keep = np.where(
    ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start),
    True, False
)
features = features[keep]

10


In [21]:
# carry months October, November, and December over to the following year's data
# these months represent the start of the growing season for the following year's maize yield
features['year'] = np.where(
    features['month'].isin([10, 11, 12]),
    features['year'] + 1, 
    features['year'])

features = features[features['year'] <= year_end]

features.sort_values(['year', 'month'], inplace=True)

In [22]:
# subset the features to only the month range selected at the top of the notebook
features = features[features.month.isin(month_range)]

In [23]:
features = features.set_index(['lon','lat', "year", 'month']).unstack()
features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

In [24]:
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.reset_index()
# features

In [25]:
features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])
features = features.drop(["geometry"], axis = 1)
# features

In [26]:
# any 1 km^2 cell with a crop percentage > 0 will be retained
# the mask will not be applied if crop_mask is set to False at the top of this notebook
if crop_mask:
    features = features[features.crop_perc > 0]
else:
    pass
# features

In [27]:
features = geopandas.GeoDataFrame(
    features, 
    geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
    crs='EPSG:4326'
)

In [28]:
# Drop the redundant independent lon and lat columns because now that they are in a separate geometry column, there is no need for these columns
features = features.drop(['lon', 'lat'], axis = 1)

In [5]:
features = features.sjoin(country_shp, how = 'left', predicate = 'within')
# features

In [6]:
# Drop NA's from the district column (called index_right) then rename the column index_right to district
features = (
    features
    # drop NA values in the `index_right` column, then rename that column to `district` for clarity
    .dropna(subset=['index_right'])
    .rename(columns = {"index_right": "district",})
    .reset_index(drop = True)
)
# make a copy of the features, assigned to an object so we can plot the points as they are in this state
# later, after imputation, we also copy the entire features df and save that as an object in case we want to plot or work with them in that stage 
points = features.copy()
# save the geometries as an object to join them later to the rows of crop yield predictions
points = features[['geometry']]
# save the years as an object to join them later to the rows
year = features[['year']]
# drop geometry column for 20/21 features
features = features.drop(['geometry'], axis = 1)
# features

In [32]:
# compute the number of cells in the features dataframe, based on the amount of rows (images), months, and feature columns
num_cells = len(features) * len(month_range) * num_features

In [33]:
class bcolors:
    BL = '\x1b[1;34m' #GREEN
    GR = '\x1b[1;36m' #GREEN
    YL = '\x1b[1;33m' #YELLOW
    RD = '\x1b[1;31m' #RED
    RESET = '\033[0m' #RESET COLOR

In [34]:
%%time
if impute_manual:
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
          f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPre-Impute NaN cell %: {bcolors.RD}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 1: Filling NaN values by month, year, and district group average')
    features = (
        features
        .fillna(features
                .groupby(['year', 'district'], as_index=False)
                .transform('mean')
               )
    )
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by month and district across group average')
    features = (
        features
        .fillna(features
                .groupby(['district'], as_index=False)
                .transform('mean')
               )
    )
    ln_ft = len(features)
    ln_na = len(features.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features = features.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features)}{bcolors.RESET}')
else:
    features = features.set_index(['year', 'district'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features)
    features[:] = imputer.transform(features)
    features = features.reset_index()

Starting total row count: [1;34m195980[0m 
Pre-Impute NaN row count: [1;31m84692[0m 
Pre-Impute NaN row %: [1;31m43.21[0m 
Pre-Impute NaN cell %: [1;31m15.23[0m 

Step 1: Filling NaN values by month, year, and district group average
Post step 1 NaN row count: [1;33m55632[0m 
Post step 1 NaN row %: [1;33m28.39[0m 
Post step 1 NaN cell %: [1;33m8.55[0m 

Step 2: Filling NaN values by month and district across group average
Post step 2 NaN row count: [1;36m0[0m 
Post step 2 NaN row %: [1;36m0.00[0m 
Post step 2 NaN cell %: [1;36m0.00[0m 

Step 3: Drop remaining NaN values

Ending total row count: [1;34m195980[0m
CPU times: total: 9min 54s
Wall time: 20min 6s
