# Modeling Crop Yield
## Python modules

In [1]:
import warnings
import time
import os
import glob

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

from pyhere import here

import math
import seaborn as sns

from pyhere import here

In [2]:
# Imagery collection the random features were computed from. It also decides
# the first usable year (see year_start below).
satellite = "landsat-c2-l2"
# satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

# Band combination, spelled exactly as it appears in the feature file names.
# bands = "2-3-4"
# bands = "2-3-4-8"
# bands = "1-2-3-4-5-6-7"
bands = "r-g-b-nir-swir16-swir22"
# bands = "2-3-4-5-6-7-8-11-12"

# Calendar months kept when the monthly features are filtered.
# month_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
month_range = [4, 5, 6, 7, 8, 9]

# Point-count tag used in file names as "<points>k-points".
# points = 15
points = 20

impute_manual = True
# impute_manual = False

# Only has an effect for "landsat-8-c2-l2" with the April-September month range.
include_2013 = True
# include_2013 = False

# Observations must be strictly below these limits to be kept.
cloud_limit = 20
na_limit = 25

country_code = "ZMB"

# Number of random features, as tagged in the feature file names.
# num_features = 1000
num_features = 1024

# First year of imagery per collection; unlisted collections start in 2015.
year_start = {
    "landsat-c2-l2": 2008,
    "landsat-8-c2-l2": 2013,
}.get(satellite, 2015)

year_end = 2021
year_end_crops = 2021

data_dir = here("data")

feature_file_name = f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features'
weight_file_name = f'{country_code}_cropland_percentage_{points}k-points'

In [3]:
# Read the boundary shapefile for the selected country.
shp_path = f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp'
country_shp = geopandas.read_file(shp_path)

# Sorted, de-duplicated district names, captured before 'district' becomes the index.
country_districts = country_shp['district'].sort_values().unique().tolist()
country_shp = country_shp.set_index('district')
# country_shp.shape

In [4]:
# District-level maize yields, truncated at the last crop year of interest.
crop_csv = f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv'
crop_df_full = pd.read_csv(crop_csv)
crop_df_full = crop_df_full.loc[crop_df_full['year'] <= year_end_crops]

# Sorted, de-duplicated district names present in the yield data.
crop_districts = crop_df_full['district'].sort_values().unique().tolist()
# ln = len(crop_df[crop_df.year == 2016].district)

# Keep only the modelling columns and index by district so the frame can be
# joined with the district-indexed boundaries.
crop_df = crop_df_full[['district', 'year', 'yield_mt']].set_index('district')
# ln

In [5]:
# Districts that have yield records but no matching boundary polygon.
list(set(crop_districts).difference(country_districts))

['Mafinga', 'Ikelenge']

In [6]:
# Districts that have a boundary polygon but no yield records.
list(set(country_districts).difference(crop_districts))

[]

In [7]:
# Both frames are indexed by district, so the join attaches the boundary
# attributes (including geometry) to every yield record.
crop_with_shapes = crop_df.join(country_shp)
country_crop = geopandas.GeoDataFrame(crop_with_shapes, crs=country_shp.crs)

In [8]:
# Cropland weights per sample point.
weights = pd.read_feather(f"{data_dir}/land_cover/{weight_file_name}.feather")

# Round coordinates to 5 decimals -- presumably so the later lon/lat join
# against the feature table matches exactly (TODO confirm).
weights['lon'] = weights['lon'].round(5)
weights['lat'] = weights['lat'].round(5)
# weights

In [9]:
# Point geometry for every weight row, built from its lon/lat pair (WGS84).
weight_points = geopandas.points_from_xy(weights.lon, weights.lat)
weights_gdf = geopandas.GeoDataFrame(weights, geometry=weight_points, crs='EPSG:4326')

In [10]:
def get_merged_files(flist, **kwargs):
    """Read several feather files and stack them into a single DataFrame.

    Parameters
    ----------
    flist : iterable of path-like
        Feather files to read, in the order they should be stacked.
    **kwargs
        Forwarded unchanged to ``pandas.read_feather`` for every file.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``flist`` is empty, e.g. because a glob pattern matched nothing.
    """
    paths = list(flist)
    if not paths:
        # pd.concat would raise "No objects to concatenate" here, which says
        # nothing about the real cause (no input files were found).
        raise ValueError(
            "No feather files to merge: the file list is empty "
            "(check the glob pattern and the data directory)."
        )
    frames = [pd.read_feather(path, **kwargs) for path in paths]
    return pd.concat(frames, axis=0, ignore_index=True)

In [11]:
# Glob pattern matching every feature file written for this configuration.
file = f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features_*'
feature_dir = str(here('data', 'random_features', satellite))
path = f'{feature_dir}/{file}'
files = glob.glob(path)
# files

In [None]:
# Stack all matching feature files into one table.
features = get_merged_files(files)
# Rescale na_percent by 100 -- presumably stored as a fraction and compared
# against the percentage-valued na_limit later (TODO confirm).
features['na_percent'] = features['na_percent'] * 100

In [None]:
# Chronological order with a fresh 0..n-1 index.
features = features.sort_values(['year', 'month'], ignore_index=True)
# features

In [None]:
# The first year normally contributes only October onwards; the one exception
# is Landsat 8 with the April-September range when 2013 is explicitly included.
starts_in_april = (
    include_2013
    and satellite == "landsat-8-c2-l2"
    and month_range == [4, 5, 6, 7, 8, 9]
)
month_start = 4 if starts_in_april else 10

# Keep the tail of the first year plus every later year.
in_first_year = (features.year == year_start) & (features.month >= month_start)
keep = (in_first_year | (features.year > year_start)).to_numpy()
features = features[keep]

In [None]:
# October-December observations are counted towards the following year.
rolls_into_next_year = features['month'].isin([10, 11, 12])
features['year'] = np.where(
    rolls_into_next_year,
    features['year'] + 1,
    features['year'],
)

# Discard anything that now falls beyond the last year of interest.
within_range = features['year'] <= year_end
features = features[within_range]
# features

TODO: Add logic to calculate summary statistics here, and carry the cloud cover values over into the summary.

In [None]:
# Base name of the metadata file for the current configuration.
feat_name = (f'{satellite}_bands-{bands}_{country_code}_{points}'+
             f'k-points_{num_features}-features_meta')
# Same path as before, now built from feat_name instead of slicing the glob pattern.
meta_file = str(here('data', 'random_features', 'feature_meta_data', f'{feat_name}.feather'))

# Per-observation metadata, saved before these columns are dropped from `features`.
meta_df = features[['year',  'month', 'crop_perc', 'stac_id', 'platform', 'cloud_cover', 'na_percent']]

# `features` has been row-filtered, so its index is no longer a plain 0..n-1
# range. Older pandas releases refuse to write such a frame to feather, so the
# index is reset for the write, as is done for the final feature file.
meta_df.reset_index(drop=True).to_feather(meta_file)

In [None]:
# meta_summary = pd\
#     .get_dummies(
#         features[['year',  'month', 'crop_perc', 'platform', 'cloud_cover', 'na_percent']],
#         columns=["platform"], 
#         drop_first=False,
#         prefix = "", prefix_sep = "")\
#     .groupby(
#         ['year',  'month'], as_index=False)\
#     .agg(
#         {
#             'crop_perc':'mean', 
#             'cloud_cover': 'mean',
#             'na_percent': 'mean',
#             'landsat-5': 'sum',
#             'landsat-7': 'sum',
#             'landsat-8': 'sum',
#             # 'landsat-9': 'sum'
#         }
# )
# meta_summary

In [None]:
# Keep only observations in the selected months that are strictly below the
# cloud-cover and missing-data limits.
passes_filters = (
    features.month.isin(month_range)
    & (features.cloud_cover < cloud_limit)
    & (features.na_percent < na_limit)
)
features = features[passes_filters]

# The metadata columns are not model inputs, so they are removed here.
features = features.drop(
    columns=['crop_perc', 'stac_id', 'cloud_cover', 'na_percent', 'platform']
)
# features

In [None]:
# Pivot months into columns: one row per (lon, lat, year), one column per
# (feature, month) pair.
features = features.set_index(['lon', 'lat', 'year', 'month']).unstack(level='month')
# Flatten the two-level column labels into "<feature>_<month>".
features.columns = [f'{feature}_{month}' for feature, month in features.columns]
# features

In [None]:
# Treat infinities as missing values, then turn lon/lat/year back into columns.
features = features.replace([np.inf, -np.inf], np.nan).reset_index()
# features

In [None]:
# Attach the per-point weight columns by matching on the (lon, lat) pair.
features = features.join(weights.set_index(['lon', 'lat']), on=['lon', 'lat'])

# `weights` only carries a 'geometry' column if building `weights_gdf` from it
# added the column in place, which depends on the pandas/GeoPandas versions and
# on copy-on-write. Drop it when present and carry on when it is absent,
# instead of failing with a KeyError.
features = features.drop(columns=['geometry'], errors='ignore')
# features

In [None]:
# Give every feature row a point geometry (WGS84) so it can be spatially
# joined to the district polygons.
feature_points = geopandas.points_from_xy(features.lon, features.lat)
features = geopandas.GeoDataFrame(features, geometry=feature_points, crs='EPSG:4326')

In [None]:
# Left spatial join: every point is kept, and points lying within a district
# polygon pick up that polygon's attributes.
features = geopandas.sjoin(features, country_shp, how='left', predicate='within')
# features

In [None]:
# The spatial join stores the matched polygon's index label (the district name)
# in a column called "index_right" on older GeoPandas releases; newer releases
# keep the right-hand index's own name, which is "district" here.
# NOTE(review): confirm against the GeoPandas version pinned for this project.
joined_index_col = "index_right" if "index_right" in features.columns else "district"

features = (
    features
    # Points that fell outside every district polygon have no label.
    .dropna(subset=[joined_index_col])
    .rename(columns={joined_index_col: "district"})
    .reset_index(drop=True)
)
# The point geometry was only needed for the spatial join.
features = features.drop(['geometry'], axis=1)
# features

In [None]:
class bcolors:
    """ANSI escape sequences used to colour the progress messages."""

    BL = '\033[1;34m'     # bold blue
    GR = '\033[1;36m'     # bold cyan (SGR 36; plain green would be 32)
    YL = '\033[1;33m'     # bold yellow
    RD = '\033[1;31m'     # bold red
    RESET = '\033[0m'     # back to the terminal's default style

In [None]:
%%time
# Denominator for the "NaN cell %" figures: rows x months x random features,
# taken from the row count before any imputation.
num_cells = len(features) * len(month_range) * num_features

# ---- Baseline: how much is missing before imputation ----
ln_ft = len(features)
ln_na = len(features.dropna())
na_rows = ln_ft - ln_na
na_row_pct = (na_rows / ln_ft)*100
na_cell_pct = (features.isna().sum().sum() / num_cells)*100
print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
      f'\nPre-Impute NaN row count: {bcolors.RD}{na_rows}{bcolors.RESET}',
      f'\nPre-Impute NaN row %: {bcolors.RD}{na_row_pct:.02f}{bcolors.RESET}',
      f'\nPre-Impute NaN cell %: {bcolors.RD}{na_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 1: Filling NaN values by month, year, and district group average')

# ---- Step 1: fill gaps with the mean of the same district in the same year ----
# Every column is month-specific, so each gap is filled from its own month.
year_district_means = (
    features
    .groupby(['year', 'district'], as_index=False)
    .transform('mean')
)
features = features.fillna(year_district_means)

ln_ft = len(features)
ln_na = len(features.dropna())
na_rows = ln_ft - ln_na
na_row_pct = (na_rows / ln_ft)*100
na_cell_pct = (features.isna().sum().sum() / num_cells)*100
print(f'Post step 1 NaN row count: {bcolors.YL}{na_rows}{bcolors.RESET}',
      f'\nPost step 1 NaN row %: {bcolors.YL}{na_row_pct:.02f}{bcolors.RESET}',
      f'\nPost step 1 NaN cell %: {bcolors.YL}{na_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 2: Filling NaN values by month and district across group average')

# ---- Step 2: fall back to the district mean across all years ----
district_means = (
    features
    .groupby(['district'], as_index=False)
    .transform('mean')
)
features = features.fillna(district_means)

ln_ft = len(features)
ln_na = len(features.dropna())
na_rows = ln_ft - ln_na
na_row_pct = (na_rows / ln_ft)*100
na_cell_pct = (features.isna().sum().sum() / num_cells)*100
print(f'Post step 2 NaN row count: {bcolors.GR}{na_rows}{bcolors.RESET}',
      f'\nPost step 2 NaN row %: {bcolors.GR}{na_row_pct:.02f}{bcolors.RESET}',
      f'\nPost step 2 NaN cell %: {bcolors.GR}{na_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 3: Drop remaining NaN values\n')

# ---- Step 3: drop whatever could not be filled ----
features = features.dropna()
print(f'Ending total row count: {bcolors.BL}{len(features)}{bcolors.RESET}')

In [None]:
# Display the feature table as it stands after imputation and NaN removal.
features

In [None]:
# Build the output file name from the glob pattern (minus its trailing "*")
# plus the year span, month span and quality limits actually applied.
year_span = f'yr-{min(features.year)}-{max(features.year)}'
month_span = f'mn-{min(month_range)}-{max(month_range)}'
limits = f'cloud-limit-{cloud_limit}_na-limit-{na_limit}'
full_file = here(
    'data',
    'random_features',
    'full_files',
    f'{file[:-1]}{year_span}_{month_span}_{limits}.feather',
)
full_file

In [None]:
# Write the final feature table to `full_file`. Rows were dropped above, so the
# index may have gaps; it is replaced by a fresh 0..n-1 range for the write.
features.reset_index(drop=True).to_feather(full_file)

In [None]:
# features.filter(regex=fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district', axis=1)