# Modeling Crop Yield
## Python modules

In [6]:
import warnings
import time
import os
import glob

import dask
from dask.distributed import Client
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import itertools

import geopandas

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

from pyhere import here

import math
import seaborn as sns

from pyhere import here

In [7]:
data_dir = here("data")  
satellites = ["sentinel-2-l2a","landsat-8-c2-l2","landsat-c2-l2"]

year_end = 2021

weights_4_fn = 'ZMB_cropland_percentage_4k-points.feather'
weights_15_fn = 'ZMB_cropland_percentage_15k-points.feather'
weights_20_fn = 'ZMB_cropland_percentage_20k-points.feather'


In [8]:
country_shp = geopandas.read_file(f'{data_dir}/geo_boundaries/gadm36_ZMB_2.shp')
country_shp = country_shp.set_index('district')

crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
crop_df_full = crop_df_full[['district', 'year', 'yield_mt']]
crop_df_full = crop_df_full.set_index(['district', 'year'])

weights_4 = pd.read_feather(here("data", "land_cover", weights_4_fn))
weights_15 = pd.read_feather(here("data", "land_cover", weights_15_fn))
weights_20 = pd.read_feather(here("data", "land_cover", weights_20_fn))
                             
                             
weights_4.lon, weights_4.lat = round(weights_4.lon, 5), round(weights_4.lat, 5)
weights_15.lon, weights_15.lat = round(weights_15.lon, 5), round(weights_15.lat, 5)
weights_20.lon, weights_20.lat = round(weights_20.lon, 5), round(weights_20.lat, 5)

In [9]:
def get_merged_files(flist, **kwargs):
    return pd.concat([pd.read_feather(f, **kwargs) for f in flist], axis=0).reset_index(drop=True)

def merge(x, bases = (tuple, list)):
    for e in x:
        if type(e) in bases:
            for e in merge(e, bases):
                yield e
        else:
            yield e


def split_fn(file):
    f            = file.split(sep="_")
    satellite    = f[0]
    points       = int(f[3].replace("k-points", ""))
    num_features = int(f[4].replace("-features", ""))
    return satellite, points, num_features


def start_year(satellite):
    if satellite == "landsat-c2-l2":
        year_start = 2008
    elif satellite == "landsat-8-c2-l2":
        year_start = 2013 
    else:
        year_start = 2015 
    return year_start


class bcolors:
    BL = '\x1b[1;34m' # BLUE
    GR = '\x1b[1;36m' # GREEN
    YL = '\x1b[1;33m' # YELLOW
    RD = '\x1b[1;31m' # RED
    RESET = '\033[0m' # RESET COLOR

In [10]:
file_groups = pd.DataFrame()
for satellite in satellites:
    
    directory = here("data", "random_features", satellite)
    files = os.listdir(directory)
    files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]
    files.sort()
    
    for file in files:
        f = file.split(sep="_")
        d = {
            'satellite'    : f[0],
            'bands'        : f[1].replace("bands-", ""),
            'country_code' : f[2],
            'points'       : f[3].replace("k-points", ""),
            'num_features' : f[4].replace("-features", ""),
            'pattern'      : f[0]+'_'+f[1]+'_'+f[2]+'_'+f[3]+'_'+f[4]+'_*'
            # 'yrs'          : f[5].replace("yr-", "").repla|ce(".feather", "")
        }
        df = pd.DataFrame(data=d, index=[0])
        file_groups = pd.concat([file_groups, df])
        
file_groups = file_groups.sort_values(by=['points'], ascending=True)
file_groups = file_groups.drop_duplicates().reset_index(drop=True)
file_groups

Unnamed: 0,satellite,bands,country_code,points,num_features,pattern
0,sentinel-2-l2a,2-3-4-8,ZMB,15,1000,sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_10...
1,landsat-8-c2-l2,1-2-3-4-5-6-7,ZMB,15,1000,landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-po...
2,sentinel-2-l2a,2-3-4,ZMB,15,1000,sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000...
3,landsat-8-c2-l2,1-2-3-4-5-6-7,ZMB,20,1000,landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-po...
4,landsat-c2-l2,r-g-b-nir-swir16-swir22,ZMB,20,1024,landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZM...
5,sentinel-2-l2a,2-3-4,ZMB,20,1000,sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000...
6,sentinel-2-l2a,2-3-4,ZMB,4,1000,sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-...


In [11]:
# file_groups = file_groups[file_groups.index == 0]
# file_groups

In [12]:
names = 'limit_months crop_mask weighted_avg hot_encode'.split()
paramlist = list(itertools.product([False,True],repeat=len(names)))
paramlist

[(False, False, False, False),
 (False, False, False, True),
 (False, False, True, False),
 (False, False, True, True),
 (False, True, False, False),
 (False, True, False, True),
 (False, True, True, False),
 (False, True, True, True),
 (True, False, False, False),
 (True, False, False, True),
 (True, False, True, False),
 (True, False, True, True),
 (True, True, False, False),
 (True, True, False, True),
 (True, True, True, False),
 (True, True, True, True)]

In [13]:
names = 'limit_months crop_mask weighted_avg'.split()
paramlist = list(itertools.product([False,True],repeat=len(names)))
paramlist

[(False, False, False),
 (False, False, True),
 (False, True, False),
 (False, True, True),
 (True, False, False),
 (True, False, True),
 (True, True, False),
 (True, True, True)]

In [14]:
names = 'limit_months crop_mask weighted_avg'.split()
paramlist = list(itertools.product([False,True],repeat=len(names)))
def summarize_features(params):
    limit_months = params[0]
    crop_mask = params[1]
    weighted_avg = params[2]

    lm_color = bcolors.GR if limit_months else bcolors.RD
    cm_color = bcolors.GR if crop_mask    else bcolors.RD
    wa_color = bcolors.GR if weighted_avg else bcolors.RD
    print(
f"""
limit_months: {lm_color}{limit_months}{bcolors.RESET}
crop_mask: {cm_color}{crop_mask}{bcolors.RESET}
weighted_avg: {wa_color}{weighted_avg}{bcolors.RESET}
""")

    features = features_og.copy()

    month_range = range(4, 10) if limit_months else range(1, 13)

    if (satellite == "landsat-8-c2-l2") & (limit_months):
        month_start = 4
    else:
        month_start = 10

    keep = np.where(
        ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start), True, False)

    features = features[keep]

    features['year'] = np.where(
        features['month'].isin([10, 11, 12]),
        features['year'] + 1, 
        features['year']
    )
    features = features[features.year <= year_end]

    features.lon, features.lat = round(features.lon, 5), round(features.lat, 5)

    features = features[features.month.isin(month_range)]

    features = features.set_index(['lon','lat', "year", 'month']).unstack()
    features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

    features.replace([np.inf, -np.inf], np.nan, inplace=True)
    features.reset_index(inplace = True)

    if points == 4:
        weights = weights_4.copy()
    elif points == 15:
        weights = weights_15.copy()
    elif points == 20:
        weights = weights_20.copy()

    features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])

    if crop_mask:
        features = features[features.crop_perc > 0]
    else:
        pass   

    features = geopandas.GeoDataFrame(
        features, 
        geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
        crs='EPSG:4326'
    )

    features = (
        features
        .sjoin(country_shp, how = 'left', predicate = 'within')
        .drop(['geometry', 'lon', 'lat'], axis = 1)
        .rename(columns = {"index_right": "district"})
        .dropna(subset=['district'])
        .reset_index(drop = True)
    )

    num_cells = len(features) * len(month_range) * int(num_features)
    ln_ft = len(features)
    ln_na = len(features.dropna())

    features.fillna(features.groupby(['year', 'district'], as_index=False).transform('mean'), inplace=True)

    ln_ft = len(features)
    ln_na = len(features.dropna())

    features.fillna(features.groupby(['district'], as_index=False).transform('mean'), inplace=True)

    ln_ft = len(features)
    ln_na = len(features.dropna())
 
    features = features.dropna(axis=0)

    if weighted_avg:
        feature_cols = features.columns[1:-2].values.tolist()
        features_summary = (
            features
            .groupby(['year', 'district'], as_index=False)
            .apply(lambda x: pd.Series([sum(x[feature] * x.crop_perc) / sum(x.crop_perc) for feature in feature_cols]))
        )
    else:
        features_summary = features.groupby(['district',"year"], as_index = False).mean()  

    features_summary = features_summary.set_index(["district", "year"]).join(other = crop_df_full).reset_index()

    features_summary.columns = features_summary.columns.astype(str)

    features_summary = features_summary[~features_summary.isna().any(axis = 1)]

    min_yr = min(features_summary.year); max_yr = max(features_summary.year)
    min_mn = min(month_range); max_mn = max(month_range)

    f = f'{fn[:-1]}yr-{min_yr}-{max_yr}_mn-{min_mn}-{max_mn}_lm-{limit_months}'+\
        f'_cm-{crop_mask}_wa-{weighted_avg}_summary.feather'

    full_file = here('data', 'random_features', 'summary_2', f)

    features_summary.reset_index(drop=True).to_feather(full_file)


In [16]:
%%time
for i in file_groups.index:
    satellite    = file_groups.satellite[i]
    bands        = file_groups.bands[i]
    country      = file_groups.country_code[i]
    points       = int(file_groups.points[i])
    num_features = file_groups.num_features[i]

    fn = f'{satellite}_bands-{bands}_{country}_{points}k-points_{num_features}-features_*'
    path = str(here("data", "random_features", satellite, fn))
    files = glob.glob(pathname=path)

    print(f"\n\nMerging files matching pattern:\n\t{fn}")

    features_og = get_merged_files(files)
    features_og = features_og.sort_values(by = ['year', 'month']).reset_index(drop=True)

    year_end = max(features_og.year)

    if satellite == "landsat-c2-l2":
        year_start = 2008
    elif satellite == "landsat-8-c2-l2":
        year_start = 2013 
    else:
        year_start = 2015 
    if __name__ == "__main__":
        with multiprocessing.Pool(processes=8) as pool:
            pool.map(summarize_features, paramlist)
    



Merging files matching pattern:
	sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*

limit_months: [1;36mTrue[0m
crop_mask: [1;36mTrue[0m
weighted_avg: [1;31mFalse[0m

limit_months: [1;31mFalse[0m
crop_mask: [1;31mFalse[0m
weighted_avg: [1;31mFalse[0m

limit_months: [1;31mFalse[0m
crop_mask: [1;36mTrue[0m
weighted_avg: [1;31mFalse[0m

limit_months: [1;31mFalse[0m
crop_mask: [1;31mFalse[0m
weighted_avg: [1;36mTrue[0m

limit_months: [1;36mTrue[0m
crop_mask: [1;31mFalse[0m
weighted_avg: [1;31mFalse[0m

limit_months: [1;31mFalse[0m
crop_mask: [1;36mTrue[0m
weighted_avg: [1;36mTrue[0m

limit_months: [1;36mTrue[0m
crop_mask: [1;31mFalse[0m
weighted_avg: [1;36mTrue[0m

limit_months: [1;36mTrue[0m
crop_mask: [1;36mTrue[0m
weighted_avg: [1;36mTrue[0m










Merging files matching pattern:
	landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_*

limit_months: [1;31mFalse[0m
crop_mask: [1;31mFalse[0m
weighted_avg: [1;3

In [6]:
# paramlist = list(itertools.product([False,True],repeat=len(names)))
# paramlist = list(itertools.product(file_groups.pattern.to_list(), paramlist))
# for i in range(len(paramlist)):
#     paramlist[i] = tuple(merge(paramlist[i]))
# len(paramlist)

112

In [10]:
# def summarize_features(params):
#     file = params[0]
#     limit_months = params[1]
#     crop_mask = params[2]
#     weighted_avg = params[3]
#     hot_encode = params[4]

#     print(
# f"""
# file pattern: {file}
# limit_months: {limit_months}
# crop_mask:    {crop_mask}
# weighted_avg: {weighted_avg}
# hot_encode:   {hot_encode}
# """)
    
#     satellite, points, num_features = split_fn(file)
#     path = str(here("data", "random_features", satellite, file))
#     files = glob.glob(pathname=path)
    
#     features = get_merged_files(files)

#     year_end = max(features.year)

#     year_start = start_year(satellite)
        
#     month_range = range(4, 10) if limit_months else range(1, 13)

#     if (satellite == "landsat-8-c2-l2") & (limit_months):
#         month_start = 4
#     else:
#         month_start = 10

#     keep = np.where(
#         ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start), True, False)

#     features = features[keep]

#     features['year'] = np.where(
#         features['month'].isin([10, 11, 12]),
#         features['year'] + 1, 
#         features['year']
#     )
#     features = features[features.year <= year_end]

#     features.lon, features.lat = round(features.lon, 5), round(features.lat, 5)

#     features = features[features.month.isin(month_range)]

#     features = features.set_index(['lon','lat', "year", 'month']).unstack()
#     features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

#     features.replace([np.inf, -np.inf], np.nan, inplace=True)
#     features.reset_index(inplace = True)

#     if points == 4:
#         weights = weights_4.copy()
#     elif points == 15:
#         weights = weights_15.copy()
#     elif points == 20:
#         weights = weights_20.copy()

#     features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])

#     if crop_mask:
#         features = features[features.crop_perc > 0]
#     else:
#         pass   

#     features = geopandas.GeoDataFrame(
#         features, 
#         geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
#         crs='EPSG:4326'
#     )

#     features = (
#         features
#         .sjoin(country_shp, how = 'left', predicate = 'within')
#         .drop(['geometry', 'lon', 'lat'], axis = 1)
#         .rename(columns = {"index_right": "district"})
#         .dropna(subset=['district'])
#         .reset_index(drop = True)
#     )

#     num_cells = len(features) * len(month_range) * int(num_features)
#     ln_ft = len(features)
#     ln_na = len(features.dropna())

#     features.fillna(features.groupby(['year', 'district'], as_index=False).transform('mean'), inplace=True)

#     ln_ft = len(features)
#     ln_na = len(features.dropna())

#     features.fillna(features.groupby(['district'], as_index=False).transform('mean'), inplace=True)

#     ln_ft = len(features)
#     ln_na = len(features.dropna())
 
#     features = features.dropna(axis=0)

#     if weighted_avg:
#         feature_cols = features.columns[1:-2].values.tolist()
#         features_summary = (
#             features
#             .groupby(['year', 'district'], as_index=False)
#             .apply(lambda x: pd.Series([sum(x[feature] * x.crop_perc) / sum(x.crop_perc) for feature in feature_cols]))
#         )
#     else:
#         features_summary = features.groupby(['district',"year"], as_index = False).mean()  

#     features_summary = features_summary.set_index(["district", "year"]).join(other = crop_df_full).reset_index()

#     features_summary.columns = features_summary.columns.astype(str)

#     features_summary = features_summary[~features_summary.isna().any(axis = 1)]


#     if hot_encode:
#         features_summary = pd.get_dummies(
#             features_summary, 
#             columns=["district"], 
#             drop_first=False
#         )
#     else:
#         pass


#     min_yr = min(features_summary.year); max_yr = max(features_summary.year)
#     min_mn = min(month_range); max_mn = max(month_range)

#     f = f'{fn[:-1]}yr-{min_yr}-{max_yr}_mn-{min_mn}-{max_mn}_lm-{limit_months}'+\
#         f'_cm-{crop_mask}_wa-{weighted_avg}_he-{hot_encode}_summary.feather'

#     full_file = here('data', 'random_features', 'summary_2', f)

#     features_summary.reset_index(drop=True).to_feather(full_file)
#     print('Done')


In [None]:
# if __name__ == "__main__":
#     with multiprocessing.Pool(processes=10) as pool:
#         pool.map(summarize_features, paramlist,chunksize=2)
#         # pool.imap(summarize_features, paramlist, chunksize=2)


file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: False
crop_mask:    False
weighted_avg: False
hot_encode:   False


file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: False
crop_mask:    False
weighted_avg: True
hot_encode:   False

file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: True
crop_mask:    True
weighted_avg: True
hot_encode:   False

file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: False
crop_mask:    True
weighted_avg: False
hot_encode:   False

file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: False
crop_mask:    True
weighted_avg: True
hot_encode:   False


file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
limit_months: True
crop_mask:    False
weighted_avg: True
hot_encode:   False

file pattern: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_*
li

Process ForkPoolWorker-7:
Process ForkPoolWorker-10:
Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/home/computevmuser/miniconda3/envs/mosaiks-env/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/computevmuser/miniconda3/envs/mosaiks-env/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/computevmuser/miniconda3/envs/mosaiks-env/lib/python3.8/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/computevmuser/miniconda3/envs/mosaiks-env/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
    return list(map(*args))
Process ForkPoolWorker-9:
  File "/tmp/ipykernel_607268/2542047236.py", line 104, in summarize_features
    features
  File "/home/computevmuser/miniconda3/envs/mosaiks-env/lib/python3.8/site-packages/pandas/core/groupby/groupby.py", line 1423, in apply
    r

In [12]:
# %%time
# names = 'limit_months crop_mask weighted_avg hot_encode'.split()
# for i in file_groups.index:
#     satellite    = file_groups.satellite[i]
#     bands        = file_groups.bands[i]
#     country      = file_groups.country_code[i]
#     points       = file_groups.points[i]
#     num_features = file_groups.num_features[i]
    
#     fn = f'{satellite}_bands-{bands}_{country}_{points}k-points_{num_features}-features_*'
#     path = str(here("data", "random_features", satellite, fn))
#     files = glob.glob(pathname=path)
    
#     print(f"\n\nMerging files matching pattern:\n\t{fn}")
    
#     features_og = get_merged_files(files)
#     features_og = features_og.sort_values(by = ['year', 'month']).reset_index(drop=True)
    
#     year_end = max(features_og.year)
        
#     if satellite == "landsat-c2-l2":
#         year_start = 2008
#     elif satellite == "landsat-8-c2-l2":
#         year_start = 2013 
#     else:
#         year_start = 2015 
    
#     for p in itertools.product([False,True],repeat=len(names)):
#         tic = time.time()
#         params = dict(zip(names,p))
#         limit_months = params['limit_months']
#         crop_mask = params["crop_mask"]
#         weighted_avg = params["weighted_avg"]
#         hot_encode = params["hot_encode"]
        
#         lm_color = bcolors.GR if limit_months else bcolors.RD
#         cm_color = bcolors.GR if crop_mask else bcolors.RD
#         wa_color = bcolors.GR if weighted_avg else bcolors.RD
#         he_color = bcolors.GR if hot_encode else bcolors.RD
        
#         print(f"Setting parameters:",
#               f"\n\tlimit_months: {lm_color}{limit_months}{bcolors.RESET}",
#               f"\n\tcrop_mask: {cm_color}{crop_mask}{bcolors.RESET}",
#               f"\n\tweighted_avg: {wa_color}{weighted_avg}{bcolors.RESET}",
#               f"\n\thot_encode: {he_color}{hot_encode}{bcolors.RESET}")
                
#         features = features_og.copy()
        
#         month_range = range(4, 10) if limit_months else range(1, 13)
             
#         if (satellite == "landsat-8-c2-l2") & (limit_months):
#             month_start = 4
#         else:
#             month_start = 10
            
#         print("Cleaning data\n\tSelecting months\n\tShifting Oct, Nov, and Dec to next year")
            
#         keep = np.where(
#             ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start), True, False)
        
#         features = features[keep]
        
#         features['year'] = np.where(
#             features['month'].isin([10, 11, 12]),
#             features['year'] + 1, 
#             features['year']
#         )
#         features = features[features.year <= year_end]
        
#         features.lon, features.lat = round(features.lon, 5), round(features.lat, 5)

        
        
        
        
#         ### META DATA here
        
        
        
        
#         features = features[features.month.isin(month_range)]
#         # features = features[features.cloud_cover < cloud_limit]
#         # features = features[features.na_percent  < na_limit]
#         # features.drop(['crop_perc','stac_id', 'cloud_cover', 'na_percent', 'platform'],axis=1, inplace=True)
        
#         print("Pivoting data (long to wide)\n\tFeature1_month1, feature1_month2, etc..")
        
#         features = features.set_index(['lon','lat', "year", 'month']).unstack()
#         features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))
        
#         features.replace([np.inf, -np.inf], np.nan, inplace=True)
#         features.reset_index(inplace = True)
        
#         if points == 4:
#             weights = weights_4.copy()
#         elif points == 15:
#             weights = weights_15.copy()
#         elif points == 20:
#             weights = weights_20.copy()

#         features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])
        
#         if crop_mask:
#             features = features[features.crop_perc > 0]
#         else:
#             pass   
                
#         features = geopandas.GeoDataFrame(
#             features, 
#             geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
#             crs='EPSG:4326'
#         )
        
#         features = (
#             features
#             .sjoin(country_shp, how = 'left', predicate = 'within')
#             .drop(['geometry', 'lon', 'lat'], axis = 1)
#             .rename(columns = {"index_right": "district"})
#             .dropna(subset=['district'])
#             .reset_index(drop = True)
#         )
        
#         num_cells = len(features) * len(month_range) * int(num_features)
#         ln_ft = len(features)
#         ln_na = len(features.dropna())
        
#         print('Imputing features',
#               f'\n\tStarting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
#               f'\n\tPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
#               f'\n\tPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#               f'\n\tPre-Impute NaN cell %: {bcolors.RD}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#               f'\n    Step 1: Filling NaN values by month, year, and district group average')
        
#         features.fillna(features.groupby(['year', 'district'], as_index=False).transform('mean'), inplace=True)
        
#         ln_ft = len(features)
#         ln_na = len(features.dropna())
#         print(f'\tPost step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
#               f'\n\tPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#               f'\n\tPost step 1 NaN cell %: {bcolors.YL}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#               f'\n    Step 2: Filling NaN values by month and district across group average')
        
#         features.fillna(features.groupby(['district'], as_index=False).transform('mean'), inplace=True)
        
#         ln_ft = len(features)
#         ln_na = len(features.dropna())
#         print(f'\tPost step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
#               f'\n\tPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#               f'\n\tPost step 2 NaN cell %: {bcolors.GR}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#               f'\n    Step 3: Drop remaining NaN values')
        
#         features = features.dropna(axis=0)
        
#         print(f'\tEnding total row count: {bcolors.BL}{len(features)}{bcolors.RESET}',
#               f'\nSummarizing features to district level\n\tWeighted average: {weighted_avg}')
        
#         if weighted_avg:
#             feature_cols = features.columns[1:-2].values.tolist()
#             features_summary = (
#                 features
#                 .groupby(['year', 'district'], as_index=False)
#                 .apply(lambda x: pd.Series([sum(x[feature] * x.crop_perc) / sum(x.crop_perc) for feature in feature_cols]))
#             )
#         else:
#             features_summary = features.groupby(['district',"year"], as_index = False).mean()  
        
#         print(f'Joining with crop yield')
        
#         features_summary = features_summary.set_index(["district", "year"]).join(other = crop_df_full).reset_index()
        
#         features_summary.columns = features_summary.columns.astype(str)
        
#         features_summary = features_summary[~features_summary.isna().any(axis = 1)]
        
        
#         if hot_encode:
#             features_summary = pd.get_dummies(
#                 features_summary, 
#                 columns=["district"], 
#                 drop_first=False
#             )
#         else:
#             pass
        
        
        
        
#         min_yr = min(features_summary.year); max_yr = max(features_summary.year)
#         min_mn = min(month_range); max_mn = max(month_range)
        
#         f = f'{fn[:-1]}yr-{min_yr}-{max_yr}_mn-{min_mn}-{max_mn}_lm-{limit_months}'+\
#             f'_cm-{crop_mask}_wa-{weighted_avg}_he-{hot_encode}_summary.feather'
        
#         print(f'Saving file as:\n    {f}',
#               f'\nFinished in: {(time.time()-tic)/60:0.2f} minutes\n\n')
        
#         full_file = here('data', 'random_features', 'summary', f)

#         # features_summary.reset_index(drop=True).to_feather(full_file)
                



Merging files matching pattern:
	sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-features_*
Setting parameters: 
	limit_months: [1;31mFalse[0m 
	crop_mask: [1;31mFalse[0m 
	weighted_avg: [1;31mFalse[0m 
	hot_encode: [1;31mFalse[0m
Cleaning data
	Selecting months
	Shifting Oct, Nov, and Dec to next year
Pivoting data (long to wide)
	Feature1_month1, feature1_month2, etc..
Imputing features 
	Starting total row count: [1;34m26215[0m 
	Pre-Impute NaN row count: [1;31m25344[0m 
	Pre-Impute NaN row %: [1;31m96.68[0m 
	Pre-Impute NaN cell %: [1;31m25.97[0m 
    Step 1: Filling NaN values by month, year, and district group average
	Post step 1 NaN row count: [1;33m19746[0m 
	Post step 1 NaN row %: [1;33m75.32[0m 
	Post step 1 NaN cell %: [1;33m13.44[0m 
    Step 2: Filling NaN values by month and district across group average
	Post step 2 NaN row count: [1;36m147[0m 
	Post step 2 NaN row %: [1;36m0.56[0m 
	Post step 2 NaN cell %: [1;36m0.05[0m 
    Step 3: Drop rem

KeyboardInterrupt: 

In [1]:
# features_summary

In [2]:
# features_og

In [3]:
# min_yr = min(features_summary.year); max_yr = max(features_summary.year)
# min_mn = min(month_range); max_mn = max(month_range)

# f = f'{fn[:-1]}yr-{min_yr}-{max_yr}_mn-{min_mn}-{max_mn}_lm-{limit_months}'+\
#     f'_cm-{crop_mask}_wa-{weighted_avg}_he-{hot_encode}_summary.feather'

# print(f'Saving file as:\n    {f}',
#       f'\nFinished in: {(time.time()-tic)/60:0.2f} minutes\n\n')

# full_file = here('data', 'random_features', 'full_files', 'summary', f)
# f
# features_summary.reset_index(drop=True).to_feather(full_file)

In [4]:
# features_summary

In [160]:
# features = get_merged_files(files)
# features_summary['log_yield'] = np.log10(features_summary.yield_mt.to_numpy() + 1)
features_summary.columns = features_summary.columns.astype(str)
features_summary = features_summary[~features_summary.isna().any(axis = 1)]
features_summary

Unnamed: 0,district,year,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,...,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,crop_perc,yield_mt
0,Chadiza,2016,0.004272,0.001818,0.085061,0.000116,0.001074,0.000977,0.000082,0.000366,...,0.043295,0.027509,0.003858,0.004520,0.003079,0.003409,0.004954,0.063499,0.381913,1.876427
1,Chadiza,2017,0.002981,0.001818,0.047673,0.088637,0.004830,0.000914,0.000732,0.000564,...,0.126630,0.078335,0.051738,0.006275,0.003435,0.002926,0.002944,0.122289,0.381913,2.882947
2,Chadiza,2018,0.009446,0.003262,0.036355,0.002062,0.001585,0.001120,0.000795,0.000497,...,0.142304,0.075960,0.039313,0.008548,0.003252,0.003404,0.003875,0.106135,0.381913,1.299279
3,Chadiza,2019,0.005566,0.001183,0.065058,0.002258,0.000926,0.000817,0.000509,0.000349,...,0.074028,0.070610,0.014644,0.006120,0.003145,0.003236,0.004202,0.063499,0.381913,2.131008
4,Chadiza,2020,0.005566,0.001009,0.051095,0.002267,0.001194,0.001441,0.000850,0.000230,...,0.059954,0.045868,0.030804,0.003590,0.003234,0.004737,0.010896,0.030506,0.381913,2.626538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,Zambezi,2018,0.007275,0.023318,0.003185,0.002004,0.001727,0.001155,0.000687,0.000205,...,0.265652,0.201325,0.120433,0.040692,0.034962,0.017143,0.027968,0.071155,0.013294,1.622273
494,Zambezi,2019,0.030689,0.009172,0.003172,0.003763,0.002209,0.001445,0.001053,0.000595,...,0.187701,0.134524,0.068055,0.027539,0.003993,0.021956,0.056402,0.006002,0.013294,1.184923
495,Zambezi,2020,0.017266,0.023318,0.007270,0.002259,0.001990,0.001044,0.000723,0.000404,...,0.239021,0.135596,0.106608,0.025361,0.012158,0.015129,0.007757,0.134643,0.013294,1.689628
496,Zambezi,2021,0.026252,0.004763,0.002882,0.003188,0.002003,0.000972,0.001044,0.000720,...,0.243124,0.157832,0.112836,0.060004,0.023231,0.019192,0.028617,0.071155,0.013294,2.981741


In [162]:
### https://stackoverflow.com/questions/48188865/grouped-cross-validation-lassocv-scikit-learn
### highest mean cross-validated test score (corresponds where alpha=best alpha)
from sklearn.model_selection import LeaveOneGroupOut

drop_cols = ['district', 'year', 
             # 'log_yield',
             'yield_mt',
             "crop_perc", ]
x_all = features_summary.drop(drop_cols, axis = 1).values
y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)
g_all = features_summary.year.ravel()

x_train, x_test,\
y_train, y_test,\
g_train, g_test = train_test_split(
    x_all, y_all, g_all,
    test_size = 0.2, 
    random_state = 2022
)

In [187]:
ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
ridge_cv_random.fit(x_train, y_train)
ridge_cv_random.best_score_

0.5749732621956999

In [178]:
logo = LeaveOneGroupOut()
gen_logo = logo.split(x_train, y_train, groups=g_train)
ridge_logo_cv = RidgeCV(cv=gen_logo, alphas = np.logspace(-8, 8, base = 10, num = 17))
ridge_logo_cv.fit(x_train, y_train)

In [179]:
ridge_logo_cv.alpha_, ridge_logo_cv.best_score_

(1000.0, 0.2624140350565368)

In [327]:
# logo = LeaveOneGroupOut()
# gen_logo = logo.split(x_train, groups = g_train)
# ridge_logo_cv = RidgeCV(cv=gen_logo, alphas = np.logspace(-8, 8, base = 10, num = 17))
# ridge_logo_cv.fit(x_all, y_all)

In [177]:
# logo = LeaveOneGroupOut().split(x_all, y_all, groups = g_all)
# ridge_logo_cv = RidgeCV(cv=logo, alphas = np.logspace(-8, 8, base = 10, num = 17), scoring='r2')
# ridge_logo_cv.fit(x_all, y_all)

In [180]:
ridge_logo_cv.alpha_, ridge_logo_cv.best_score_ 
# highest mean cross-validated test score (corresponds where alpha=best alpha)

(1000.0, 0.2624140350565368)

In [169]:
# from sklearn.model_selection import cross_val_score
# logo = LeaveOneGroupOut()
# ridge_cv = RidgeCV(alphas = np.logspace(-8, 8, base = 10, num = 17))
# scores = cross_val_score(ridge_cv, x_train, y_train, cv=logo, groups=g_train)
# scores

from sklearn.model_selection import cross_val_score, LeaveOneGroupOut, GridSearchCV, cross_val_predict
from sklearn.linear_model import Ridge, RidgeCV

In [189]:
logo = LeaveOneGroupOut()
parameters = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
ridge = Ridge()
ridge_reg = GridSearchCV(ridge, parameters, scoring='r2',cv=logo)
ridge_reg.fit(x_train, y_train, groups=g_train)
best_model = ridge_reg.best_estimator_

In [190]:
predictions = cross_val_predict(
    best_model, 
    X=x_train,
    y=y_train, 
    groups=g_train,
    cv=logo
)

In [192]:
r2_score(y_train, predictions)

0.301668347712272

In [184]:
ridge_reg.best_score_

0.2624140350565368

In [193]:
scores = cross_val_score(
    estimator=best_model, 
    X=x_train,
    y=y_train, 
    groups=g_train, 
    cv=logo, 
    scoring='r2'
)
scores

array([0.3392959 , 0.26256015, 0.23677038, 0.2054101 , 0.36150555,
       0.19283416, 0.23852201])

In [194]:
scores.mean()

0.2624140350565368

In [188]:
cv_results = pd.DataFrame(ridge_reg.cv_results_)
cv_results[cv_results.rank_test_score == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,mean_test_score,std_test_score,rank_test_score
11,0.05471,0.010931,0.001927,0.000201,1000.0,{'alpha': 1000.0},0.339296,0.26256,0.23677,0.20541,0.361506,0.192834,0.238522,0.262414,0.05982,1


In [106]:
ridge_reg.fit()

<bound method BaseEstimator.get_params of GridSearchCV(cv=<generator object BaseCrossValidator.split at 0x7fdd1cb483c0>,
             estimator=Ridge(),
             param_grid={'alpha': array([1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01,
       1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,
       1.e+08])},
             scoring='r2')>

In [89]:
logo = LeaveOneGroupOut().split(x_all, groups = g_all)
ridge_cv = RidgeCV(alphas = np.logspace(-8, 8, base = 10, num = 17))
scores = cross_val_score(
    estimator=ridge_cv, 
    X=x_all,
    y=y_all, 
    groups=g_all, 
    cv=logo, 
    scoring='r2'
)
scores

array([ 0.65046147, -4.45661669,  0.45067598,  0.43997513,  0.59768849,
        0.22949096,  0.44333262,  0.14996408])

In [63]:
# import sklearn
# sklearn.metrics.get_scorer_names()

In [82]:
# scores = scores[scores > 0]
scores.mean()

-0.1868784951414862

In [85]:
ridge_cv.score(x_test, y_test), ridge_cv.score(x_train, y_train)

(0.5214319866765469, 0.47271140453664107)

In [56]:
r2_score(y_all, ridge_cv.predict(x_all))

0.48562346284432945

In [57]:
r2_score(y_train, ridge_cv.predict(x_train))

0.47271140453664107

In [58]:
r2_score(y_test, ridge_cv.predict(x_test))

0.5214319866765469

In [6]:
# # satellite = "landsat-c2-l2"
# # satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

# #bands = "2-3-4"
# # bands = "2-3-4-8"
# # bands = "1-2-3-4-5-6-7"
# # bands = "r-g-b-nir-swir16-swir22"
# bands = "2-3-4-5-6-7-8-11-12"

# month_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# # month_range = [         4, 5, 6, 7, 8, 9            ]

# # points = 15
# points = 20

# include_2013 = True
# # include_2013 = False

# cloud_limit = 20
# na_limit = 25

# country_code = "ZMB"

# num_features = 1000
# # num_features = 1024

# if satellite == "landsat-c2-l2":
#     year_start = 2008
# elif satellite == "landsat-8-c2-l2":
#     year_start = 2013 
# else:
#     year_start = 2015 
    
# year_end = 2021
  
# data_dir = here("data")  

# feature_file_name = f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features'
# weight_file_name = f'{country_code}_cropland_percentage_{points}k-points'

In [None]:
# if include_2013 & (satellite == "landsat-8-c2-l2") & (month_range == [4, 5, 6, 7, 8, 9]):
#     month_start = 4
# else:
#     month_start = 10
    
# keep = np.where(
#     ((features.year == year_start) & (features.month >= month_start)) | (features.year > year_start),
#     True, False
# )
# features = features[keep]

In [None]:
# features['year'] = np.where(
#     features['month'].isin([10, 11, 12]),
#     features['year'] + 1, 
#     features['year']
# )

# features = features[features['year'] <= year_end]
# features

Add logic  to calculate summary stats here. Carry over cloud cover

In [None]:
# feat_name = (f'{satellite}_bands-{bands}_{country_code}_{points}'+
#              f'k-points_{num_features}-features_meta')
# meta_file = str(here('data', 'random_features', 'feature_meta_data', f'{file[:-2]}_meta.feather'))
# meta_df = features[['year',  'month', 'crop_perc', 'stac_id', 'platform', 'cloud_cover', 'na_percent']]
# meta_df.to_feather(meta_file)

In [None]:
# meta_summary = pd\
#     .get_dummies(
#         features[['year',  'month', 'crop_perc', 'platform', 'cloud_cover', 'na_percent']],
#         columns=["platform"], 
#         drop_first=False,
#         prefix = "", prefix_sep = "")\
#     .groupby(
#         ['year',  'month'], as_index=False)\
#     .agg(
#         {
#             'crop_perc':'mean', 
#             'cloud_cover': 'mean',
#             'na_percent': 'mean',
#             'landsat-5': 'sum',
#             'landsat-7': 'sum',
#             'landsat-8': 'sum',
#             # 'landsat-9': 'sum'
#         }
# )
# meta_summary

In [None]:
# features = features[features.month.isin(month_range)]
# features = features[features.cloud_cover < cloud_limit]
# features = features[features.na_percent < na_limit]
# features.drop(['crop_perc','stac_id', 'cloud_cover', 'na_percent', 'platform'],axis=1, inplace=True)
# features

In [None]:
# features = features.set_index(['lon','lat', "year", 'month']).unstack()
# features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))
# features

In [None]:
# features.replace([np.inf, -np.inf], np.nan, inplace=True)
# features = features.reset_index()
# features

In [None]:
# features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])
# features = features.drop(["geometry"], axis = 1)
# features

In [None]:
# features = geopandas.GeoDataFrame(
#     features, 
#     geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
#     crs='EPSG:4326'
# )

In [None]:
# features = features.sjoin(country_shp, how = 'left', predicate = 'within')
# features

In [None]:
# features = (
#     features
#     .dropna(subset=['index_right'])
#     .rename(columns = {"index_right": "district",})
#     .reset_index(drop = True)
# )
# features = features.drop(['geometry'], axis = 1)
# features

In [None]:
# class bcolors:
#     BL = '\x1b[1;34m' # BLUE
#     GR = '\x1b[1;36m' # GREEN
#     YL = '\x1b[1;33m' # YELLOW
#     RD = '\x1b[1;31m' # RED
#     RESET = '\033[0m' # RESET COLOR

In [None]:
# %%time
# num_cells = len(features) * len(month_range) * num_features
# ln_ft = len(features)
# ln_na = len(features.dropna())
# print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
#       f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
#       f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#       f'\nPre-Impute NaN cell %: {bcolors.RD}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#       f'\n\nStep 1: Filling NaN values by month, year, and district group average')
# features = (
#     features
#     .fillna(features
#             .groupby(['year', 'district'], as_index=False)
#             .transform('mean')
#            )
# )
# ln_ft = len(features)
# ln_na = len(features.dropna())
# print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
#       f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#       f'\nPost step 1 NaN cell %: {bcolors.YL}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#       f'\n\nStep 2: Filling NaN values by month and district across group average')
# features = (
#     features
#     .fillna(features
#             .groupby(['district'], as_index=False)
#             .transform('mean')
#            )
# )
# ln_ft = len(features)
# ln_na = len(features.dropna())
# print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
#       f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
#       f'\nPost step 2 NaN cell %: {bcolors.GR}{(features.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
#       f'\n\nStep 3: Drop remaining NaN values\n')
# features = features.dropna(axis=0)
# print(f'Ending total row count: {bcolors.BL}{len(features)}{bcolors.RESET}')

In [None]:
# features

In [None]:
# full_file = here(
#     'data', 
#     'random_features', 
#     'full_files',
#     f'{file[:-1]}yr-{min(features.year)}-{max(features.year)}_mn-{min(month_range)}-\
# {max(month_range)}_cloud-limit-{cloud_limit}_na-limit-{na_limit}.feather')
# full_file

In [None]:
# features.reset_index(drop=True).to_feather(full_file)

In [None]:
# features.filter(regex=fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district', axis=1)