In [1]:
## import warnings
import time
import math
import os
import glob
import numpy as np
import pandas as pd
import geopandas
import pyarrow
import itertools
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from pyhere import here
from datetime import date

In [2]:
# Locations of the project data, built with pyhere's `here`.
data_dir = here("data")
directory = here("data", "random_features", "full_files")

# Date stamp (YYYY-MM-DD) used in the name of the results CSV.
today = date.today().strftime("%Y-%m-%d")

# Feature files to evaluate, in sorted order. Only `.feather` files are kept:
# the modelling loop parses every entry's name by splitting on "_", so any
# other directory entry (`.gitkeep`, `.ipynb_checkpoints`, `.DS_Store`, ...)
# must never reach it.
files = sorted(f for f in os.listdir(directory) if f.endswith(".feather"))

# Names of the boolean modelling options; every True/False combination of
# these is evaluated for each feature file.
names = 'crop_mask weighted_avg hot_encode'.split()

# Accumulator for the per-model result rows.
results = pd.DataFrame()

In [3]:
# files[4:6]

In [4]:
# file = 'landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021_mn-1-12_cloud-limit-10_na-limit-25.feather'
# f    = file.split(sep="_")
# satellite    = f[0]
# bands        = f[1].replace("bands-", "")
# country_code = f[2]
# points       = f[3].replace("k-points", "")
# num_features = f[4].replace("-features", "")
# yrs          = f[5].replace("yr-", "").split(sep="-")
# mns          = f[6].replace("mn-", "").split(sep="-")
# cloud_limit  = int(f[7].replace("cloud-limit-", ""))
# na_limit     = int(f[8].replace("na-limit-", "").replace(".feather", ""))
# years        = range(int(yrs[0]), int(yrs[1])+1)
# month_range  = list(range(int(mns[0]), int(mns[1])+1))

In [5]:
# month_range

In [6]:
# f'{min(month_range)}-{max(month_range)}'

In [7]:
%%time
for file in files:

    f            = file.split(sep="_")
    satellite    = f[0]
    bands        = f[1].replace("bands-", "")
    country_code = f[2]
    points       = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    yrs          = f[5].replace("yr-", "").split(sep="-")
    mns          = f[6].replace("mn-", "").split(sep="-")
    cloud_limit  = f[7].replace("cloud-limit-", "")
    na_limit     = f[8].replace("na-limit-", "").replace(".feather", "")
    years        = range(int(yrs[0]), int(yrs[1])+1)
    month_range  = list(range(int(mns[0]), int(mns[1])+1))

    for p in itertools.product([False,True],repeat=len(names)):
        params = dict(zip(names,p))
        # limit_months = params['limit_months']
        crop_mask = params["crop_mask"]
        weighted_avg = params["weighted_avg"]
        hot_encode = params["hot_encode"]

        fn=f"{directory}/{file}"

        print(f"Opening: {file}")
        
        features = pd.read_feather(fn)

        country_shp = geopandas.read_file(f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp')
        country_shp = country_shp.set_index('district')

        crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
        crop_df_full = crop_df_full[crop_df_full.year <= max(years)]
        crop_df = crop_df_full[['district', 'year', 'yield_mt']]
        crop_df = crop_df.set_index('district')

        if limit_months:
            month_range = list(range(4, 10))
            filter_regex = fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district|lon|lat'
            features = features.filter(regex=filter_regex, axis=1)
        else:
            month_range = list(range(1, 13))
        
        if crop_mask:
            features = features[features.crop_perc > 0]
        else:
            pass

        print(
f"""     Setting parameters:
        Feature info:
            satellite: {satellite}
            bands: {bands}
            month range: {month_range}
            cloud limit: {cloud_limit} 
            NaN limit: {na_limit}
        User options:
            crop_mask: {crop_mask}
            weighted_avg: {weighted_avg}
            hot_encode: {hot_encode}"""
        )

        features = geopandas.GeoDataFrame(
            features, 
            geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
            crs='EPSG:4326'
        )

        features = features.drop(['geometry', 'lon', 'lat'], axis = 1)
        features = features[features.year <= max(years)]

        print('    Summarizing features')
        
        if weighted_avg:
            var_cols = features.columns[2:-1].values.tolist()
            features_summary = (
                features
                .groupby(['year', 'district'], as_index=False)
                .apply(lambda x: pd.Series([sum(x[v] * x.crop_perc) / sum(x.crop_perc) for v in var_cols]))
            )
        else:
            features_summary = features.groupby(['district',"year"], as_index = False).mean()

        crop_data_filtered = crop_df[crop_df.year >= min(features_summary.year)]
        crop_data_filtered = crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
        crop_data_filtered.reset_index(inplace = True)

        features_summary = (
            features_summary
            .set_index(["district", "year"])
            .join(other = crop_data_filtered.set_index(["district", "year"]))
            .reset_index())
        
        features_summary.columns = features_summary.columns.astype(str)
        
        if weighted_avg:
            drop_cols = ['district', 'year', 'yield_mt']
        else:
            drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]

        if hot_encode:
            drop_cols.remove("district")
            x_all = pd.get_dummies(features_summary, 
                                   columns=["district"], 
                                   drop_first=False)
            x_all = x_all.drop(drop_cols, axis = 1)
        else:
            x_all = features_summary.drop(drop_cols, axis = 1)

        y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size = 0.2, random_state = 0
        )

        print('    Modeling')
        ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
        ridge_cv_random.fit(x_train, y_train)

        y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
        r2_train = r2_score(y_train, y_pred_train)
        pearson_train = pearsonr(y_pred_train, y_train)[0]

        y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
        r2_test = r2_score(y_test, y_pred_test)
        pearson_test = pearsonr(y_pred_test, y_test)[0]

        features_summary["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
        features_summary["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
        features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]
        features_summary["district_yield_mean"] = features_summary.groupby('district')['log_yield'].transform('mean')
        features_summary["district_prediction_mean"] = features_summary.groupby('district')['prediction'].transform('mean')
        features_summary["demean_yield"] = features_summary["log_yield"] - features_summary["district_yield_mean"]
        features_summary["demean_prediction"] = features_summary["prediction"] - features_summary["district_prediction_mean"]

        r_squared = r2_score(features_summary["demean_yield"], features_summary["demean_prediction"])
        pearson_r = pearsonr(features_summary["demean_yield"], features_summary["demean_prediction"])[0]
        
        print(
f"""    Model results:
        Data split:
            Number of total points: {len(x_all)}
            Number of training points: {len(x_train)}
            Number of testing points: {len(x_test)}
        Model regularization parameter:
            Best \u03BB = {ridge_cv_random.alpha_}
        Validation score:
            R\u00B2 = {ridge_cv_random.best_score_:0.2f}
        Training scores:
            R\u00B2 = {r2_train:0.2f}  
            Pearsons r = {pearson_train:0.2f}
            Pearson r\u00B2 = {pearson_train ** 2:0.2f}
        Testing scores:    
            R\u00B2 = {r2_test:0.2f}
            Pearsons r = {pearson_test:0.2f}
            Pearson r\u00B2 = {pearson_test ** 2:0.2f}
        Demeaned by location scores:
            R\u00B2 = {r_squared:.2f}
            Pearson's r: {pearson_r:.2f} 
            Pearson r\u00B2 = {round(pearson_r ** 2, 2)}
"""
        )
        
        d = {
            'country': country_code,
            'satellite': [satellite],
            'bands': bands,
            'num_features': num_features,
            'points': points, #len(features[features.year == min(features.year)]),
            'month_range': f'{min(month_range)}-{max(month_range)}',
            'cloud_limit': cloud_limit,
            'na_limit': na_limit,
            'crop_mask': crop_mask,
            'weighted_avg': weighted_avg,
            'hot_encode': hot_encode,
            'total_n': len(x_all),
            'train_n': len(x_train),
            'test_n': len(x_test),
            'val_R2': ridge_cv_random.best_score_,
            'train_R2': r2_train,
            'train_r': pearson_train,
            'train_r2': pearson_train ** 2,
            'test_R2': r2_test,
            'test_r': pearson_test,
            'test_r2': pearson_test ** 2,
            'demean_R2': r_squared,
            'demean_r': pearson_r,
            'demean_r2': pearson_r ** 2,
        }
        df = pd.DataFrame(data=d)
        results = pd.concat([results, df])
            
results.to_csv(f"{data_dir}/results/results_{today}.csv")
results

Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9_cloud-limit-10_na-limit-50.feather
     Setting parameters:
        Feature info:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month range: [4, 5, 6, 7, 8, 9]
            cloud limit: 10 
            NaN limit: 50
        User options:
            crop_mask: False
            weighted_avg: False
            hot_encode: False
    Summarizing features
    Modeling
    Model results:
        Data split:
            Number of total points: 648
            Number of training points: 518
            Number of testing points: 130
        Model regularization parameter:
            Best λ = 10.0
        Validation score:
            R² = 0.61
        Training scores:
            R² = 0.73  
            Pearsons r = 0.86
            Pearson r² = 0.74
        Testing scores:    
            R² = 0.61
            Pearsons r = 0.79
            Pearson r² = 0.63
     

Unnamed: 0,country,satellite,bands,num_features,points,month_range,cloud_limit,na_limit,crop_mask,weighted_avg,...,val_R2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_R2,demean_r,demean_r2
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,10,50,False,False,...,0.611833,0.734995,0.858702,0.737370,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,10,50,False,False,...,0.748089,0.919108,0.959472,0.920586,0.683508,0.854250,0.729743,0.445727,0.671879,0.451421
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,10,50,False,True,...,0.507164,0.902564,0.951336,0.905040,0.613655,0.798784,0.638056,0.443414,0.702400,0.493366
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,10,50,False,True,...,0.713399,0.899174,0.949608,0.901755,0.788569,0.892588,0.796714,0.459292,0.680976,0.463729
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,15,4-9,10,50,True,False,...,0.518270,0.709148,0.843807,0.712010,0.683971,0.828155,0.685841,0.289258,0.570173,0.325098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,10,50,False,True,...,0.644630,0.918790,0.958824,0.919343,0.687865,0.837494,0.701396,0.502713,0.710119,0.504269
0,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,10,50,True,False,...,0.545423,0.856178,0.927479,0.860218,0.534839,0.745309,0.555485,0.421782,0.672204,0.451859
0,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,10,50,True,False,...,0.722936,0.913717,0.956184,0.914289,0.763860,0.875399,0.766324,0.526402,0.726287,0.527493
0,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,10,50,True,True,...,0.396092,0.858301,0.928500,0.862112,0.548313,0.752095,0.565648,0.447578,0.686649,0.471486


In [8]:
# meta_summary = features[['year',  'month', 'crop_perc', 'platform', 'cloud_cover', 'na_percent']]
# meta_summary = pd.get_dummies(meta_summary, columns=["platform"], drop_first=False, prefix = "", prefix_sep = "")
# meta_summary = meta_summary.groupby(['year',  'month'], as_index=False).agg(
#     {
#         'crop_perc':'mean', 
#         'cloud_cover': 'mean',
#         'na_percent': 'mean',
#         'landsat-5': 'sum',
#         'landsat-7': 'sum',
#         'landsat-8': 'sum',
#         # 'landsat-9': 'sum'
#     }
# )
# meta_summary

In [9]:
# data_dir = here("data")
# directory = here("data", "random_features", "full_files")
# results = pd.DataFrame()

# file = "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021.feather"

# f = file.split(sep="_")
# satellite = f[0]
# bands = f[1].replace("bands-", "")
# country_code = f[2]
# points = f[3].replace("k-points", "")
# num_features=f[4].replace("-features", "")
# yrs=f[5].replace("yr-", "").replace(".feather", "").split(sep="-")
# years=range(int(yrs[0]), int(yrs[1])+1)


# limit_months = True
# crop_mask    = True
# weighted_avg = True
# hot_encode   = True

# fn=f"{directory}/{file}"

# print(f"Opening: {file}")

# features = pd.read_feather(fn)

# country_shp = geopandas.read_file(f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp')
# country_shp = country_shp.set_index('district')

# crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
# crop_df_full = crop_df_full[crop_df_full.year <= max(years)]
# crop_df = crop_df_full[['district', 'year', 'yield_mt']]
# crop_df = crop_df.set_index('district')

# if limit_months:
#     month_range = list(range(4, 10))
#     filter_regex = fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district|lon|lat'
#     features = features.filter(regex=filter_regex, axis=1)
# else:
#     month_range = list(range(1, 13))

# if crop_mask:
#     features = features[features.crop_perc > 0]
# else:
#     pass

# print(
# f"""
# Setting parameters:
# Feature info:
#     satellite: {satellite}
#     bands: {bands}
#     month_range: {month_range}
# User options:
#     limit_months: {limit_months}
#     crop_mask: {crop_mask}
#     weighted_avg: {weighted_avg}
#     hot_encode: {hot_encode}
# """
# )

# features = geopandas.GeoDataFrame(
#     features, 
#     geometry = geopandas.points_from_xy(x = features.lon, y = features.lat), 
#     crs='EPSG:4326'
# )

# features = features.drop(['geometry', 'lon', 'lat'], axis = 1)
# features = features[features.year <= max(years)]
# var_cols = features.columns[2:-1].values.tolist()
# features.columns[2:-1]

In [10]:
# features

In [11]:
# print('    Summarizing features\n')
# if weighted_avg:
#     features_summary = (
#         features
#         .groupby(['year', 'district'], as_index=False)
#         .apply(lambda x: pd.Series([sum(x[v] * x.crop_perc) / sum(x.crop_perc) for v in var_cols]))
#     )
# else:
#     features_summary = features.groupby(['district',"year"], as_index = False).mean()

# crop_data_filtered = crop_df[crop_df.year >= min(features_summary.year)]
# crop_data_filtered = crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
# crop_data_filtered.reset_index(inplace = True)

# features_summary = (
#     features_summary
#     .set_index(["district", "year"])
#     .join(other = crop_data_filtered.set_index(["district", "year"]))
#     .reset_index())

In [12]:
# features_summary

In [13]:
# features_summary.columns = features_summary.columns.astype(str)

In [14]:
# if weighted_avg:
#     drop_cols = ['district', 'year', 'yield_mt']
# else:
#     drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]

# if hot_encode:
#     drop_cols.remove("district")
#     x_all = pd.get_dummies(features_summary, 
#                            columns=["district"], 
#                            drop_first=False)
#     x_all = x_all.drop(drop_cols, axis = 1)
# else:
#     x_all = features_summary.drop(drop_cols, axis = 1)

# y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

# x_train, x_test, y_train, y_test = train_test_split(
#     x_all, y_all, test_size = 0.2, random_state = 0
# )

In [15]:
# print('    Modeling')
# ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
# ridge_cv_random.fit(x_train, y_train)

In [16]:
# y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
# r2_train = r2_score(y_train, y_pred_train)
# pearson_train = pearsonr(y_pred_train, y_train)[0]

# y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
# r2_test = r2_score(y_test, y_pred_test)
# pearson_test = pearsonr(y_pred_test, y_test)[0]

# features_summary["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
# features_summary["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
# features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]
# features_summary["district_yield_mean"] = features_summary.groupby('district')['log_yield'].transform('mean')
# features_summary["district_prediction_mean"] = features_summary.groupby('district')['prediction'].transform('mean')
# features_summary["demean_yield"] = features_summary["log_yield"] - features_summary["district_yield_mean"]
# features_summary["demean_prediction"] = features_summary["prediction"] - features_summary["district_prediction_mean"]

# r_squared = r2_score(features_summary["demean_yield"], features_summary["demean_prediction"])
# pearson_r = pearsonr(features_summary["demean_yield"], features_summary["demean_prediction"])[0]

# print(
# f"""
# Model results:
# Data split:
#     Number of total points: {len(x_all)}
#     Number of training points: {len(x_train)}
#     Number of testing points: {len(x_test)}
# Model regularization parameter:
#     Best \u03BB = {ridge_cv_random.alpha_}
# Validation score:
#     R\u00B2 = {ridge_cv_random.best_score_:0.2f}
# Training scores:
#     R\u00B2 = {r2_train:0.2f}  
#     Pearsons r = {pearson_train:0.2f}
#     Pearson r\u00B2 = {pearson_train ** 2:0.2f}
# Testing scores:    
#     R\u00B2 = {r2_test:0.2f}
#     Pearsons r = {pearson_test:0.2f}
#     Pearson r\u00B2 = {pearson_test ** 2:0.2f}
# Demeaned by location scores:
#     R\u00B2 = {r_squared:.2f}
#     Pearson's r: {pearson_r:.2f} 
#     Pearson r\u00B2 = {round(pearson_r ** 2, 2)}
# """
# )

# d = {
#     'country': country_code,
#     'satellite': [satellite],
#     'bands': bands,
#     'num_features': num_features,
#     'points': len(features[features.year == min(features.year)]),
#     'month_range': str(month_range),
#     'limit_months': limit_months,
#     'crop_mask': crop_mask,
#     'weighted_avg': weighted_avg,
#     'hot_encode': hot_encode,
#     'total_n': len(x_all),
#     'train_n': len(x_train),
#     'test_n': len(x_test),
#     'val_R2': ridge_cv_random.best_score_,
#     'train_R2': r2_train,
#     'train_r': pearson_train,
#     'train_r2': pearson_train ** 2,
#     'test_R2': r2_test,
#     'test_r': pearson_test,
#     'test_r2': pearson_test ** 2,
#     'demean_R2': r_squared,
#     'demean_r': pearson_r,
#     'demean_r2': pearson_r ** 2,
# }
# df = pd.DataFrame(data=d)
# results = pd.concat([results, df])
            
# results