In [None]:
## import warnings
import time
import math
import os
import glob
import numpy as np
import pandas as pd
import geopandas
import pyarrow
import itertools
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from pyhere import here

In [None]:
# Grid experiment: for each selected feature file, fit a ridge regression of
# log10(yield + 1) on district-level summaries of the point features, once for
# every True/False combination of the four options in `names`, and collect one
# row of scores per combination in `results`.
data_dir = here("data")
directory = here("data", "random_features", "full_files")
files = os.listdir(directory)
files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]
files.sort()
names = 'limit_months crop_mask weighted_avg hot_encode'.split()

# One single-row frame per (file, option combination). They are concatenated
# once after the loops instead of growing `results` with pd.concat every pass.
result_frames = []

# NOTE(review): the slice restricts the run to the third file in sorted order.
for file in files[2:3]:

    # The file name is split on "_" into six fields:
    # satellite, "bands-<...>", country code, "<N>k-points", "<M>-features",
    # "yr-<start>-<end>.feather".
    f = file.split(sep="_")
    satellite = f[0]
    bands = f[1].replace("bands-", "")
    country_code = f[2]
    points = f[3].replace("k-points", "")
    num_features=f[4].replace("-features", "")
    yrs=f[5].replace("yr-", "").replace(".feather", "").split(sep="-")
    years=range(int(yrs[0]), int(yrs[1])+1)

    # Everything read from disk depends only on the file, not on the option
    # combination, so it is loaded once here rather than 16 times below.
    fn=f"{directory}/{file}"

    print(f"Opening: {file}")

    features_raw = pd.read_feather(fn)

    # Not used by the modelling in this cell; kept so the name stays defined.
    country_shp = geopandas.read_file(f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp')
    country_shp = country_shp.set_index('district')

    crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
    crop_df_full = crop_df_full[crop_df_full.year <= max(years)]
    crop_df = crop_df_full[['district', 'year', 'yield_mt']]
    crop_df = crop_df.set_index('district')

    for p in itertools.product([False,True],repeat=len(names)):
        params = dict(zip(names,p))
        limit_months = params['limit_months']
        crop_mask = params["crop_mask"]
        weighted_avg = params["weighted_avg"]
        hot_encode = params["hot_encode"]

        # Every step below returns a new frame, so `features_raw` is never
        # modified and can be reused by the next combination.
        features = features_raw

        if limit_months:
            # Keep only columns whose name contains "_<month>" for months 4-9,
            # plus the bookkeeping columns named in the pattern.
            month_range = list(range(4, 10))
            filter_regex = fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district|lon|lat'
            features = features.filter(regex=filter_regex, axis=1)
        else:
            month_range = list(range(1, 13))

        if crop_mask:
            features = features[features.crop_perc > 0]

        print(
f"""
    Setting parameters:
        Feature info:
            satellite: {satellite}
            bands: {bands}
            month_range: {month_range}
        User options:
            limit_months: {limit_months}
            crop_mask: {crop_mask}
            weighted_avg: {weighted_avg}
            hot_encode: {hot_encode}
"""
        )

        # The original built a point GeoDataFrame from lon/lat and dropped the
        # geometry on the very next line; only the column removal had an effect,
        # so the columns are dropped directly. A pre-existing "geometry" column,
        # if any, is still removed as before.
        features = (
            features
            .drop(columns=['lon', 'lat'])
            .drop(columns=['geometry'], errors='ignore')
        )
        features = features[features.year <= max(years)]

        print('    Summarizing features\n')

        if weighted_avg:
            # Positional selection: assumes the first two remaining columns are
            # the grouping keys and the last one is crop_perc.
            # TODO confirm against the feather schema.
            var_cols = features.columns[2:-1].values.tolist()
            # crop_perc-weighted mean of every feature column per (year, district),
            # computed with NumPy instead of a Python-level sum per column.
            # The result keeps integer labels 0..n-1, as before. Values may differ
            # from the old element-by-element sum in the last floating-point digit.
            features_summary = (
                features
                .groupby(['year', 'district'], as_index=False)
                .apply(lambda x: pd.Series(
                    (x[var_cols].to_numpy() * x.crop_perc.to_numpy()[:, None]).sum(axis=0)
                    / x.crop_perc.to_numpy().sum()
                ))
            )
        else:
            features_summary = features.groupby(['district',"year"], as_index = False).mean()

        crop_data_filtered = crop_df[crop_df.year >= features_summary.year.min()]
        crop_data_filtered = (
            crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
            .reset_index()
        )

        # Left join: every feature row is kept, yield is attached where available.
        features_summary = (
            features_summary
            .set_index(["district", "year"])
            .join(other = crop_data_filtered.set_index(["district", "year"]))
            .reset_index())

        # Rows without a yield value would put NaN into the regression target and
        # make the fit raise, so they are dropped (and reported) here.
        missing_yield = features_summary.yield_mt.isna()
        if missing_yield.any():
            print(f"    Dropping {int(missing_yield.sum())} district-year rows without a yield value\n")
            features_summary = features_summary[~missing_yield].reset_index(drop=True)

        # The weighted-average branch yields integer column labels; make all
        # labels strings so the frame has a single label type.
        features_summary.columns = features_summary.columns.astype(str)

        if weighted_avg:
            drop_cols = ['district', 'year', 'yield_mt']
        else:
            drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]

        if hot_encode:
            # get_dummies replaces "district" with indicator columns, so it must
            # not be in the list of columns to drop afterwards.
            drop_cols.remove("district")
            x_all = pd.get_dummies(features_summary, 
                                   columns=["district"], 
                                   drop_first=False)
            x_all = x_all.drop(drop_cols, axis = 1)
        else:
            x_all = features_summary.drop(drop_cols, axis = 1)

        y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size = 0.2, random_state = 0
        )

        print('    Modeling')
        ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
        ridge_cv_random.fit(x_train, y_train)

        # Predictions are floored at 0 before scoring.
        y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
        r2_train = r2_score(y_train, y_pred_train)
        pearson_train = pearsonr(y_pred_train, y_train)[0]

        y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
        r2_test = r2_score(y_test, y_pred_test)
        pearson_test = pearsonr(y_pred_test, y_test)[0]

        # Scores after subtracting each district's own mean from both the
        # observed and the predicted log yield.
        features_summary["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
        features_summary["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
        features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]
        features_summary["district_yield_mean"] = features_summary.groupby('district')['log_yield'].transform('mean')
        features_summary["district_prediction_mean"] = features_summary.groupby('district')['prediction'].transform('mean')
        features_summary["demean_yield"] = features_summary["log_yield"] - features_summary["district_yield_mean"]
        features_summary["demean_prediction"] = features_summary["prediction"] - features_summary["district_prediction_mean"]

        r_squared = r2_score(features_summary["demean_yield"], features_summary["demean_prediction"])
        pearson_r = pearsonr(features_summary["demean_yield"], features_summary["demean_prediction"])[0]

        print(
f"""
    Model results:
        Data split:
            Number of total points: {len(x_all)}
            Number of training points: {len(x_train)}
            Number of testing points: {len(x_test)}
        Model regularization parameter:
            Best \u03BB = {ridge_cv_random.alpha_}
        Validation score:
            R\u00B2 = {ridge_cv_random.best_score_:0.2f}
        Training scores:
            R\u00B2 = {r2_train:0.2f}  
            Pearsons r = {pearson_train:0.2f}
            Pearson r\u00B2 = {pearson_train ** 2:0.2f}
        Testing scores:    
            R\u00B2 = {r2_test:0.2f}
            Pearsons r = {pearson_test:0.2f}
            Pearson r\u00B2 = {pearson_test ** 2:0.2f}
        Demeaned by location scores:
            R\u00B2 = {r_squared:.2f}
            Pearson's r: {pearson_r:.2f} 
            Pearson r\u00B2 = {round(pearson_r ** 2, 2)}
"""
        )

        # 'satellite' is wrapped in a list so the scalar entries broadcast into
        # a one-row DataFrame.
        d = {
            'country': country_code,
            'satellite': [satellite],
            'bands': bands,
            'num_features': num_features,
            # Number of point rows in the earliest year that is present.
            'points': int((features.year == features.year.min()).sum()),
            'month_range': str(month_range),
            'limit_months': limit_months,
            'crop_mask': crop_mask,
            'weighted_avg': weighted_avg,
            'hot_encode': hot_encode,
            'total_n': len(x_all),
            'train_n': len(x_train),
            'test_n': len(x_test),
            'val_R2': ridge_cv_random.best_score_,
            'train_R2': r2_train,
            'train_r': pearson_train,
            'train_r2': pearson_train ** 2,
            'test_R2': r2_test,
            'test_r': pearson_test,
            'test_r2': pearson_test ** 2,
            'demean_R2': r_squared,
            'demean_r': pearson_r,
            'demean_r2': pearson_r ** 2,
        }
        df = pd.DataFrame(data=d)
        result_frames.append(df)

# Single concat with a fresh 0..n-1 index (previously every row had index 0).
results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

os.makedirs(f"{data_dir}/results", exist_ok=True)
results.to_csv(f"{data_dir}/results/results_test.csv")
results

In [None]:
# Single-configuration walkthrough: load one named feature file with all four
# options switched on and prepare the point-level `features` table that the
# following cells aggregate and model.
data_dir = here("data")
directory = here("data", "random_features", "full_files")
results = pd.DataFrame()

file = "landsat-c2-l2_bands-r-g-b-nir-swir16-swir22_ZMB_20k-points_1024-features_yr-2009-2021.feather"

# Split the file name on "_" and strip the fixed prefixes/suffixes from each field.
f = file.split(sep="_")
satellite, country_code = f[0], f[2]
bands = f[1].replace("bands-", "")
points = f[3].replace("k-points", "")
num_features = f[4].replace("-features", "")
yrs = f[5].replace("yr-", "").replace(".feather", "").split(sep="-")
years = range(int(yrs[0]), int(yrs[1]) + 1)

# Options for this run (all enabled).
limit_months, crop_mask, weighted_avg, hot_encode = True, True, True, True

fn = f"{directory}/{file}"

print(f"Opening: {file}")

features = pd.read_feather(fn)

country_shp = (
    geopandas
    .read_file(f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp')
    .set_index('district')
)

crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
crop_df_full = crop_df_full[crop_df_full.year <= max(years)]
crop_df = crop_df_full[['district', 'year', 'yield_mt']].set_index('district')

# Months 4-9 when limiting, otherwise the full calendar year.
month_range = list(range(4, 10)) if limit_months else list(range(1, 13))
if limit_months:
    # Keep columns whose name contains "_<month>" plus the bookkeeping columns.
    filter_regex = fr'_[{min(month_range)}-{max(month_range)}]|year|crop_perc|district|lon|lat'
    features = features.filter(regex=filter_regex, axis=1)

if crop_mask:
    # Discard points whose crop_perc is not positive.
    features = features[features.crop_perc > 0]

print(
f"""
Setting parameters:
Feature info:
    satellite: {satellite}
    bands: {bands}
    month_range: {month_range}
User options:
    limit_months: {limit_months}
    crop_mask: {crop_mask}
    weighted_avg: {weighted_avg}
    hot_encode: {hot_encode}
"""
)

# Build point geometry from lon/lat, then drop the coordinate and geometry
# columns again so only the grouping keys, features and crop_perc remain.
features = geopandas.GeoDataFrame(
    features,
    geometry=geopandas.points_from_xy(x=features.lon, y=features.lat),
    crs='EPSG:4326',
)
features = features.drop(columns=['geometry', 'lon', 'lat'])
features = features[features.year <= max(years)]

# Feature columns are selected by position (everything except the first two
# and the last column). The same slice is displayed as the cell output so it
# can be checked by eye.
var_cols = features.columns[2:-1].tolist()
features.columns[2:-1]

In [None]:
# Display the filtered, year-limited point-level feature table prepared above.
features

In [None]:
# Collapse the point-level features to one row per district and year, then
# attach the reported yield for that district-year.
print('    Summarizing features\n')
if weighted_avg:
    # crop_perc-weighted mean of every column in `var_cols` per (year, district).
    # Computed with NumPy on the whole group at once instead of a Python-level
    # sum per column. The output keeps integer labels 0..n-1 as before; values
    # may differ from the element-by-element sum in the last floating-point digit.
    features_summary = (
        features
        .groupby(['year', 'district'], as_index=False)
        .apply(lambda x: pd.Series(
            (x[var_cols].to_numpy() * x.crop_perc.to_numpy()[:, None]).sum(axis=0)
            / x.crop_perc.to_numpy().sum()
        ))
    )
else:
    features_summary = features.groupby(['district',"year"], as_index = False).mean()

# Yield records from the first feature year onward, excluding two districts.
crop_data_filtered = crop_df[crop_df.year >= features_summary.year.min()]
crop_data_filtered = (
    crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
    .reset_index()
)

# Left join: every feature row is kept, yield is attached where available.
features_summary = (
    features_summary
    .set_index(["district", "year"])
    .join(other = crop_data_filtered.set_index(["district", "year"]))
    .reset_index())

# Rows without a yield value would put NaN into the regression target and make
# the model fit raise, so they are dropped (and reported) here.
missing_yield = features_summary.yield_mt.isna()
if missing_yield.any():
    print(f"    Dropping {int(missing_yield.sum())} district-year rows without a yield value\n")
    features_summary = features_summary[~missing_yield].reset_index(drop=True)

In [None]:
# Display the district-year summary with the joined yield column.
features_summary

In [None]:
# Convert every column label to a string (the weighted-average summary
# produces integer labels alongside the named columns).
features_summary.columns = features_summary.columns.map(str)

In [None]:
# Build the design matrix and target, then hold out 20% of rows for testing.

# Columns that are never predictors. crop_perc is only present as a column
# in the unweighted summary, so it is only dropped in that case.
drop_cols = ['district', 'year', 'yield_mt']
if not weighted_avg:
    drop_cols.append("crop_perc")

if hot_encode:
    # "district" is replaced by indicator columns, so it is no longer
    # available to drop by name afterwards.
    drop_cols = [col for col in drop_cols if col != "district"]
    x_all = (
        pd.get_dummies(features_summary, columns=["district"], drop_first=False)
        .drop(columns=drop_cols)
    )
else:
    x_all = features_summary.drop(columns=drop_cols)

# Target: log10 of yield plus one.
y_all = np.log10(features_summary["yield_mt"].to_numpy() + 1)

x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Ridge regression with the penalty chosen by 5-fold cross-validation from
# 17 log-spaced candidates between 1e-8 and 1e8.
print('    Modeling')
ridge_cv_random = RidgeCV(
    alphas=np.logspace(start=-8, stop=8, num=17, base=10),
    cv=5,
)
ridge_cv_random.fit(x_train, y_train)

In [None]:
# Score the fitted model on the train and test splits, compute scores after
# removing each district's mean, print a report and append one row to `results`.

# Predictions are floored at 0 before scoring.
y_pred_train = np.clip(ridge_cv_random.predict(x_train), 0, None)
y_pred_test = np.clip(ridge_cv_random.predict(x_test), 0, None)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

pearson_train = pearsonr(y_pred_train, y_train)[0]
pearson_test = pearsonr(y_pred_test, y_test)[0]

# Observed and predicted log yield for every row, and their difference.
features_summary["log_yield"] = np.log10(features_summary["yield_mt"].to_numpy() + 1)
features_summary["prediction"] = np.clip(ridge_cv_random.predict(x_all), 0, None)
features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]

# Per-district means, broadcast back onto the rows, then subtracted.
district_means = (
    features_summary
    .groupby('district')[['log_yield', 'prediction']]
    .transform('mean')
)
features_summary["district_yield_mean"] = district_means['log_yield']
features_summary["district_prediction_mean"] = district_means['prediction']
features_summary["demean_yield"] = (
    features_summary["log_yield"] - features_summary["district_yield_mean"]
)
features_summary["demean_prediction"] = (
    features_summary["prediction"] - features_summary["district_prediction_mean"]
)

r_squared = r2_score(
    features_summary["demean_yield"],
    features_summary["demean_prediction"],
)
pearson_r = pearsonr(
    features_summary["demean_yield"],
    features_summary["demean_prediction"],
)[0]

print(
f"""
Model results:
Data split:
    Number of total points: {len(x_all)}
    Number of training points: {len(x_train)}
    Number of testing points: {len(x_test)}
Model regularization parameter:
    Best \u03BB = {ridge_cv_random.alpha_}
Validation score:
    R\u00B2 = {ridge_cv_random.best_score_:0.2f}
Training scores:
    R\u00B2 = {r2_train:0.2f}  
    Pearsons r = {pearson_train:0.2f}
    Pearson r\u00B2 = {pearson_train ** 2:0.2f}
Testing scores:    
    R\u00B2 = {r2_test:0.2f}
    Pearsons r = {pearson_test:0.2f}
    Pearson r\u00B2 = {pearson_test ** 2:0.2f}
Demeaned by location scores:
    R\u00B2 = {r_squared:.2f}
    Pearson's r: {pearson_r:.2f} 
    Pearson r\u00B2 = {round(pearson_r ** 2, 2)}
"""
)

# One results row. 'satellite' is a one-element list so the scalar entries
# broadcast into a single-row DataFrame.
d = {
    'country': country_code,
    'satellite': [satellite],
    'bands': bands,
    'num_features': num_features,
    # Number of point rows in the earliest year present in `features`.
    'points': len(features.loc[features.year == features.year.min()]),
    'month_range': str(month_range),
    'limit_months': limit_months,
    'crop_mask': crop_mask,
    'weighted_avg': weighted_avg,
    'hot_encode': hot_encode,
    'total_n': len(x_all),
    'train_n': len(x_train),
    'test_n': len(x_test),
    'val_R2': ridge_cv_random.best_score_,
    'train_R2': r2_train,
    'train_r': pearson_train,
    'train_r2': pearson_train ** 2,
    'test_R2': r2_test,
    'test_r': pearson_test,
    'test_r2': pearson_test ** 2,
    'demean_R2': r_squared,
    'demean_r': pearson_r,
    'demean_r2': pearson_r ** 2,
}
df = pd.DataFrame(d)
results = pd.concat((results, df))

results