In [1]:
## import warnings
import time
import math
import os
import glob
import numpy as np
import pandas as pd
import geopandas
import pyarrow
import itertools
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from pyhere import here

In [2]:
# Batch experiment: for every random-feature file in `directory`, fit a ridge
# regression of log10(yield + 1) on district-level feature summaries for all
# 2**3 combinations of the boolean options in `names`, and collect one row of
# scores per (file, option combination) into `results`.
directory = here("data", "random_features", "full_files")
data_dir = here("data")
names = 'crop_mask weighted_avg hot_encode'.split()

# One single-row frame per model run; concatenated once after the loop instead
# of re-copying a growing DataFrame on every iteration.
result_frames = []

for file in os.listdir(directory):
    # The directory placeholder is not a feature file.
    if file.endswith('.gitkeep'):
        continue

    # The run configuration is encoded in the file name, e.g.
    # landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9.feather
    f = file.split(sep="_")
    satellite = f[0]
    bands = f[1].replace("bands-", "")
    country_code = f[2]
    points = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    yrs = f[5].replace("yr-", "").split(sep="-")
    years = range(int(yrs[0]), int(yrs[1]) + 1)
    mns = f[6].replace(".feather", "").replace("mn-", "").split(sep="-")
    month_range = range(int(mns[0]), int(mns[1]) + 1)

    # os.path.join keeps the path valid on every OS (the separator used to be
    # a hard-coded Windows backslash).
    fn = os.path.join(directory, file)

    # District boundaries and crop yields depend only on the file (country and
    # last year), not on the option combination, so they are loaded once here.
    country_shp = geopandas.read_file(
        os.path.join(data_dir, 'geo_boundaries', f'gadm36_{country_code}_2.shp')
    )
    country_shp = country_shp.rename(columns = {'NAME_2': 'district'})[['district', 'geometry']]
    country_shp.district = country_shp.district.replace("MPongwe", 'Mpongwe', regex=True)
    country_shp = country_shp.set_index('district')

    crop_df_full = pd.read_csv(
        os.path.join(data_dir, 'crop_yield', 'cfs_maize_districts_zambia_2009_2022.csv')
    )
    crop_df_full = crop_df_full[crop_df_full.year <= max(years)]
    crop_df = crop_df_full[['district', 'year', 'yield_mt']]
    crop_df = crop_df.set_index('district')

    for p in itertools.product([False, True], repeat=len(names)):
        params = dict(zip(names, p))
        crop_mask = params["crop_mask"]
        weighted_avg = params["weighted_avg"]
        hot_encode = params["hot_encode"]

        print(f"Opening: {file}")

        # Re-read per combination: the frame is filtered and reshaped below,
        # so every run starts from the untouched file contents.
        features = pd.read_feather(fn)

        if crop_mask:
            # Keep only points with some cropland.
            features = features[features.crop_perc > 0]

        # NOTE(review): the geometry built here is dropped again on the next
        # statement, so this round trip looks redundant — confirm before removing.
        features = geopandas.GeoDataFrame(
            features,
            geometry = geopandas.points_from_xy(x = features.lon, y = features.lat),
            crs='EPSG:4326'
        )

        features = features.drop(['geometry', 'lon', 'lat'], axis = 1)
        features = features[features.year <= max(years)]
        # Assumes the first two and the last remaining columns are not
        # features (presumably year/district and crop_perc) — TODO confirm.
        var_cols = features.columns[2:-1].values.tolist()

        print('    Summarizing features')
        if weighted_avg:
            # crop_perc-weighted mean of every feature per (year, district);
            # the resulting feature columns are named 0..len(var_cols)-1.
            # NOTE(review): a group whose crop_perc sums to 0 divides by zero
            # here — confirm that cannot happen when crop_mask is False.
            features_summary = (
                features
                .groupby(['year', 'district'], as_index=False)
                .apply(lambda x: pd.Series([sum(x[v] * x.crop_perc) / sum(x.crop_perc) for v in var_cols]))
            )
        else:
            # Plain mean per (district, year); this keeps a crop_perc column,
            # which is excluded from the predictors via drop_cols below.
            features_summary = features.groupby(['district', "year"], as_index = False).mean()

        crop_data_filtered = crop_df[crop_df.year >= min(features_summary.year)]
        crop_data_filtered = crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
        crop_data_filtered.reset_index(inplace = True)

        # Attach the observed yield to each (district, year) summary row.
        features_summary = (
            features_summary
            .set_index(["district", "year"])
            .join(other = crop_data_filtered.set_index(["district", "year"]))
            .reset_index())

        # Columns that must not be used as predictors.
        if weighted_avg:
            drop_cols = ['district', 'year', 'yield_mt']
        else:
            drop_cols = ['district', 'year', 'yield_mt', "crop_perc"]

        if hot_encode:
            # get_dummies consumes the district column, so it is no longer
            # there to drop.
            drop_cols.remove("district")
            x_all = pd.get_dummies(features_summary,
                                   columns=["district"],
                                   drop_first=False)
            x_all = x_all.drop(drop_cols, axis = 1)
        else:
            x_all = features_summary.drop(drop_cols, axis = 1)

        # Target is log10(yield + 1).
        y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size = 0.2, random_state = 0
        )

        print('    Modeling')
        # 5-fold CV over 17 log-spaced penalties from 1e-8 to 1e8.
        ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
        ridge_cv_random.fit(x_train, y_train)

        # Predictions are clipped at 0 before scoring.
        y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
        r2_train = r2_score(y_train, y_pred_train)
        pearson_train = pearsonr(y_pred_train, y_train)[0]

        y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
        r2_test = r2_score(y_test, y_pred_test)
        pearson_test = pearsonr(y_pred_test, y_test)[0]

        # Score again after subtracting each district's own mean from both the
        # log yield and the prediction.
        features_summary["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
        features_summary["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
        features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]
        features_summary["district_yield_mean"] = features_summary.groupby('district')['log_yield'].transform('mean')
        features_summary["district_prediction_mean"] = features_summary.groupby('district')['prediction'].transform('mean')
        features_summary["demean_yield"] = features_summary["log_yield"] - features_summary["district_yield_mean"]
        features_summary["demean_prediction"] = features_summary["prediction"] - features_summary["district_prediction_mean"]

        r_squared = r2_score(features_summary["demean_yield"], features_summary["demean_prediction"])
        pearson_r = pearsonr(features_summary["demean_yield"], features_summary["demean_prediction"])[0]
        print(
        f"""
        Parameters:
            Feature info:
                satellite: {satellite}
                bands: {bands}
                month_range: {month_range}
            User options
                crop_mask: {crop_mask}
                weighted_avg: {weighted_avg}
                hot_encode: {hot_encode}
        Model results:
            Data split:
                Number of total points: {len(x_all)}
                Number of training points: {len(x_train)}
                Number of testing points: {len(x_test)}
            Model regularization parameter:
                Best \u03BB = {ridge_cv_random.alpha_}
            Validation score:
                R\u00B2 = {ridge_cv_random.best_score_:0.2f}
            Training scores:
                R\u00B2 = {r2_train:0.2f}  
                Pearsons r = {pearson_train:0.2f}
                Pearson r\u00B2 = {pearson_train ** 2:0.2f}
            Testing scores:    
                R\u00B2 = {r2_test:0.2f}
                Pearsons r = {pearson_test:0.2f}
                Pearson r\u00B2 = {pearson_test ** 2:0.2f}
            Demeaned by location scores:
                R\u00B2 = {r_squared:.2f}
                Pearson's r: {pearson_r:.2f} 
                Pearson r\u00B2 = {round(pearson_r ** 2, 2)}
            """
        )
        d = {
            'country': country_code,
            # Wrapped in a list so the all-scalar dict builds a one-row frame.
            'satellite': [satellite],
            'bands': bands,
            'num_features': num_features,
            # Number of feature rows in the earliest year after filtering.
            'points': len(features[features.year == min(features.year)]),
            'month_range': str(month_range),
            'crop_mask': crop_mask,
            'weighted_avg': weighted_avg,
            'hot_encode': hot_encode,
            'total_n': len(x_all),
            'train_n': len(x_train),
            'test_n': len(x_test),
            'val_R2': ridge_cv_random.best_score_,
            'train_R2': r2_train,
            'train_r': pearson_train,
            'train_r2': pearson_train ** 2,
            'test_R2': r2_test,
            'test_r': pearson_test,
            'test_r2': pearson_test ** 2,
            'demean_R2': r_squared,
            'demean_r': pearson_r,
            'demean_r2': pearson_r ** 2,
        }
        result_frames.append(pd.DataFrame(data=d))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# feature file was found.
results = pd.concat(result_frames) if result_frames else pd.DataFrame()

# Make sure the output folder exists so the run cannot fail at the last step.
results_dir = os.path.join(data_dir, "results")
os.makedirs(results_dir, exist_ok=True)
results.to_csv(os.path.join(results_dir, "results.csv"))
results

Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9.feather
    Summarizing features
    Modeling

        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: False
            hot_encode: False
        Model results:
            Data split:
                Number of total points: 648
                Number of training points: 518
                Number of testing points: 130
            Model regularization parameter:
                Best λ = 10.0
            Validation score:
                R² = 0.61
            Training scores:
                R² = 0.73  
                Pearsons r = 0.86
                Pearson r² = 0.74
            Testing scores:    
                R² = 0.61
                Pearsons r = 0.79
                Pearson r² = 0.63
            Demeaned by location scores:
                R² = 0.29
      




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 648
                Number of training points: 518
                Number of testing points: 130
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.71
            Training scores:
                R² = 0.90  
                Pearsons r = 0.95
                Pearson r² = 0.90
            Testing scores:    
                R² = 0.79
                Pearsons r = 0.89
                Pearson r² = 0.80
            Demeaned by location scores:
                R² = 0.46
                Pearson's r: 0.68 
                Pearson r² = 0.46
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-fe




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(4, 10)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 648
                Number of training points: 518
                Number of testing points: 130
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.71
            Training scores:
                R² = 0.90  
                Pearsons r = 0.95
                Pearson r² = 0.90
            Testing scores:    
                R² = 0.79
                Pearsons r = 0.89
                Pearson r² = 0.80
            Demeaned by location scores:
                R² = 0.46
                Pearson's r: 0.68 
                Pearson r² = 0.46
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-fea




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(1, 13)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 384
                Number of training points: 307
                Number of testing points: 77
            Model regularization parameter:
                Best λ = 10.0
            Validation score:
                R² = 0.58
            Training scores:
                R² = 0.89  
                Pearsons r = 0.94
                Pearson r² = 0.89
            Testing scores:    
                R² = 0.62
                Pearsons r = 0.79
                Pearson r² = 0.63
            Demeaned by location scores:
                R² = 0.43
                Pearson's r: 0.67 
                Pearson r² = 0.45
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-fe




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(1, 13)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 384
                Number of training points: 307
                Number of testing points: 77
            Model regularization parameter:
                Best λ = 10.0
            Validation score:
                R² = 0.58
            Training scores:
                R² = 0.89  
                Pearsons r = 0.94
                Pearson r² = 0.89
            Testing scores:    
                R² = 0.62
                Pearsons r = 0.79
                Pearson r² = 0.63
            Demeaned by location scores:
                R² = 0.43
                Pearson's r: 0.67 
                Pearson r² = 0.45
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-fea




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 648
                Number of training points: 518
                Number of testing points: 130
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.74
            Training scores:
                R² = 0.89  
                Pearsons r = 0.95
                Pearson r² = 0.90
            Testing scores:    
                R² = 0.78
                Pearsons r = 0.89
                Pearson r² = 0.80
            Demeaned by location scores:
                R² = 0.46
                Pearson's r: 0.69 
                Pearson r² = 0.47
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-fe




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(4, 10)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 648
                Number of training points: 518
                Number of testing points: 130
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.74
            Training scores:
                R² = 0.89  
                Pearsons r = 0.95
                Pearson r² = 0.90
            Testing scores:    
                R² = 0.78
                Pearsons r = 0.89
                Pearson r² = 0.80
            Demeaned by location scores:
                R² = 0.46
                Pearson's r: 0.69 
                Pearson r² = 0.47
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-fea




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(1, 13)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 376
                Number of training points: 300
                Number of testing points: 76
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.46
            Training scores:
                R² = 0.95  
                Pearsons r = 0.98
                Pearson r² = 0.95
            Testing scores:    
                R² = 0.72
                Pearsons r = 0.85
                Pearson r² = 0.72
            Demeaned by location scores:
                R² = 0.57
                Pearson's r: 0.77 
                Pearson r² = 0.59
            
Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-fea




        Parameters:
            satellite: landsat-8-c2-l2
            bands: 1-2-3-4-5-6-7
            month_range: range(1, 13)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 376
                Number of training points: 300
                Number of testing points: 76
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.46
            Training scores:
                R² = 0.95  
                Pearsons r = 0.98
                Pearson r² = 0.95
            Testing scores:    
                R² = 0.72
                Pearsons r = 0.85
                Pearson r² = 0.72
            Demeaned by location scores:
                R² = 0.57
                Pearson's r: 0.77 
                Pearson r² = 0.59
            
Opening: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4-8
            month_range: range(1, 13)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 420
                Number of training points: 336
                Number of testing points: 84
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.71
            Training scores:
                R² = 0.94  
                Pearsons r = 0.97
                Pearson r² = 0.94
            Testing scores:    
                R² = 0.70
                Pearsons r = 0.86
                Pearson r² = 0.73
            Demeaned by location scores:
                R² = 0.59
                Pearson's r: 0.77 
                Pearson r² = 0.6
            
Opening: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-2




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4-8
            month_range: range(1, 13)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 420
                Number of training points: 336
                Number of testing points: 84
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.71
            Training scores:
                R² = 0.94  
                Pearsons r = 0.97
                Pearson r² = 0.94
            Testing scores:    
                R² = 0.70
                Pearsons r = 0.86
                Pearson r² = 0.73
            Demeaned by location scores:
                R² = 0.59
                Pearson's r: 0.77 
                Pearson r² = 0.6
            
Opening: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-20




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4-8
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 0.1
            Validation score:
                R² = 0.70
            Training scores:
                R² = 0.95  
                Pearsons r = 0.98
                Pearson r² = 0.95
            Testing scores:    
                R² = 0.65
                Pearsons r = 0.83
                Pearson r² = 0.69
            Demeaned by location scores:
                R² = 0.61
                Pearson's r: 0.78 
                Pearson r² = 0.61
            
Opening: sentinel-2-l2a_bands-2-3-4-8_ZMB_15k-points_1000-features_yr-2016-




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4-8
            month_range: range(4, 10)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 0.1
            Validation score:
                R² = 0.70
            Training scores:
                R² = 0.95  
                Pearsons r = 0.98
                Pearson r² = 0.95
            Testing scores:    
                R² = 0.65
                Pearsons r = 0.83
                Pearson r² = 0.69
            Demeaned by location scores:
                R² = 0.61
                Pearson's r: 0.78 
                Pearson r² = 0.61
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000-features_yr-2016-202




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(1, 13)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 420
                Number of training points: 336
                Number of testing points: 84
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.58
            Training scores:
                R² = 0.96  
                Pearsons r = 0.98
                Pearson r² = 0.96
            Testing scores:    
                R² = 0.67
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.62
                Pearson's r: 0.80 
                Pearson r² = 0.64
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000-features_yr-2016-2021




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(1, 13)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 420
                Number of training points: 336
                Number of testing points: 84
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.58
            Training scores:
                R² = 0.96  
                Pearsons r = 0.98
                Pearson r² = 0.96
            Testing scores:    
                R² = 0.67
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.62
                Pearson's r: 0.80 
                Pearson r² = 0.64
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000-features_yr-2016-2021_




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.73
            Training scores:
                R² = 0.93  
                Pearsons r = 0.97
                Pearson r² = 0.93
            Testing scores:    
                R² = 0.69
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.58
                Pearson's r: 0.76 
                Pearson r² = 0.58
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_15k-points_1000-features_yr-2016-2021




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(4, 10)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.73
            Training scores:
                R² = 0.93  
                Pearsons r = 0.97
                Pearson r² = 0.93
            Testing scores:    
                R² = 0.69
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.58
                Pearson's r: 0.76 
                Pearson r² = 0.58
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000-features_yr-2016-2021_




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(1, 13)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 402
                Number of training points: 321
                Number of testing points: 81
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.73
            Training scores:
                R² = 0.93  
                Pearsons r = 0.96
                Pearson r² = 0.93
            Testing scores:    
                R² = 0.73
                Pearsons r = 0.86
                Pearson r² = 0.74
            Demeaned by location scores:
                R² = 0.58
                Pearson's r: 0.76 
                Pearson r² = 0.59
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000-features_yr-2016-2021




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(1, 13)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 402
                Number of training points: 321
                Number of testing points: 81
            Model regularization parameter:
                Best λ = 1.0
            Validation score:
                R² = 0.73
            Training scores:
                R² = 0.93  
                Pearsons r = 0.96
                Pearson r² = 0.93
            Testing scores:    
                R² = 0.73
                Pearsons r = 0.86
                Pearson r² = 0.74
            Demeaned by location scores:
                R² = 0.58
                Pearson's r: 0.76 
                Pearson r² = 0.59
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000-features_yr-2016-2021_




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(4, 10)
            crop_mask: False
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 0.1
            Validation score:
                R² = 0.64
            Training scores:
                R² = 0.92  
                Pearsons r = 0.96
                Pearson r² = 0.92
            Testing scores:    
                R² = 0.69
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.50
                Pearson's r: 0.71 
                Pearson r² = 0.5
            
Opening: sentinel-2-l2a_bands-2-3-4_ZMB_20k-points_1000-features_yr-2016-2021_




        Parameters:
            satellite: sentinel-2-l2a
            bands: 2-3-4
            month_range: range(4, 10)
            crop_mask: True
            weighted_avg: True
            hot_encode: True
        Model results:
            Data split:
                Number of total points: 432
                Number of training points: 345
                Number of testing points: 87
            Model regularization parameter:
                Best λ = 0.1
            Validation score:
                R² = 0.64
            Training scores:
                R² = 0.92  
                Pearsons r = 0.96
                Pearson r² = 0.92
            Testing scores:    
                R² = 0.69
                Pearsons r = 0.84
                Pearson r² = 0.70
            Demeaned by location scores:
                R² = 0.50
                Pearson's r: 0.71 
                Pearson r² = 0.5
            


Unnamed: 0,country,satellite,bands,num_features,points,month_range,crop_mask,weighted_avg,hot_encode,total_n,...,val_R2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_R2,demean_r,demean_r2
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"range(4, 10)",False,False,False,648,...,0.611833,0.734995,0.858702,0.737370,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"range(4, 10)",False,False,True,648,...,0.748089,0.919108,0.959472,0.920586,0.683508,0.854250,0.729743,0.445727,0.671879,0.451421
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"range(4, 10)",False,True,False,648,...,0.507164,0.902564,0.951336,0.905040,0.613655,0.798784,0.638056,0.443414,0.702400,0.493366
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"range(4, 10)",False,True,True,648,...,0.713399,0.899174,0.949608,0.901755,0.788569,0.892588,0.796714,0.459292,0.680976,0.463729
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,4749,"range(4, 10)",True,False,False,648,...,0.518270,0.709148,0.843807,0.712010,0.683971,0.828155,0.685841,0.289258,0.570173,0.325098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ZMB,sentinel-2-l2a,2-3-4,1000,19598,"range(4, 10)",False,True,True,432,...,0.644630,0.918790,0.958824,0.919343,0.687865,0.837494,0.701396,0.502713,0.710119,0.504269
0,ZMB,sentinel-2-l2a,2-3-4,1000,19598,"range(4, 10)",True,False,False,432,...,0.545423,0.856178,0.927479,0.860218,0.534839,0.745309,0.555485,0.421782,0.672204,0.451859
0,ZMB,sentinel-2-l2a,2-3-4,1000,19598,"range(4, 10)",True,False,True,432,...,0.722936,0.913717,0.956184,0.914289,0.763860,0.875399,0.766324,0.526402,0.726287,0.527493
0,ZMB,sentinel-2-l2a,2-3-4,1000,19598,"range(4, 10)",True,True,False,432,...,0.396092,0.858301,0.928500,0.862112,0.548313,0.752095,0.565648,0.447578,0.686649,0.471486


In [2]:
# directory = data_dir = here("data", "random_features", "full_files")  

# for file in os.listdir(directory):
#     # skip the .gitkeep placeholder; every other file is parsed as a feature file
#     if file.endswith('.gitkeep'):
#         pass
#     else:
#         f = file.split(sep="_")
#         satellite = f[0]
#         bands = f[1].replace("bands-", "")
#         country = f[2]
#         points = f[3].replace("k-points", "")
#         features=f[4].replace("-features", "")
#         yrs=f[5].replace("yr-", "").split(sep="-")
#         years=range(int(yrs[0]), int(yrs[1])+1)
#         mns=f[6].replace(".feather", "").replace("mn-", "").split(sep="-")
#         months=range(int(mns[0]), int(mns[1])+1)
#         print(f'{satellite}\n{bands}\n{country}\n{points}\n{features}\n{years}\n{months}\n')
        
#         https://stackoverflow.com/questions/60237583/python-iterate-over-all-possible-combinations-on-boolean-variables

In [None]:
# Single-configuration run: pick one satellite / band / sampling setup, load the
# matching random-feature file, aggregate the features to district-year level,
# join them with the maize yield table, fit a cross-validated ridge regression
# and print train / test / demeaned-by-district scores.

# --- Configuration (alternative settings are left commented out) -------------
satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

# bands = "2-3-4"
# bands = "2-3-4-8"
bands = "1-2-3-4-5-6-7"
# bands = "2-3-4-5-6-7-8-11-12"

points = 15
# points = 20

# month_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
month_range = [         4, 5, 6, 7, 8, 9            ]

country_code = "ZMB"
num_features = 1000

# crop_mask = True
crop_mask = False

#weighted_avg = True
weighted_avg = False

# hot_encode = True
hot_encode = False

include_2013 = True
# include_2013 = False

# --- Year range encoded in the feature file name ------------------------------
if satellite == "landsat-8-c2-l2":
    year_start = 2013 # Landsat
else:
    year_start = 2015 # Sentinel
year_end = 2021
year_end_crops = 2021

# The file name starts at year_start only for Landsat with months 4-9 when
# include_2013 is set; every other combination starts one year later.
if include_2013 and satellite == "landsat-8-c2-l2" and month_range == [4, 5, 6, 7, 8, 9]:
    yr = year_start
else:
    yr = year_start + 1

data_dir = here("data")

feature_file_name = (f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features')

# --- District boundaries -------------------------------------------------------
country_shp = geopandas.read_file(
    os.path.join(data_dir, "geo_boundaries", f"gadm36_{country_code}_2.shp")
)
country_shp = country_shp.rename(columns = {'NAME_2': 'district'})[['district', 'geometry']]
# Normalise the spelling of one district name in the shapefile.
country_shp.district = country_shp.district.replace("MPongwe", 'Mpongwe', regex=True)
# Not read again in this cell -- presumably kept for inspection in other cells (verify).
country_districts = country_shp.district.sort_values().unique().tolist()
country_shp = country_shp.set_index('district')

# --- Crop yields ---------------------------------------------------------------
crop_df_full = pd.read_csv(
    os.path.join(data_dir, "crop_yield", "cfs_maize_districts_zambia_2009_2022.csv")
)
crop_df_full = crop_df_full[crop_df_full.year <= year_end_crops]
# `crop_districts`, `ln` and `country_crop` are not read again in this cell --
# presumably kept for other cells (verify before removing).
crop_districts = crop_df_full.district.sort_values().unique().tolist()
crop_df = crop_df_full[['district', 'year', 'yield_mt']]
ln = len(crop_df[crop_df.year == 2016].district)
crop_df = crop_df.set_index('district')

country_crop = geopandas.GeoDataFrame(crop_df.join(country_shp), crs = country_shp.crs)

# --- Random features -----------------------------------------------------------
# Build the path with os.path.join so the separator is correct on every OS
# (hard-coded backslashes only resolve on Windows).
feature_file = f"{feature_file_name}_yr-{yr}-{year_end}_mn-{min(month_range)}-{max(month_range)}.feather"
fn = os.path.join(data_dir, "random_features", "full_files", feature_file)
print(f"Opening: {feature_file}")
features = pd.read_feather(fn)

if crop_mask:
    # Keep only sample points that have a positive crop percentage.
    features = features[features.crop_perc > 0]

# The coordinates are not used below, so drop them directly instead of first
# building point geometries that would be discarded immediately. A `geometry`
# column is dropped as well if the file happens to contain one.
features = (
    features
    .drop(columns=["lon", "lat"])
    .drop(columns=["geometry"], errors="ignore")
)

features = features[features.year <= year_end_crops]
# NOTE(review): relies on column order -- everything between the first two and
# the last column is treated as a feature; confirm against the feather schema.
var_cols = features.columns[2:-1].values.tolist()

# --- Aggregate points to district-year level -----------------------------------
if weighted_avg:
    # crop_perc-weighted mean of every feature column. The Series is indexed by
    # var_cols so the aggregated columns keep their original names instead of
    # becoming integers 0..N-1 (which would mix with string column names later).
    features_summary = (
        features
        .groupby(['year', 'district'], as_index=False)
        .apply(
            lambda x: pd.Series(
                [sum(x[v] * x.crop_perc) / sum(x.crop_perc) for v in var_cols],
                index=var_cols,
            )
        )
    )
else:
    features_summary = features.groupby(['district', "year"], as_index = False).mean()

# --- Join features with yields -------------------------------------------------
crop_data_filtered = crop_df[crop_df.year >= min(features_summary.year)]
crop_data_filtered = crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]
crop_data_filtered = crop_data_filtered.reset_index()

# DataFrame.join defaults to a left join, so every aggregated district-year row
# is kept. NOTE(review): rows without a matching yield record would carry NaN in
# yield_mt -- confirm that cannot happen for the files used here.
features_summary = (
    features_summary
    .set_index(["district", "year"])
    .join(other = crop_data_filtered.set_index(["district", "year"]))
    .reset_index())

# Columns that must not be used as predictors. The unweighted mean still
# carries crop_perc; the weighted aggregation does not produce that column.
drop_cols = ['district', 'year', 'yield_mt']
if not weighted_avg:
    drop_cols.append("crop_perc")

if hot_encode:
    # get_dummies replaces `district` with one indicator column per district, so
    # `district` no longer exists afterwards and must not be dropped again
    # (dropping it would raise a KeyError).
    x_all = pd.get_dummies(features_summary,
                           columns=["district"],
                           drop_first=False)
    x_all = x_all.drop([c for c in drop_cols if c != 'district'], axis = 1)
else:
    x_all = features_summary.drop(drop_cols, axis = 1)

# Target: log10(yield + 1).
y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

# --- Fit and evaluate ----------------------------------------------------------
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size = 0.2, random_state = 0
)

# Ridge regression with the penalty chosen by 5-fold CV from 1e-8 ... 1e8.
ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
ridge_cv_random.fit(x_train, y_train)

# Predictions are clipped at zero before scoring.
y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
r2_train = r2_score(y_train, y_pred_train)
pearson_train = pearsonr(y_pred_train, y_train)[0]

y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
r2_test = r2_score(y_test, y_pred_test)
pearson_test = pearsonr(y_pred_test, y_test)[0]

print(
f"""
Data split:
    Number of total points: {len(x_all)}
    Number of training points: {len(x_train)}
    Number of testing points: {len(x_test)}
Model regularization parameter:
    Best \u03BB = {ridge_cv_random.alpha_}
Validation score:
    R\u00B2 = {ridge_cv_random.best_score_:0.2f}
Training scores:
    R\u00B2 = {r2_train:0.2f}  
    Pearsons r = {pearson_train:0.2f}
    Pearson r\u00B2 = {pearson_train ** 2:0.2f}
Testing scores:    
    R\u00B2 = {r2_test:0.2f}
    Pearsons r = {pearson_test:0.2f}
    Pearson r\u00B2 = {pearson_test ** 2:0.2f}"""
)

# --- Scores after removing each district's mean --------------------------------
features_summary["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
features_summary["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
features_summary["residual"] = features_summary["log_yield"] - features_summary["prediction"]
features_summary["district_yield_mean"] = features_summary.groupby('district')['log_yield'].transform('mean')
features_summary["district_prediction_mean"] = features_summary.groupby('district')['prediction'].transform('mean')
features_summary["demean_yield"] = features_summary["log_yield"] - features_summary["district_yield_mean"]
features_summary["demean_prediction"] = features_summary["prediction"] - features_summary["district_prediction_mean"]

r_squared = r2_score(features_summary["demean_yield"], features_summary["demean_prediction"])
pearson_r = pearsonr(features_summary["demean_yield"], features_summary["demean_prediction"])[0]
# All scores are printed with two fixed decimals, matching the report above.
print(
f"""
Demeaned by location scores:
    R\u00B2 = {r_squared:.2f}
    Pearson's r: {pearson_r:.2f} 
    Pearson r\u00B2 = {pearson_r ** 2:.2f}
"""
)

Opening: landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_15k-points_1000-features_yr-2013-2021_mn-4-9.feather

Data split:
    Number of total points: 648
    Number of training points: 518
    Number of testing points: 130
Model regularization parameter:
    Best λ = 10.0
Validation score:
    R² = 0.61
Training scores:
    R² = 0.73  
    Pearsons r = 0.86
    Pearson r² = 0.74
Testing scores:    
    R² = 0.61
    Pearsons r = 0.79
    Pearson r² = 0.63

Demeaned by location scores:
    R² = 0.29
    Pearson's r: 0.56 
    Pearson r² = 0.31



In [6]:
# Summarise the most recent fit (settings, sample sizes and scores) as a
# one-row table. `results` is started from an empty frame each time this
# cell runs.
results = pd.DataFrame()

d = dict(
    country=country_code,
    # The one-element list gives pandas a length to build the row from; all
    # other entries are scalars and are broadcast against it.
    satellite=[satellite],
    bands=bands,
    num_features=num_features,
    # Number of feature rows belonging to the earliest year present.
    points=int((features.year == min(features.year)).sum()),
    month_range=str(month_range),
    crop_mask=crop_mask,
    weighted_avg=weighted_avg,
    hot_encode=hot_encode,
    total_n=len(x_all),
    train_n=len(x_train),
    test_n=len(x_test),
    val_R2=ridge_cv_random.best_score_,
    train_R2=r2_train,
    train_r=pearson_train,
    train_r2=pearson_train ** 2,
    test_R2=r2_test,
    test_r=pearson_test,
    test_r2=pearson_test ** 2,
    demean_R2=r_squared,
    demean_r=pearson_r,
    demean_r2=pearson_r ** 2,
)
df = pd.DataFrame(d)
results = pd.concat([results, df])
results

Unnamed: 0,country,satellite,bands,num_features,points,month_range,crop_mask,weighted_avg,hot_encode,total_n,...,val_R2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_R2,demean_r,demean_r2
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"[4, 5, 6, 7, 8, 9]",False,False,False,648,...,0.611833,0.734995,0.858702,0.73737,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"[4, 5, 6, 7, 8, 9]",False,False,False,648,...,0.611833,0.734995,0.858702,0.73737,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"[4, 5, 6, 7, 8, 9]",False,False,False,648,...,0.611833,0.734995,0.858702,0.73737,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569


In [19]:
# Display the current contents of the `results` table.
results

Unnamed: 0,country,satellite,bands,num_features,points,month_range,crop_mask,weighted_avg,hot_encode,total_n,...,val_R2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_R2,demean_r,demean_r2
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"[4, 5, 6, 7, 8, 9]",False,False,False,648,...,0.611833,0.734995,0.858702,0.73737,0.612784,0.793637,0.629859,0.285799,0.560865,0.314569
0,ZMB,landsat-8-c2-l2,1-2-3-4-5-6-7,1000,14955,"[4, 5, 6, 7, 8, 9]",False,False,True,648,...,0.748089,0.919108,0.959472,0.920586,0.683508,0.85425,0.729743,0.445727,0.671879,0.451421


In [8]:
# Walk through every True/False combination of the three modelling switches,
# binding each switch to its own variable and printing the combination.
names = ['crop_mask', 'weighted_avg', 'hot_encode']

for p in itertools.product((True, False), repeat=len(names)):
    crop_mask, weighted_avg, hot_encode = p
    params = {name: flag for name, flag in zip(names, p)}
    print(params)

{'crop_mask': True, 'weighted_avg': True, 'hot_encode': True}
{'crop_mask': True, 'weighted_avg': True, 'hot_encode': False}
{'crop_mask': True, 'weighted_avg': False, 'hot_encode': True}
{'crop_mask': True, 'weighted_avg': False, 'hot_encode': False}
{'crop_mask': False, 'weighted_avg': True, 'hot_encode': True}
{'crop_mask': False, 'weighted_avg': True, 'hot_encode': False}
{'crop_mask': False, 'weighted_avg': False, 'hot_encode': True}
{'crop_mask': False, 'weighted_avg': False, 'hot_encode': False}


In [46]:
# Tabulate every True/False combination of the three modelling switches,
# one row per combination.
names = 'crop_mask weighted_avg hot_encode'.split()

# Collect all records first and construct the frame once, instead of
# concatenating one-row frames onto an empty frame inside a loop. The result
# has a clean 0..7 index and plain boolean columns.
x = pd.DataFrame(
    [dict(zip(names, p)) for p in itertools.product([True, False], repeat=len(names))]
)
    

In [55]:
# Renumber the rows 0..n-1, then show the crop_mask flag of the first row.
x = x.reset_index(drop=True)
x.loc[0, "crop_mask"]

True