# In [None]:
import warnings
import time
import math
import os
import numpy as np
import pandas as pd
import geopandas
import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from pyhere import here

# ---------------------------------------------------------------------------
# Run configuration. Commented-out lines are alternative settings.
# ---------------------------------------------------------------------------

# Imagery collection; also decides the first feature year below.
satellite = "landsat-8-c2-l2"
# satellite = "sentinel-2-l2a"

# Band combination, as encoded in the feature file names.
# bands = "2-3-4"
# bands = "2-3-4-8"
bands = "1-2-3-4-5-6-7"
# bands = "2-3-4-5-6-7-8-11-12"

# Point-count label, used in file names as "{points}k-points".
points = 15
# points = 20

# When True, only points with crop_perc > 0 are kept later in the script.
crop_mask = False
# crop_mask = True

# When True, district summaries are weighted by crop_perc instead of a plain mean.
weighted_avg = False
# weighted_avg = True

# Calendar months retained from the monthly features.
# month_range = list(range(1, 13))
month_range = list(range(4, 10))

include_2013 = True
# include_2013 = False

country_code = "ZMB"
num_features = 1000

# First year of feature files: 2013 for Landsat, 2015 otherwise (Sentinel).
year_start = 2013 if satellite == "landsat-8-c2-l2" else 2015
year_end = 2021
# Last year of crop data used (the crop table is cut at this year on load).
year_end_crops = 2019

data_dir = "/capstone/cropmosaiks/data"
# data_dir = here("data")

# Base names (without year suffix / extension) of the input feather files.
feature_file_name = f'{satellite}_bands-{bands}_{country_code}_{points}k-points_{num_features}-features'
weight_file_name = f'{country_code}_crop_weights_{points}k-points'

# --- District boundaries ----------------------------------------------------
# Read the level-2 boundary shapefile and keep only the district name and geometry.
boundary_path = f'{data_dir}/geo_boundaries/gadm36_{country_code}_2.shp'
country_shp = geopandas.read_file(boundary_path)
country_shp = country_shp.rename(columns = {'NAME_2': 'district'})
country_shp = country_shp[['district', 'geometry']]
# Rewrite "MPongwe" as "Mpongwe" (presumably to match the crop table spelling — verify).
country_shp.district = country_shp.district.replace("MPongwe", 'Mpongwe', regex=True)
country_districts = country_shp.district.sort_values().unique().tolist()
country_shp = country_shp.set_index('district')

# --- District maize yields --------------------------------------------------
# crop_df_full = pd.read_csv(data_dir+'/crops/cfs_maize_districts_zambia_2009_2018.csv')
crop_df_full = pd.read_csv(f'{data_dir}/crop_yield/cfs_maize_districts_zambia_2009_2022.csv')
# Discard crop records after the last crop year configured above.
crop_df_full = crop_df_full.loc[crop_df_full.year <= year_end_crops]
crop_districts = crop_df_full.district.sort_values().unique().tolist()
crop_df = crop_df_full[['district', 'year', 'yield_mt']]
# Number of crop records for 2016.
ln = len(crop_df.loc[crop_df.year == 2016, 'district'])
crop_df = crop_df.set_index('district')

# Crop records joined to their district geometry (district is the index on both sides).
country_crop = geopandas.GeoDataFrame(crop_df.join(country_shp), crs = country_shp.crs)

# --- Per-point crop weights -------------------------------------------------
weights = pd.read_feather(f"{data_dir}/land_cover/{weight_file_name}.feather")

# Point geometries built from the lon/lat columns of the weights table.
weight_points = geopandas.points_from_xy(x = weights.lon, y = weights.lat)
weights_gdf = geopandas.GeoDataFrame(weights, geometry = weight_points, crs='EPSG:4326')

# Missing crop percentages are treated as zero.
weights.crop_perc = weights.crop_perc.fillna(0)

# Stack the per-year feature tables row-wise, reporting progress after each file.
features = geopandas.GeoDataFrame()

for yr in range(year_start, year_end + 1):
    yearly_file = f"{feature_file_name}_{yr}.feather"
    print(f"Opening: {yearly_file}")
    features_x = pd.read_feather(
        f"{data_dir}/random_features/{satellite}/{yearly_file}"
    )
    # Row-wise append; the original row labels of each file are kept.
    features = pd.concat([features, features_x], axis=0)
    print("feature.shape", features.shape)
    print("Appending:", yr)
    print("")

# The first year starts in April only when 2013 is included, the sensor is
# Landsat 8 and the month range is exactly April-September; otherwise in October.
uses_landsat_apr_sep = (
    satellite == "landsat-8-c2-l2" and month_range == [4, 5, 6, 7, 8, 9]
)
month_start = 4 if (include_2013 and uses_landsat_apr_sep) else 10

month_start  # notebook display

# Keep first-year rows from month_start onwards, plus every later year.
first_year_rows = (features.year == year_start) & (features.month >= month_start)
later_year_rows = features.year > year_start
keep = np.where(first_year_rows | later_year_rows, True, False)
features = features[keep]

# Rows from October-December are relabelled with the following year.
is_late_month = features['month'].isin([10, 11, 12])
features['year'] = np.where(is_late_month, features['year'] + 1, features['year'])

# Drop rows whose (possibly shifted) year exceeds the last configured year.
features = features[features['year'] <= year_end]

features = features.sort_values(['year', 'month'])

features = features[features.month.isin(month_range)]

# Pivot month into the columns: one row per (lon, lat, year), one column per
# (feature, month) pair, named "<feature>_<month>".
features = features.set_index(['lon', 'lat', "year", 'month']).unstack()
features.columns = features.columns.map(lambda pair: '{}_{}'.format(*pair))

# Infinite values are treated as missing.
features = features.replace([np.inf, -np.inf], np.nan)
features = features.reset_index()

# Attach the per-point crop weights by coordinate.
features = features.join(weights.set_index(['lon', 'lat']), on = ['lon', 'lat'])
# `weights` only has a "geometry" column if building `weights_gdf` from it added
# one to `weights` itself (or the feather file ships one). That side effect
# depends on the pandas/geopandas version, so tolerate the column's absence.
features = features.drop(columns = ["geometry"], errors = "ignore")

# Optionally keep only points that contain some cropland.
if crop_mask:
    features = features[features.crop_perc > 0]

# Rebuild point geometries from lon/lat, then drop the raw coordinates.
features = geopandas.GeoDataFrame(
    features,
    geometry = geopandas.points_from_xy(x = features.lon, y = features.lat),
    crs='EPSG:4326'
)

features = features.drop(['lon', 'lat'], axis = 1)

# Assign each point to the district polygon it falls within.
features = features.sjoin(country_shp, how = 'left', predicate = 'within')

# The matched district (the index of `country_shp`) comes back as "index_right"
# on older geopandas releases.
# NOTE(review): geopandas >= 1.0 is documented to keep the right index's own
# name ("district") instead — confirm against the installed version.
district_col = 'index_right' if 'index_right' in features.columns else 'district'

# Points outside every district have no match and are dropped.
features = (
    features
    .dropna(subset=[district_col])
    .rename(columns = {"index_right": "district",})
    .reset_index(drop = True)
)

# Keep geometry and year aside, aligned with `features` by row index.
# NOTE: `points` rebinds the name used earlier for the point-count setting.
points = features[['geometry']]
year = features[['year']]
features = features.drop(['geometry'], axis = 1)

# Denominator for the "NaN cell %" figures: rows x months x features per month.
num_cells = len(features) * len(month_range) * num_features

class bcolors:
    """ANSI escape sequences used to colour the NaN-imputation report."""

    BL = '\x1b[1;34m'    # bold blue
    GR = '\x1b[1;36m'    # bold cyan
    YL = '\x1b[1;33m'    # bold yellow
    RD = '\x1b[1;31m'    # bold red
    RESET = '\x1b[0m'    # reset all attributes

# Staged imputation of missing feature values, with a coloured report after
# each stage. Every feature column is already month-specific, so grouping by
# (year, district) fills within the same month.

# --- Report before imputation ------------------------------------------------
ln_ft = len(features)
ln_na = len(features.dropna())
nan_rows = ln_ft - ln_na
nan_row_pct = (nan_rows / ln_ft) * 100
nan_cell_pct = (features.isna().sum().sum() / num_cells) * 100
print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
      f'\nPre-Impute NaN row count: {bcolors.RD}{nan_rows}{bcolors.RESET}',
      f'\nPre-Impute NaN row %: {bcolors.RD}{nan_row_pct:.02f}{bcolors.RESET}',
      f'\nPre-Impute NaN cell %: {bcolors.RD}{nan_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 1: Filling NaN values by month, year, and district group average')

# --- Step 1: fill with the mean of the same year and district ----------------
year_district_means = (
    features
    .groupby(['year', 'district'], as_index=False)
    .transform('mean')
)
features = features.fillna(year_district_means)

ln_ft = len(features)
ln_na = len(features.dropna())
nan_rows = ln_ft - ln_na
nan_row_pct = (nan_rows / ln_ft) * 100
nan_cell_pct = (features.isna().sum().sum() / num_cells) * 100
print(f'Post step 1 NaN row count: {bcolors.YL}{nan_rows}{bcolors.RESET}',
      f'\nPost step 1 NaN row %: {bcolors.YL}{nan_row_pct:.02f}{bcolors.RESET}',
      f'\nPost step 1 NaN cell %: {bcolors.YL}{nan_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 2: Filling NaN values by month and district across group average')

# --- Step 2: fill what is left with the district mean over all years ---------
district_means = (
    features
    .groupby(['district'], as_index=False)
    .transform('mean')
)
features = features.fillna(district_means)

ln_ft = len(features)
ln_na = len(features.dropna())
nan_rows = ln_ft - ln_na
nan_row_pct = (nan_rows / ln_ft) * 100
nan_cell_pct = (features.isna().sum().sum() / num_cells) * 100
print(f'Post step 2 NaN row count: {bcolors.GR}{nan_rows}{bcolors.RESET}',
      f'\nPost step 2 NaN row %: {bcolors.GR}{nan_row_pct:.02f}{bcolors.RESET}',
      f'\nPost step 2 NaN cell %: {bcolors.GR}{nan_cell_pct:.02f}{bcolors.RESET}',
      f'\n\nStep 3: Drop remaining NaN values\n')

# --- Step 3: drop rows that still contain a missing value --------------------
features = features.dropna(axis=0)
print(f'Ending total row count: {bcolors.BL}{len(features)}{bcolors.RESET}')
    
# Full-period copy of the imputed features with point geometry re-attached
# (aligned on the row index).
features_all_years = features.copy()
features_all_years['geometry'] = points.geometry

# Only the years up to the last crop year are used to fit the model.
features_through_2019 = features[features.year <= year_end_crops]

# Columns averaged in the weighted branch below.
# NOTE(review): the slice skips the first two columns and the last one; whether
# that matches exactly the feature columns depends on the column order produced
# upstream — confirm before enabling weighted_avg.
candidate_cols = features_through_2019.columns[2:-1]
var_cols = candidate_cols.values.tolist()
candidate_cols  # notebook display


def _crop_weighted_means(group):
    """Return the crop_perc-weighted mean of every column in var_cols for one group."""
    weight_total = sum(group.crop_perc)
    return pd.Series(
        [sum(group[v] * group.crop_perc) / weight_total for v in var_cols]
    )


if not weighted_avg:
    # Plain mean of every column per district and year.
    features_summary = features_through_2019.groupby(['district', "year"], as_index = False).mean()
else:
    features_summary = (
        features_through_2019
        .groupby(['year', 'district'], as_index=False)
        .apply(_crop_weighted_means)
    )
    
# Keep crop records from the first year covered by the feature summary onwards,
# and move `district` from the index back into a column.
first_summary_year = min(features_summary.year)
crop_data_filtered = crop_df[crop_df.year >= first_summary_year].reset_index()

# crop_data_filtered = crop_data_filtered[~crop_data_filtered.index.isin(['Mafinga', 'Ikelenge'])]

crop_data_filtered  # notebook display

features_summary.columns  # notebook display

# Attach yield_mt to each (district, year) row of the feature summary.
yield_by_district_year = crop_data_filtered.set_index(["district", "year"])
features_summary = (
    features_summary
    .set_index(["district", "year"])
    .join(other = yield_by_district_year)
    .reset_index()
)

features_summary.columns  # notebook display

# Columns that are not predictors; crop_perc is additionally dropped when the
# plain (unweighted) summary was used.
drop_cols = ['district', 'year', 'yield_mt']
if not weighted_avg:
    drop_cols.append("crop_perc")

# Predictors, and the target log10(yield_mt + 1).
x_all = features_summary.drop(drop_cols, axis = 1)
y_all = np.log10(features_summary.yield_mt.to_numpy() + 1)

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size = 0.2,
    random_state = 0,
)

print(
    f"Number of total points: {len(x_all)}\n"
    f"Number of training points: {len(x_train)}\n"
    f"Number of testing points: {len(x_test)}"
)

# Ridge regression with the penalty chosen by 5-fold cross-validation over
# 17 log-spaced candidates from 1e-8 to 1e8.
ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
ridge_cv_random.fit(x_train, y_train)

print(f"Estimated regularization parameter: {ridge_cv_random.alpha_}")

print(f"Validation R2 performance: {ridge_cv_random.best_score_:0.2f}")

# --- Training-set fit ---------------------------------------------------------
# Predictions are clipped at zero (the target log10(yield_mt + 1) is
# non-negative for non-negative yields).
y_pred = np.maximum(ridge_cv_random.predict(x_train), 0)
r2_train = r2_score(y_train, y_pred)

print(f"Training R^2 = {r2_train:0.2f}\nPearsons r = {pearsonr(y_pred, y_train)[0]:0.2f}")

pearsonr(y_pred, y_train)[0] ** 2  # notebook display

ridge_cv_random.score(x_train, y_train)  # notebook display

# --- Test-set fit -------------------------------------------------------------
# Predict on the held-out rows before scoring them: the metrics below need
# predictions of the same length as y_test, and r2_test must be defined.
y_pred = np.maximum(ridge_cv_random.predict(x_test), 0)
r2_test = r2_score(y_test, y_pred)

print(f"Testing set R^2 = {r2_test:0.2f}")
print(f"Testing set pearsons R = {pearsonr(y_pred, y_test)[0]:0.2f}")

# Predictors for every point and year: everything except the identifier,
# geometry and crop-weight columns.
x_all = features_all_years.drop([
    'year',
    'geometry',
    'district',
    'crop_perc'
], axis = 1)

# Point-level predictions, clipped at zero.
features_all_years['yield_prediction'] = np.maximum(ridge_cv_random.predict(x_all), 0)

# Blank out predictions for points with no cropland. The result is assigned
# back explicitly: an in-place mask on a selected column is a chained operation
# that does not update the parent frame under pandas Copy-on-Write.
features_all_years['yield_prediction'] = features_all_years['yield_prediction'].mask(
    features_all_years['crop_perc'] == 0, np.nan
)

# Expose the point coordinates as plain columns.
features_all_years['lon'], features_all_years['lat'] = features_all_years.geometry.x, features_all_years.geometry.y

# Mean prediction per district and year (NaN predictions are skipped by mean()).
features_all_years_summary = (
    features_all_years
    .groupby(['district',"year"], as_index = False)['yield_prediction']
    .mean()
    .set_index('district')
)

# Attach the district polygons and move `district` back into a column.
features_all_years_summary = features_all_years_summary.join(country_shp).reset_index()

# Predictors of the district-year summary (same columns the model was fit on).
x_all = features_summary.drop(drop_cols, axis = 1)

# Observed vs predicted log-yield for every district-year.
observed_yield = features_summary.yield_mt.to_numpy()
residual_df = pd.DataFrame({
    "yield_mt": observed_yield,
    "log_yield": np.log10(observed_yield + 1),
    "prediction": np.maximum(ridge_cv_random.predict(x_all), 0),
})
residual_df["residual"] = residual_df["log_yield"] - residual_df["prediction"]
residual_df["year"] = features_summary.year
residual_df["district"] = features_summary.district
# Attach the district polygons.
residual_df = residual_df.join(country_shp, how = "left", on = "district")

# Per-district means over years, then each row's deviation from those means.
by_district = residual_df.groupby('district')
residual_df["district_yield_mean"] = by_district['log_yield'].transform('mean')
residual_df["district_prediction_mean"] = by_district['prediction'].transform('mean')
residual_df["demean_yield"] = residual_df["log_yield"] - residual_df["district_yield_mean"]
residual_df["demean_prediction"] = residual_df["prediction"] - residual_df["district_prediction_mean"]
residual_gdf = geopandas.GeoDataFrame(residual_df)

# Agreement between the demeaned observations and demeaned predictions.
demeaned_obs = residual_gdf["demean_yield"]
demeaned_pred = residual_gdf["demean_prediction"]
r_squared = r2_score(demeaned_obs, demeaned_pred)
pearson_r = pearsonr(demeaned_obs, demeaned_pred)
print(f"All     R^2: {r_squared:.2f}\nPearson's r: {pearson_r[0]:.2f}")

# Squared Pearson correlation, rounded to two decimals.
r2 = round(pearson_r[0] ** 2, 2)


