In [1]:
## import warnings
import glob
import itertools
import math
import multiprocessing
import os
import pickle
import re
import time
from collections import Counter
from datetime import date

import geopandas
import numpy as np
import p_tqdm
import pandas as pd
import pyarrow
from pyhere import here
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# NDVI model

In [2]:
# NDVI ridge model: predict log10(yield_mt + 1) from the NDVI columns (plus optional
# district indicators), tune the ridge penalty by k-fold CV on the training split, and
# report plain and district-demeaned R2 for the validation folds and the held-out test split.

#########################################    LOAD DATA    #########################################
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))
climate_df = climate_df.dropna()
drop_cols = ['year', 'district', 'yield_mt']
ndvi_cols = climate_df.columns[climate_df.columns.str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
# Fresh 0..n-1 index so every frame derived below is row-aligned by label.
climate_df = climate_df.loc[:, keep_cols].reset_index(drop = True)

# True adds one indicator column per district to the feature matrix.
hot_encode = True

# Identifier/target table; must be built before 'district' is removed from drop_cols.
crop_yield = climate_df.loc[:, drop_cols].copy()
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # 'district' turns into feature columns, so it is no longer a column to drop.
    drop_cols.remove('district')
    climate_df = pd.get_dummies(climate_df, columns = ["district"], drop_first = False)

climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN/TEST SPLIT    #########################################
# Features keep their real column names and stay UNSCALED here: scaling is done inside
# the model pipeline below so that it is only ever fit on training rows.
x_all = climate_df.drop(drop_cols, axis = 1).astype(float)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold = KFold()
ridge = Ridge()
# StandardScaler is a pipeline step, so GridSearchCV / cross_val_predict re-fit it on the
# training portion of each fold and the test split never influences the scaling.
ridge_pipeline = make_pipeline(StandardScaler(), ridge)
# make_pipeline names the Ridge step 'ridge'; prefix the grid keys accordingly.
param_grid = {f'ridge__{name}': grid for name, grid in alphas.items()}
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge_pipeline, param_grid, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# Out-of-fold predictions for the training rows (each row predicted by a model that did not see it).
val_predictions = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Negative predictions are clipped to 0 in the stored tables (the headline R2 below uses the raw ones).
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Training rows: attach identifiers/targets, then subtract each district's mean (computed
# within the training split) from both the observed and the out-of-fold predicted values.
train_split = pd.DataFrame({'split': 'train'}, index = x_train.index).join(crop_yield)
train_split['cv_prediction'] = np.maximum(val_predictions, 0)
train_by_district = train_split.groupby('district')
train_split["demean_cv_yield"] = train_split["log_yield"] - train_by_district['log_yield'].transform('mean')
train_split["demean_cv_prediction"] = train_split["cv_prediction"] - train_by_district['cv_prediction'].transform('mean')

# Test rows have no out-of-fold predictions, so the CV columns are NaN placeholders.
test_split = pd.DataFrame({'split': 'test'}, index = x_test.index).join(crop_yield)
test_split['cv_prediction'] = np.nan
test_split["demean_cv_yield"] = np.nan
test_split["demean_cv_prediction"] = np.nan

# Combined table is assembled before the test-only demeaned columns are added.
predictions = pd.concat([train_split, test_split])

# District means here are computed within the test split only.
test_by_district = test_split.groupby('district')
test_split["demean_test_yield"] = test_split["log_yield"] - test_by_district['log_yield'].transform('mean')
test_split["demean_test_prediction"] = test_split["prediction"] - test_by_district['prediction'].transform('mean')

print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}',
      f'\nTest R2: {r2_score(y_test, test_predictions):0.2f}',
     f'\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.2f}',
     f'\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.2f}')

Val  R2: 0.64 
Test R2: 0.60 

Demean Val  R2: 0.06 
Demean Test R2: 0.15


# Precipitation, Temperature, and NDVI model

In [3]:
# Precipitation/temperature/NDVI ridge model: predict log10(yield_mt + 1) from every
# non-identifier column of the climate summary (plus optional district indicators), tune
# the ridge penalty by k-fold CV on the training split, and report plain and
# district-demeaned R2 for the validation folds and the held-out test split.

#########################################    LOAD DATA    #########################################
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))
# Fresh 0..n-1 index so every frame derived below is row-aligned by label.
climate_df = climate_df.dropna().reset_index(drop = True)
drop_cols = ['year', 'district', 'yield_mt']

# True adds one indicator column per district to the feature matrix.
hot_encode = True

# Identifier/target table; must be built before 'district' is removed from drop_cols.
crop_yield = climate_df.loc[:, drop_cols].copy()
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # 'district' turns into feature columns, so it is no longer a column to drop.
    drop_cols.remove('district')
    climate_df = pd.get_dummies(climate_df, columns = ["district"], drop_first = False)

climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN/TEST SPLIT    #########################################
# Features keep their real column names and stay UNSCALED here: scaling is done inside
# the model pipeline below so that it is only ever fit on training rows.
x_all = climate_df.drop(drop_cols, axis = 1).astype(float)
y_all = np.log10(climate_df.yield_mt.to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold = KFold()
ridge = Ridge()
# StandardScaler is a pipeline step, so GridSearchCV / cross_val_predict re-fit it on the
# training portion of each fold and the test split never influences the scaling.
ridge_pipeline = make_pipeline(StandardScaler(), ridge)
# make_pipeline names the Ridge step 'ridge'; prefix the grid keys accordingly.
param_grid = {f'ridge__{name}': grid for name, grid in alphas.items()}
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge_pipeline, param_grid, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# Out-of-fold predictions for the training rows (each row predicted by a model that did not see it).
val_predictions = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Negative predictions are clipped to 0 in the stored tables (the headline R2 below uses the raw ones).
crop_yield["prediction"] = np.maximum(best_model.predict(x_all), 0)

# Training rows: attach identifiers/targets, then subtract each district's mean (computed
# within the training split) from both the observed and the out-of-fold predicted values.
train_split = pd.DataFrame({'split': 'train'}, index = x_train.index).join(crop_yield)
train_split['cv_prediction'] = np.maximum(val_predictions, 0)
train_by_district = train_split.groupby('district')
train_split["demean_cv_yield"] = train_split["log_yield"] - train_by_district['log_yield'].transform('mean')
train_split["demean_cv_prediction"] = train_split["cv_prediction"] - train_by_district['cv_prediction'].transform('mean')

# Test rows have no out-of-fold predictions, so the CV columns are NaN placeholders.
test_split = pd.DataFrame({'split': 'test'}, index = x_test.index).join(crop_yield)
test_split['cv_prediction'] = np.nan
test_split["demean_cv_yield"] = np.nan
test_split["demean_cv_prediction"] = np.nan

# Combined table is assembled before the test-only demeaned columns are added.
predictions = pd.concat([train_split, test_split])

# District means here are computed within the test split only.
test_by_district = test_split.groupby('district')
test_split["demean_test_yield"] = test_split["log_yield"] - test_by_district['log_yield'].transform('mean')
test_split["demean_test_prediction"] = test_split["prediction"] - test_by_district['prediction'].transform('mean')

print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}',
      f'\nTest R2: {r2_score(y_test, test_predictions):0.2f}',
     f'\n\nDemean Val  R2: {r2_score(train_split.demean_cv_yield, train_split.demean_cv_prediction):0.2f}',
     f'\nDemean Test R2: {r2_score(test_split.demean_test_yield, test_split.demean_test_prediction):0.2f}')

Val  R2: 0.75 
Test R2: 0.75 

Demean Val  R2: 0.35 
Demean Test R2: 0.49


# NDVI Anomaly model

In [4]:
# NDVI anomaly ridge model: standardize the NDVI features, express features and
# log10(yield_mt + 1) as deviations from each district's mean, then tune and evaluate a
# ridge regression of the yield anomaly on the feature anomalies.

#########################################    LOAD DATA    #########################################
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))
climate_df = climate_df.dropna()
drop_cols = ['year', 'district', 'yield_mt']
ndvi_cols = climate_df.columns[climate_df.columns.str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]

# Identifier/target table on a fresh 0..n-1 index (climate_df gets the same index below).
crop_yield = climate_df.loc[:, drop_cols].reset_index(drop = True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# NOTE(review): the scaler (and the district means below) are computed from ALL rows,
# including the rows later held out as the test split — confirm this is acceptable.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Pass columns= so features keep their real names instead of becoming '0', '1', ...
climate_df = pd.DataFrame(
    climate_df_scaled, index = climate_df.index, columns = climate_df.columns
).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# From here on 'yield_mt' holds log10(yield_mt + 1), and after the subtraction its
# deviation from the district mean.
climate_df['yield_mt'] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df = climate_df.set_index(['year', 'district'])
var_cols = climate_df.columns
district_means = climate_df.groupby(level = 'district')[var_cols].transform('mean')
climate_df = (climate_df[var_cols] - district_means).reset_index(drop = False)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis = 1)
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold = KFold()
ridge = Ridge()
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# Out-of-fold predictions for the training rows (each row predicted by a model that did not see it).
val_predictions   = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

#########################################     PREDICTIONS TABLE    #########################################
# Predictions are on the anomaly scale, so store the anomaly target next to them;
# 'log_yield' alone (the raw log yield) is not comparable with 'prediction'.
crop_yield["log_yield_anomaly"] = y_all.to_numpy()
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame({'split': 'train'}, index = x_train.index).join(crop_yield)
train_split['cv_prediction'] = val_predictions

# Test rows have no out-of-fold predictions, so the CV column is a NaN placeholder.
test_split = pd.DataFrame({'split': 'test'}, index = x_test.index).join(crop_yield)
test_split['cv_prediction'] = np.nan

predictions = pd.concat([train_split, test_split])

print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}\nTest R2: {r2_score(y_test, test_predictions):0.2f}')

Val  R2: 0.23
Test R2: 0.13


# Precipitation, Temperature, and NDVI Anomaly model

In [5]:
# Precipitation/temperature/NDVI anomaly ridge model: standardize every non-identifier
# column of the climate summary, express features and log10(yield_mt + 1) as deviations
# from each district's mean, then tune and evaluate a ridge regression of the yield
# anomaly on the feature anomalies.

#########################################    LOAD DATA    #########################################
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv'))
climate_df = climate_df.dropna()
drop_cols = ['year', 'district', 'yield_mt']

# Identifier/target table on a fresh 0..n-1 index (climate_df gets the same index below).
crop_yield = climate_df.loc[:, drop_cols].reset_index(drop = True)
crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# NOTE(review): the scaler (and the district means below) are computed from ALL rows,
# including the rows later held out as the test split — confirm this is acceptable.
climate_df = climate_df.set_index(drop_cols)
climate_df_scaled = StandardScaler().fit_transform(climate_df.values)
# Pass columns= so features keep their real names instead of becoming '0', '1', ...
climate_df = pd.DataFrame(
    climate_df_scaled, index = climate_df.index, columns = climate_df.columns
).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# From here on 'yield_mt' holds log10(yield_mt + 1), and after the subtraction its
# deviation from the district mean.
climate_df['yield_mt'] = np.log10(climate_df.yield_mt.to_numpy() + 1)
climate_df = climate_df.set_index(['year', 'district'])
var_cols = climate_df.columns
district_means = climate_df.groupby(level = 'district')[var_cols].transform('mean')
climate_df = (climate_df[var_cols] - district_means).reset_index(drop = False)

#########################################     TRAIN/TEST SPLIT    #########################################
x_all = climate_df.drop(drop_cols, axis = 1)
y_all = climate_df.yield_mt
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold = KFold()
ridge = Ridge()
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# Out-of-fold predictions for the training rows (each row predicted by a model that did not see it).
val_predictions   = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)

#########################################     PREDICTIONS TABLE    #########################################
# Predictions are on the anomaly scale, so store the anomaly target next to them;
# 'log_yield' alone (the raw log yield) is not comparable with 'prediction'.
crop_yield["log_yield_anomaly"] = y_all.to_numpy()
crop_yield["prediction"] = best_model.predict(x_all)

train_split = pd.DataFrame({'split': 'train'}, index = x_train.index).join(crop_yield)
train_split['cv_prediction'] = val_predictions

# Test rows have no out-of-fold predictions, so the CV column is a NaN placeholder.
test_split = pd.DataFrame({'split': 'test'}, index = x_test.index).join(crop_yield)
test_split['cv_prediction'] = np.nan

predictions = pd.concat([train_split, test_split])

print(f'Val  R2: {r2_score(y_train, val_predictions):0.2f}\nTest R2: {r2_score(y_test, test_predictions):0.2f}')

Val  R2: 0.47
Test R2: 0.47
