In [1]:
!pip install -q pyhere p_tqdm glum

In [1]:
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re
from collections import Counter

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

In [2]:
#########################################    LOAD DATA    #########################################
# Read the climate summary table and discard every row that has a missing value.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv')).dropna()

# Identifier / target columns that must stay out of the feature matrix.
drop_cols = ['year', 'district', 'yield_mt']

# NDVI feature names; only consumed by the (disabled) NDVI-only subset just below.
ndvi_cols = climate_df.columns[climate_df.columns.str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
# climate_df = climate_df.loc[:, keep_cols]

# Keep observations from 2016 onward.
climate_df = climate_df.loc[climate_df['year'] >= 2016]
# climate_df = climate_df[climate_df.year != 2011]

# Toggle for turning `district` into indicator features.
hot_encode = True
# hot_encode = False

# Side table with identifiers and the log10(yield + 1) target on a fresh 0..n-1 index.
crop_yield = climate_df.loc[:, list(drop_cols)].copy().reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# Move identifiers/target into the index so that only feature columns are scaled.
climate_df = climate_df.set_index(drop_cols)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.
climate_df_scaled = StandardScaler().fit_transform(climate_df.to_numpy())
# Rebuilding from the scaled array yields positional integer feature labels; the index
# columns are restored by reset_index and all labels are then cast to strings.
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################    HOT ENCODE    #########################################
# Encoding runs after scaling in this cell, so the indicator columns are left unscaled.
if hot_encode:
    # The raw `district` column is replaced by indicator columns, so take it off the drop list.
    drop_cols.remove('district')
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)

#########################################     TRAIN / TEST SPLIT    #########################################
x_all = climate_df.drop(columns=drop_cols)
y_all = np.log10(climate_df["yield_mt"].to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

In [4]:
# x_train#.iloc[:,0:37]

In [5]:
# from tqdm.contrib.itertools import product
# from glum import GeneralizedLinearRegressor as glm
# import warnings
# from scipy.linalg import LinAlgWarning


In [6]:
# X=x_train
# y=y_train
# grid=np.logspace(-8, 8, base = 10, num = 17) 
# n_splits=5
# start=[0, 12, 24, 36]
# end=[12, 24, 36, 108]
# static_lam=1e-16
# verbose=True
# fit_model_after_tuning=True

# kfold = KFold(n_splits=n_splits)
# alpha = {'alpha': [1]}

# if hasattr(start, '__iter__'):
#     pass
# else:
#     start = [start]; end = [end]
    
# penalties = [static_lam for j in range(X.shape[1])] 
# lambdas = []
# for i in range(len(start)):
#     scores = []
#     for pen in grid:
#         if verbose:
#             print(pen, end=" ")
#         penalties[start[i]:end[i]] = [pen for j in range(end[i]-start[i])]
#         ridge = glm(family="normal", P2=penalties, l1_ratio=0, random_state=42)   
#         search = GridSearchCV(ridge, alpha, scoring = 'r2', cv = kfold).fit(X, y)
#         scores.append(search.best_score_)
#     best_lambda = grid[np.argmax(scores)]
#     penalties[start[i]:end[i]] = [best_lambda for j in range(end[i]-start[i])]
#     if verbose:
#         print(f'''\n\tBest \u03BB {i+1}: {best_lambda}\n\tVal R2 {i+1}: {scores[np.argmax(scores)]:0.4f}''') 
#     lambdas.append(best_lambda)
# if fit_model_after_tuning:
#     for k in range(len(start)):
#         penalties[start[k]:end[k]] = [lambdas[k] for j in range(end[k]-start[k])]
#     ridge = glm(family="normal", P2=penalties, l1_ratio=0, random_state=42) 
#     model = GridSearchCV(ridge, alpha, scoring = 'r2', cv = kfold).fit(X, y).best_estimator_
# else:
#     model = np.nan

In [10]:
# Plain module import kept so module-qualified access keeps working.
import task_modeling_utils
# The tuning cells below call `kfold_rr_multi_lambda_tuning` unqualified; bind the name here
# explicitly so those cells do not depend on leftover kernel state after a restart.
# NOTE(review): assumes the function is defined in task_modeling_utils — confirm.
from task_modeling_utils import kfold_rr_multi_lambda_tuning

# 5-fold splitter reused by the cross_val_predict calls in the following cells.
kfold = KFold(n_splits=5)

In [6]:
# Candidate ridge penalties: 17 log-spaced values from 1e-8 to 1e8 (one per decade).
alphas = dict(alpha=np.logspace(-8, 8, num=17, base=10))
alphas['alpha']

array([1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01,
       1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07,
       1.e+08])

In [11]:
# Multi-penalty ridge tuning over two column ranges (0-36 and 36-108) on a log-spaced grid.
# NOTE(review): presumably each start/end range receives its own penalty and the model is
# refit afterwards (fit_model_after_tuning=True) — confirm against the helper's definition.
penalty_grid = np.logspace(-8, 8, num=17, base=10)
best_lam, res, model = kfold_rr_multi_lambda_tuning(
    x_train,
    y_train,
    grid=penalty_grid,
    start=[0, 36],
    end=[36, 108],
    static_lam=1e-16,
    verbose=True,
    show_linalg_warning=False,
    fit_model_after_tuning=True,
)

# Out-of-fold predictions on the training rows, then R2 on validation and held-out test data.
val_predictions = cross_val_predict(model, X=x_train, y=y_train, cv=kfold)
val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, model.predict(x_test))
print(f"""Final Val R2: {val_r2:0.4f}
Final Test R2: {test_r2:0.4f}""")

1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 1: 0.01
	Val R2 1: 0.7937
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 2: 0.0001
	Val R2 2: 0.7944
Total time: 2.33 minutes
Final Val R2: 0.8058
Final Test R2: 0.8304


In [33]:
# Same tuning helper as the two-range run, but with four column ranges:
# 0-12, 12-24, 24-36 and 36-108. All other options are left at the helper's defaults.
penalty_grid = np.logspace(-8, 8, num=17, base=10)
best_lam, res, model = kfold_rr_multi_lambda_tuning(
    x_train,
    y_train,
    grid=penalty_grid,
    start=[0, 12, 24, 36],
    end=[12, 24, 36, 108],
    static_lam=1e-16,
)

# Out-of-fold predictions on the training rows, then R2 on validation and held-out test data.
val_predictions = cross_val_predict(model, X=x_train, y=y_train, cv=kfold)
val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, model.predict(x_test))
print(f"""Final Val R2: {val_r2:0.4f}
Final Test R2: {test_r2:0.4f}""")

1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 1: 0.1
	Val R2 1: 0.7899
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 2: 0.01
	Val R2 2: 0.7987
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 3: 0.01
	Val R2 3: 0.7994
1e-08 1e-07 1e-06 1e-05 0.0001 0.001 0.01 0.1 1.0 10.0 100.0 1000.0 10000.0 100000.0 1000000.0 10000000.0 100000000.0 
	Best λ 4: 0.0001
	Val R2 4: 0.8016
Final Val R2: 0.8127
Final Test R2: 0.8381


In [12]:
%%time
#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
kfold = KFold()
ridge = Ridge()    
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
ridge_reg.fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X = x_train, y = y_train, cv = kfold)   
train_predictions = best_model.predict(x_train)
test_predictions  = best_model.predict(x_test)
print(
f"""Final Val R2: {r2_score(y_train, val_predictions):0.4f}
Final Test R2: {r2_score(y_test, test_predictions):0.4f}"""
)

Final Val R2: 0.7971
Final Test R2: 0.8190
CPU times: user 2.76 s, sys: 10.6 s, total: 13.4 s
Wall time: 3.32 s


In [3]:
from prediction_utils import *

In [5]:
%%time

lambdas=np.logspace(-8, 8, base = 10, num = 17)

kfold_results = kfold_solve_custom_split_col(
    X=x_train,
    y=y_train,
    locations=x_train.index,
    split_col=x_train.reset_index().index,
    lambdas=lambdas,
    static_lam_val=1e-20,
    static_lam_idxs=list(range(0,x_train.shape[1]-72)),
    intercept=True,
    num_folds=5,
    random_state=0,
    return_preds=True,
    return_model=False,
    svd_solve=False,
    allow_linalg_warning_instances=True,
    fit_model_after_tuning=False,
)
best_alpha_1_idx = interpret_kfold_results(kfold_results, "r2_score")[0][0][0]
best_alpha_1 = lambdas[best_alpha_1_idx]
preds = np.maximum(get_pred_truth_locs(kfold_results)[0].flatten(), 0)
truth = get_pred_truth_locs(kfold_results)[1].flatten()
# print(
# f"""Best alpha 1: {best_alpha_1}
# Val R2: {r2_score(truth, preds):0.4f}\n"""
# )

kfold_results = kfold_solve_custom_split_col(
    X=x_train,
    y=y_train,
    locations=x_train.index,
    split_col=x_train.reset_index().index,
    lambdas=lambdas,
    static_lam_val=best_alpha_1,
    static_lam_idxs=list(range(x_train.shape[1]-72, x_train.shape[1])),
    intercept=True,
    num_folds=5,
    random_state=0,
    return_preds=True,
    return_model=False,
    svd_solve=False,
    allow_linalg_warning_instances=True,
    fit_model_after_tuning=False,
)
best_alpha_2_idx = interpret_kfold_results(kfold_results, "r2_score")[0][0][0]
best_alpha_2 = lambdas[best_alpha_2_idx]
preds = np.maximum(get_pred_truth_locs(kfold_results)[0].flatten(), 0)
truth = get_pred_truth_locs(kfold_results)[1].flatten()


model, intercept_term = custom_ridge(
    X=x_train,
    y=y_train,
    lam=best_alpha_1, 
    intercept=True,
    static_lam_val=best_alpha_2,
    static_lam_idxs=list(range(x_train.shape[1]-72, x_train.shape[1])))
pred_test = np.asarray(x_test).dot(model) + intercept_term 
pred_test = np.maximum(pred_test, 0)

# print(
# f"""Best alpha 2: {best_alpha_2}
# Fianl Val R2: {r2_score(truth, preds):0.4f}
# Final test R2: {r2_score(y_test, pred_test):0.4f}\n"""
# )

CPU times: user 594 ms, sys: 2.6 s, total: 3.2 s
Wall time: 794 ms


# NDVI model

In [126]:
#########################################    LOAD DATA    #########################################
# Read the climate summary table and discard every row that has a missing value.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv')).dropna()
drop_cols = ['year', 'district', 'yield_mt']
# Keep only the NDVI features plus the identifier/target columns, for 2016 onward.
ndvi_cols = climate_df.columns[climate_df.columns.str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df.loc[climate_df['year'] >= 2016]

# Toggle for turning `district` into indicator features.
hot_encode = True
# hot_encode = False

# Side table with identifiers and the log10(yield + 1) target on a fresh 0..n-1 index.
crop_yield = climate_df.loc[:, list(drop_cols)].copy().reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # The raw `district` column is replaced by indicator columns, so take it off the drop list.
    drop_cols.remove('district')
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifiers/target go into the index so everything left in the columns gets scaled
# (with hot_encode on, that includes the district indicator columns).
climate_df = climate_df.set_index(drop_cols)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.
climate_df_scaled = StandardScaler().fit_transform(climate_df.to_numpy())
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN / TEST SPLIT    #########################################
x_all = climate_df.drop(columns=drop_cols)
y_all = np.log10(climate_df["yield_mt"].to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = dict(alpha=np.logspace(-8, 8, num=17, base=10))
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# fit() returns the fitted search object, so construction and fitting can be chained.
ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=kfold).fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions for every row, floored at zero.
crop_yield["prediction"] = np.clip(best_model.predict(x_all), 0, None)

# Training rows: attach identifiers/targets by index, add the floored out-of-fold predictions,
# then subtract each district's mean (computed within the training rows only).
train_split = pd.DataFrame({'split': np.full(len(x_train), 'train')}, index=x_train.index)
train_split = train_split.join(crop_yield.loc[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = np.clip(val_predictions, 0, None)
train_split["demean_cv_yield"] = train_split["log_yield"].sub(
    train_split.groupby('district')['log_yield'].transform('mean')
)
train_split["demean_cv_prediction"] = train_split["cv_prediction"].sub(
    train_split.groupby('district')['cv_prediction'].transform('mean')
)

# Test rows: same layout, with the cross-validation columns filled with NaN.
test_split = pd.DataFrame({'split': np.full(len(x_test), 'test')}, index=x_test.index)
test_split = test_split.join(crop_yield.loc[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.full(len(x_test), np.nan)
test_split["demean_cv_yield"] = np.full(len(x_test), np.nan)
test_split["demean_cv_prediction"] = np.full(len(x_test), np.nan)

# Combined table is built before the test-only de-meaned columns are added below.
predictions = pd.concat([train_split, test_split])

test_split["demean_test_yield"] = test_split["log_yield"].sub(
    test_split.groupby('district')['log_yield'].transform('mean')
)
test_split["demean_test_prediction"] = test_split["prediction"].sub(
    test_split.groupby('district')['prediction'].transform('mean')
)

# Overall scores use the raw (unfloored) predictions; de-meaned scores use the floored ones.
val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, test_predictions)
demean_val_r2 = r2_score(train_split["demean_cv_yield"], train_split["demean_cv_prediction"])
demean_test_r2 = r2_score(test_split["demean_test_yield"], test_split["demean_test_prediction"])
print(
    f'Val  R2: {val_r2:0.4f}',
    f'\nTest R2: {test_r2:0.4f}',
    f'\n\nDemean Val  R2: {demean_val_r2:0.4f}',
    f'\nDemean Test R2: {demean_test_r2:0.4f}',
)

Val  R2: 0.7802 
Test R2: 0.8085 

Demean Val  R2: 0.0921 
Demean Test R2: 0.4394


# Precipitation, Temperature, and NDVI model

In [130]:
#########################################    LOAD DATA    #########################################
# Read the climate summary table, discard rows with missing values, keep 2016 onward.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv')).dropna()
drop_cols = ['year', 'district', 'yield_mt']
climate_df = climate_df.loc[climate_df['year'] >= 2016]

# Toggle for turning `district` into indicator features.
hot_encode = True
# hot_encode = False

# Side table with identifiers and the log10(yield + 1) target on a fresh 0..n-1 index.
crop_yield = climate_df.loc[:, list(drop_cols)].copy().reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

#########################################    HOT ENCODE    #########################################
if hot_encode:
    # The raw `district` column is replaced by indicator columns, so take it off the drop list.
    drop_cols.remove('district')
    climate_df = pd.get_dummies(climate_df, columns=["district"], drop_first=False)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifiers/target go into the index so everything left in the columns gets scaled
# (with hot_encode on, that includes the district indicator columns).
climate_df = climate_df.set_index(drop_cols)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.
climate_df_scaled = StandardScaler().fit_transform(climate_df.to_numpy())
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     TRAIN / TEST SPLIT    #########################################
x_all = climate_df.drop(columns=drop_cols)
y_all = np.log10(climate_df["yield_mt"].to_numpy() + 1)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = dict(alpha=np.logspace(-8, 8, num=17, base=10))
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# fit() returns the fitted search object, so construction and fitting can be chained.
ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=kfold).fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     DE-MEAN R2    #########################################
# Predictions for every row, floored at zero.
crop_yield["prediction"] = np.clip(best_model.predict(x_all), 0, None)

# Training rows: attach identifiers/targets by index, add the floored out-of-fold predictions,
# then subtract each district's mean (computed within the training rows only).
train_split = pd.DataFrame({'split': np.full(len(x_train), 'train')}, index=x_train.index)
train_split = train_split.join(crop_yield.loc[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = np.clip(val_predictions, 0, None)
train_split["demean_cv_yield"] = train_split["log_yield"].sub(
    train_split.groupby('district')['log_yield'].transform('mean')
)
train_split["demean_cv_prediction"] = train_split["cv_prediction"].sub(
    train_split.groupby('district')['cv_prediction'].transform('mean')
)

# Test rows: same layout, with the cross-validation columns filled with NaN.
test_split = pd.DataFrame({'split': np.full(len(x_test), 'test')}, index=x_test.index)
test_split = test_split.join(crop_yield.loc[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.full(len(x_test), np.nan)
test_split["demean_cv_yield"] = np.full(len(x_test), np.nan)
test_split["demean_cv_prediction"] = np.full(len(x_test), np.nan)

# Combined table is built before the test-only de-meaned columns are added below.
predictions = pd.concat([train_split, test_split])

test_split["demean_test_yield"] = test_split["log_yield"].sub(
    test_split.groupby('district')['log_yield'].transform('mean')
)
test_split["demean_test_prediction"] = test_split["prediction"].sub(
    test_split.groupby('district')['prediction'].transform('mean')
)

# Overall scores use the raw (unfloored) predictions; de-meaned scores use the floored ones.
val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, test_predictions)
demean_val_r2 = r2_score(train_split["demean_cv_yield"], train_split["demean_cv_prediction"])
demean_test_r2 = r2_score(test_split["demean_test_yield"], test_split["demean_test_prediction"])
print(
    f'Val  R2: {val_r2:0.4f}',
    f'\nTest R2: {test_r2:0.4f}',
    f'\n\nDemean Val  R2: {demean_val_r2:0.4f}',
    f'\nDemean Test R2: {demean_test_r2:0.4f}',
)

Val  R2: 0.8044 
Test R2: 0.8297 

Demean Val  R2: 0.1723 
Demean Test R2: 0.5297


In [134]:
# Display the best mean cross-validated score (scoring='r2') recorded by the most recently
# fitted `ridge_reg` grid search.
ridge_reg.best_score_

0.792567922097444

# NDVI Anomaly Model

In [9]:
#########################################    LOAD DATA    #########################################
# Read the climate summary table and discard every row that has a missing value.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv')).dropna()
drop_cols = ['year', 'district', 'yield_mt']
# Keep only the NDVI features plus the identifier/target columns, for 2016 onward.
ndvi_cols = climate_df.columns[climate_df.columns.str.contains('ndvi')]
keep_cols = [*ndvi_cols, *drop_cols]
climate_df = climate_df.loc[:, keep_cols]
climate_df = climate_df.loc[climate_df['year'] >= 2016]

# Side table with identifiers and the log10(yield + 1) target on a fresh 0..n-1 index.
crop_yield = climate_df.loc[:, list(drop_cols)].copy().reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifiers/target go into the index so that only feature columns are scaled.
climate_df = climate_df.set_index(drop_cols)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.
climate_df_scaled = StandardScaler().fit_transform(climate_df.to_numpy())
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace yield by log10(yield + 1), then express the target and every feature as a
# deviation from its district mean (means are taken over all rows, before the split).
climate_df['yield_mt'] = np.log10(climate_df["yield_mt"].to_numpy() + 1)
climate_df = climate_df.set_index(['year', 'district'])
var_cols = climate_df.columns
district_means = climate_df.groupby(['district'], as_index=True)[var_cols].transform('mean')
climate_df = (climate_df[var_cols] - district_means).reset_index(drop=False)

#########################################     TRAIN / TEST SPLIT    #########################################
x_all = climate_df.drop(columns=drop_cols)
y_all = climate_df["yield_mt"]
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = dict(alpha=np.logspace(-8, 8, num=17, base=10))
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# fit() returns the fitted search object, so construction and fitting can be chained.
ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=kfold).fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     ASSEMBLE PREDICTIONS    #########################################
# Anomaly predictions for every row (no flooring here: anomalies may be negative).
crop_yield["prediction"] = best_model.predict(x_all)

# Training rows carry their out-of-fold prediction; test rows get NaN in that column.
train_split = pd.DataFrame({'split': np.full(len(x_train), 'train')}, index=x_train.index)
train_split = train_split.join(crop_yield.loc[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = val_predictions

test_split = pd.DataFrame({'split': np.full(len(x_test), 'test')}, index=x_test.index)
test_split = test_split.join(crop_yield.loc[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.full(len(x_test), np.nan)

predictions = pd.concat([train_split, test_split])

val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, test_predictions)
print(f'Val  R2: {val_r2:0.4f}\nTest R2: {test_r2:0.4f}')

Val  R2: 0.4449
Test R2: 0.4293


# Precipitation, Temperature, and NDVI Anomaly model

In [10]:
#########################################    LOAD DATA    #########################################
# Read the climate summary table, discard rows with missing values, keep 2016 onward.
climate_df = pd.read_csv(here('data', 'climate', 'climate_summary.csv')).dropna()
drop_cols = ['year', 'district', 'yield_mt']
climate_df = climate_df.loc[climate_df['year'] >= 2016]

# Side table with identifiers and the log10(yield + 1) target on a fresh 0..n-1 index.
crop_yield = climate_df.loc[:, list(drop_cols)].copy().reset_index(drop=True)
crop_yield["log_yield"] = np.log10(crop_yield["yield_mt"].to_numpy() + 1)

#########################################    STANDARDIZE FEATURES    #########################################
# Identifiers/target go into the index so that only feature columns are scaled.
climate_df = climate_df.set_index(drop_cols)
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below.
climate_df_scaled = StandardScaler().fit_transform(climate_df.to_numpy())
climate_df = pd.DataFrame(climate_df_scaled, index=climate_df.index).reset_index()
climate_df.columns = climate_df.columns.astype(str)

#########################################     CALCULATE ANOMALY   #########################################
# Replace yield by log10(yield + 1), then express the target and every feature as a
# deviation from its district mean (means are taken over all rows, before the split).
climate_df['yield_mt'] = np.log10(climate_df["yield_mt"].to_numpy() + 1)
climate_df = climate_df.set_index(['year', 'district'])
var_cols = climate_df.columns
district_means = climate_df.groupby(['district'], as_index=True)[var_cols].transform('mean')
climate_df = (climate_df[var_cols] - district_means).reset_index(drop=False)

#########################################     TRAIN / TEST SPLIT    #########################################
x_all = climate_df.drop(columns=drop_cols)
y_all = climate_df["yield_mt"]
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2, random_state=0
)

#########################################     K-FOLD CV   ###########################################
### SETUP
alphas = dict(alpha=np.logspace(-8, 8, num=17, base=10))
kfold = KFold()
ridge = Ridge(random_state=0)
### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# fit() returns the fitted search object, so construction and fitting can be chained.
ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=kfold).fit(x_train, y_train)
best_model = ridge_reg.best_estimator_
### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
val_predictions = cross_val_predict(best_model, X=x_train, y=y_train, cv=kfold)
train_predictions = best_model.predict(x_train)
test_predictions = best_model.predict(x_test)

#########################################     ASSEMBLE PREDICTIONS    #########################################
# Anomaly predictions for every row (no flooring here: anomalies may be negative).
crop_yield["prediction"] = best_model.predict(x_all)

# Training rows carry their out-of-fold prediction; test rows get NaN in that column.
train_split = pd.DataFrame({'split': np.full(len(x_train), 'train')}, index=x_train.index)
train_split = train_split.join(crop_yield.loc[crop_yield.index.isin(x_train.index)])
train_split['cv_prediction'] = val_predictions

test_split = pd.DataFrame({'split': np.full(len(x_test), 'test')}, index=x_test.index)
test_split = test_split.join(crop_yield.loc[crop_yield.index.isin(x_test.index)])
test_split['cv_prediction'] = np.full(len(x_test), np.nan)

predictions = pd.concat([train_split, test_split])

val_r2 = r2_score(y_train, val_predictions)
test_r2 = r2_score(y_test, test_predictions)
print(f'Val  R2: {val_r2:0.4f}\nTest R2: {test_r2:0.4f}')

Val  R2: 0.5284
Test R2: 0.5145
