# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
## import warnings
import time
import math
import os
import glob
from pyhere import here
from datetime import date
import re

import numpy as np
import pandas as pd
import geopandas
import pickle

import pyarrow
import itertools
import multiprocessing
import p_tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import train_test_split, KFold, LeaveOneGroupOut, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import r2_score
from scipy.stats import spearmanr,  pearsonr

In [2]:
def split_fn(file_name):
    """Parse the run settings encoded in a summary feature file name.

    The name is split on underscores and the first ten fields are read by
    position; each field's label text (e.g. ``"bands-"``, ``"k-points"``) is
    removed with ``str.replace``.

    Returns a 10-tuple in this order:
    ``(satellite, bands, country_code, points, yrs, mns, num_features,
    limit_months, crop_mask, weighted_avg)``.

    ``satellite`` and ``country_code`` are returned as ONE-ELEMENT TUPLES
    (all other items are plain strings). Callers in this file index
    ``satellite[0]`` and rely on the tuple form, so it must be kept.

    Raises ``IndexError`` when the name has fewer than ten underscore
    separated fields.
    """
    parts = file_name.split("_")

    def strip_label(position, label):
        # Remove the label text from the field at the given position.
        return parts[position].replace(label, "")

    # Note the return order: years and months come before the feature count,
    # although the feature count appears earlier in the file name.
    return (
        (parts[0],),
        strip_label(1, "bands-"),
        (parts[2],),
        strip_label(3, "k-points"),
        strip_label(5, "yr-"),
        strip_label(6, "mn-"),
        strip_label(4, "-features"),
        strip_label(7, "lm-"),
        strip_label(8, "cm-"),
        strip_label(9, "wa-"),
    )

def merge(x, bases = (tuple, list)):
    """Lazily flatten arbitrarily nested containers into a single stream.

    Walks ``x`` and yields its leaves in order. An element is descended into
    only when its exact type is listed in ``bases`` (subclasses are treated
    as leaves); every other element is yielded unchanged.
    """
    for item in x:
        if type(item) not in bases:
            yield item
            continue
        # Nested container: recurse and forward its leaves.
        yield from merge(item, bases)

In [3]:
# List the summary feature files, skipping the housekeeping entries.
files = os.listdir(here("data", "random_features", 'summary'))
files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]

point_pattern = re.compile("20k-points")
crop_mask_pattern = re.compile("cm-False")
# Historical name kept so existing references still resolve; note that it
# matches the crop-mask flag ("cm-False"), not the weighted-average flag.
wa_pattern = crop_mask_pattern

# A file is excluded when its name contains BOTH "20k-points" and "cm-False".
# Sorting makes the resulting pair list reproducible from run to run.
eligible_files = sorted(
    f for f in files
    if not (point_pattern.search(f) and crop_mask_pattern.search(f))
)

# Every unordered pair of two distinct eligible files, each pair in sorted
# order. This is the same set of pairs as building the full cartesian product
# and de-duplicating it, without the quadratic intermediate lists.
paramlist = list(itertools.combinations(eligible_files, 2))
len(paramlist)

946

In [4]:
# for params in paramlist[0:1]:
def _fit_metrics(observed, predicted):
    """Return ``(R2, r, r ** 2)`` for one set of observed/predicted values.

    ``R2`` comes from ``r2_score`` and ``r`` is the Pearson correlation from
    ``pearsonr``; the correlation is computed once and reused for ``r ** 2``.
    """
    r = pearsonr(predicted, observed)[0]
    return r2_score(observed, predicted), r, r ** 2


def _fold_frame(years, districts, split, observed, predicted):
    """Build the long-format results table for one leave-one-year-out fold."""
    return pd.DataFrame({
        'year': years,
        'district': districts,
        'split': split,
        'observed': observed,
        'predicted': predicted,
    })


def model_2_sensors(params):
    """Fit ridge models on the joined features of two summary files.

    Args:
        params: Sequence whose first two items are the file names of two
            feature summaries located under ``data/random_features/summary``.

    Returns:
        A one-row ``pandas.DataFrame`` holding the settings parsed from both
        file names plus the k-fold and leave-one-year-out fit statistics.

    Steps: read both feather files, keep the years common to both, join them
    on (district, year, yield_mt), standardize the features, convert yield
    (log10(yield + 1)) and features to per-district anomalies, then evaluate
    a ridge regression with (a) a random 80/20 split tuned by k-fold CV and
    (b) a loop that holds out each year in turn, tuned by leave-one-group-out
    CV over the remaining years.
    """
#########################################     SET PARAMS    #########################################
    f1         = params[0]
    f2         = params[1]

    # split_fn returns the satellite and the country code as one-element
    # tuples. The country code parsed from f2 overwrites the one from f1.
    (satellite1, bands1, country_code, points1, _yrs1, mns1,
     num_features1, limit_months1, crop_mask1, weighted_avg1) = split_fn(f1)

    (satellite2, bands2, country_code, points2, _yrs2, mns2,
     num_features2, limit_months2, crop_mask2, weighted_avg2) = split_fn(f2)

    alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
    kfold = KFold()
    logo = LeaveOneGroupOut()
    ridge = Ridge()

#########################################     READ DATA    #########################################
    features_1 = pd.read_feather(here('data', 'random_features', 'summary', f1))
    features_2 = pd.read_feather(here('data', 'random_features', 'summary', f2))

#########################################     CLEAN DATA    #########################################
    # Restrict both tables to the span of years covered by both of them.
    min_year = max(min(features_1.year), min(features_2.year))
    max_year = min(max(features_1.year), max(features_2.year))

    features_1 = features_1[(features_1.year >= min_year) & (features_1.year <= max_year)]
    features_2 = features_2[(features_2.year >= min_year) & (features_2.year <= max_year)]

    # Non-inplace drop: the frames above are filtered selections, and an
    # inplace drop on them can trigger pandas' SettingWithCopy warning.
    features_1 = features_1.drop(columns=['crop_perc'], errors='ignore')
    features_2 = features_2.drop(columns=['crop_perc'], errors='ignore')

#########################################     JOIN DATA    #########################################
    drop_cols = ['district', 'year', 'yield_mt']

    features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
    features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

    # Left join on the shared index, then drop every row that has a missing
    # value anywhere (including in the key columns).
    features = features_1.join(features_2).reset_index()
    features = features[~features.isna().any(axis = 1)]

#########################################    STANDARDIZE FEATURES    #########################################
    # NOTE(review): the scaler is fit on all rows before any train/test split,
    # so held-out rows contribute to the scaling statistics.
    features = features.set_index(drop_cols)
    features_scaled = StandardScaler().fit_transform(features.values)
    features = pd.DataFrame(
        features_scaled, index=features.index, columns=features.columns
    ).reset_index()

#########################################     CALCULATE ANOMALY   #########################################
    # NOTE(review): district means are taken over all years, including the
    # years later held out for testing.
    features['yield_mt'] = np.log10(features.yield_mt + 1)
    features = features.set_index(['year', 'district'])
    var_cols = features.columns
    features = features[var_cols] - features.groupby(['district'], as_index=True)[var_cols].transform('mean')
    features = features.reset_index(drop=False)

#########################################     K-FOLD SPLIT    #########################################
    x_all = features.drop(drop_cols, axis=1)
    y_all = features.yield_mt
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

#########################################     K-FOLD CV   ###########################################
    ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
    kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
    kfold_ridge_reg.fit(x_train, y_train)
    kfold_best_model = kfold_ridge_reg.best_estimator_
    ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
    kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)
    y_pred_train_k = kfold_best_model.predict(x_train)
    y_pred_test_k  = kfold_best_model.predict(x_test)

#########################################     LOGO ITERATOR   ###########################################
    val_frames = []
    train_frames = []
    test_frames = []

    for year in features.year.unique():
#########################################     LOGO SPLIT   ###########################################
        # Build the held-out mask once and reuse it for every slice.
        held_out = features.year == year
        train_rows = features[~held_out]
        test_rows = features[held_out]

        # Series.ravel() is deprecated in recent pandas; to_numpy() yields the
        # same 1-D arrays.
        x_train_g = train_rows.drop(drop_cols, axis=1)
        y_train_g = train_rows.yield_mt.to_numpy()
        g_train_g = train_rows.year.to_numpy()
        d_train_g = train_rows.district.to_numpy()

        x_test_g = test_rows.drop(drop_cols, axis=1)
        y_test_g = test_rows.yield_mt.to_numpy()
        g_test_g = test_rows.year.to_numpy()
        d_test_g = test_rows.district.to_numpy()

#########################################     LOGO CV   ###########################################
        ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
        logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
        logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
        logo_best_model = logo_ridge_reg.best_estimator_
        ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
        logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo)
        logo_train_pred = logo_best_model.predict(x_train_g)
        logo_test_pred  = logo_best_model.predict(x_test_g)

#########################################     LOGO RESULTS   ###########################################
        val_frames.append(_fold_frame(g_train_g, d_train_g, 'val', y_train_g, logo_val_predictions))
        train_frames.append(_fold_frame(g_train_g, d_train_g, 'train', y_train_g, logo_train_pred))
        test_frames.append(_fold_frame(g_test_g, d_test_g, 'test', y_test_g, logo_test_pred))

#########################################     COMBINE RESULTS   ###########################################
    # Concatenating per-fold frames keeps numeric dtypes for the observed and
    # predicted columns. Each (year, district) appears once per fold in which
    # that year was part of the training data, hence the averaged summaries.
    val_df   = pd.concat(val_frames, ignore_index=True)
    train_df = pd.concat(train_frames, ignore_index=True)
    test_df  = pd.concat(test_frames, ignore_index=True)

    group_cols = ['year', 'district', 'split']
    val_summary   =   val_df.groupby(group_cols, as_index=False).mean()
    train_summary = train_df.groupby(group_cols, as_index=False).mean()

#########################################     FIT STATISTICS    #########################################
    kfold_val_R2, kfold_val_r, kfold_val_r2 = _fit_metrics(y_train, kfold_val_predictions)
    kfold_train_R2, kfold_train_r, kfold_train_r2 = _fit_metrics(y_train, y_pred_train_k)
    kfold_test_R2, kfold_test_r, kfold_test_r2 = _fit_metrics(y_test, y_pred_test_k)

    logo_summary_val_R2, logo_summary_val_r, _ = _fit_metrics(val_summary.observed, val_summary.predicted)
    logo_val_R2, logo_val_r, logo_val_r2 = _fit_metrics(val_df.observed, val_df.predicted)

    logo_summary_train_R2, logo_summary_train_r, _ = _fit_metrics(train_summary.observed, train_summary.predicted)
    logo_train_R2, logo_train_r, logo_train_r2 = _fit_metrics(train_df.observed, train_df.predicted)

    logo_test_R2, logo_test_r, logo_test_r2 = _fit_metrics(test_df.observed, test_df.predicted)

#########################################     SAVE RESULTS    #########################################
    d = {
        # One-element tuple: it is the only non-scalar value in this dict and
        # is what lets pd.DataFrame(data=d) below build a single row.
        'country': country_code,

        'satellite_1'   : satellite1[0],
        'bands_1'       : bands1,
        'num_features_1': num_features1,
        'points_1'      : points1,
        'month_range_1' : mns1,
        'limit_months_1': limit_months1,
        'crop_mask_1'   : crop_mask1,
        'weighted_avg_1': weighted_avg1,

        'satellite_2'   : satellite2[0],
        'bands_2'       : bands2,
        'num_features_2': num_features2,
        'points_2'      : points2,
        'month_range_2' : mns2,
        'limit_months_2': limit_months2,
        'crop_mask_2'   : crop_mask2,
        'weighted_avg_2': weighted_avg2,

        'kfold_total_n': len(x_all),
        'kfold_train_n': len(x_train),
        'kfold_test_n' : len(x_test),

        'kfold_best_reg_param': list(kfold_ridge_reg.best_params_.values())[0],
        'kfold_mean_of_val_R2s': kfold_ridge_reg.best_score_,
        'kfold_val_R2': kfold_val_R2,
        'kfold_val_r' : kfold_val_r,
        'kfold_val_r2': kfold_val_r2,

        'kfold_train_R2': kfold_train_R2,
        'kfold_train_r' : kfold_train_r,
        'kfold_train_r2': kfold_train_r2,

        'kfold_test_R2': kfold_test_R2,
        'kfold_test_r' : kfold_test_r,
        'kfold_test_r2': kfold_test_r2,

        'logo_total_n': len(features),
        'logo_train_n': len(train_df),
        'logo_test_n' : len(test_df),

        # NOTE(review): this is the alpha selected in the LAST held-out-year
        # iteration only; earlier folds may have selected different values.
        'logo_best_reg_param': list(logo_ridge_reg.best_params_.values())[0],
        'logo_summary_val_R2': logo_summary_val_R2,
        'logo_summary_val_r' : logo_summary_val_r,
        'logo_val_R2' : logo_val_R2,
        'logo_val_r'  : logo_val_r,
        'logo_val_r2' : logo_val_r2,

        'logo_summary_train_R2': logo_summary_train_R2,
        'logo_summary_train_r' : logo_summary_train_r,
        'logo_train_R2': logo_train_R2,
        'logo_train_r' : logo_train_r,
        'logo_train_r2': logo_train_r2,

        'logo_test_R2': logo_test_R2,
        'logo_test_r' : logo_test_r,
        'logo_test_r2': logo_test_r2,
    }
    print('done')
    df = pd.DataFrame(data=d)
    return df

In [None]:
%%time    
#### No progress bar
# Fan the file pairs out over one worker process per reported CPU, then stack
# the one-row result frames and write them to a date-stamped CSV.
workers = os.cpu_count()
if __name__ == "__main__":
    with multiprocessing.Pool(processes=workers) as pool:
        # Results arrive in completion order, not in paramlist order.
        output = list(pool.imap_unordered(model_2_sensors, paramlist))
    results = pd.concat(output).reset_index(drop=True)
    today = date.today().strftime("%Y-%m-%d")
    file_name = f'2_sensor_anomaly_results_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")
    out_path = here("data","results", file_name)
    results.to_csv(out_path, index=False)

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [None]:
# Display the combined results table assembled by the multiprocessing cell.
results

###### max(results.logo_val_R2)

In [None]:
# %%time     
# ##### With progress bar
# workers = os.cpu_count()
# if __name__ == "__main__":
#     output = []
#     for result in p_tqdm.p_umap(model_2_sensors, paramlist):
#         output.append(result)
#     results = pd.concat(output).reset_index(drop=True)
#     today = date.today().strftime("%Y-%m-%d")
#     file_name = f'2_sensor_anomaly_results_{today}.csv'
#     print(f"Saving results as: {file_name}\n\n")           
#     results.to_csv(here("data","results", file_name), index=False)

  0%|          | 0/946 [00:00<?, ?it/s]

In [None]:
# %%time
# #########################################     SET PARAMS    #########################################    
# f1 = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-True_wa-True_summary.feather'
# f2 = 'sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-True_summary.feather'

# satellite1, bands1, country_code, points1, yrs1, mns1,\
# num_features1, limit_months1, crop_mask1, weighted_avg1 = split_fn(f1)

# satellite2, bands2, country_code, points2, yrs2, mns2,\
# num_features2, limit_months2, crop_mask2, weighted_avg2 = split_fn(f2)
# alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}

# #########################################     READ DATA    #########################################
# features_1 = pd.read_feather(here('data', 'random_features', 'summary', f1))
# features_2 = pd.read_feather(here('data', 'random_features', 'summary', f2))

# #########################################     CLEAN DATA    #########################################  
# min_year = max(min(features_1.year), min(features_2.year))
# max_year = min(max(features_1.year), max(features_2.year))

# features_1 = features_1[features_1.year >= min_year]
# features_2 = features_2[features_2.year >= min_year]

# features_1 = features_1[features_1.year <= max_year]
# features_2 = features_2[features_2.year <= max_year]

# features_1.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
# features_2.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

# drop_cols = ['district', 'year', 'yield_mt']

# features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
# features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

# #########################################     JOIN DATA    #########################################  
# features = features_1.join(features_2).reset_index()
# features = features[~features.isna().any(axis = 1)]

# #########################################    STANDARDIZE FEATURES    #########################################    
# features = features.set_index(drop_cols) 
# features_scaled = StandardScaler().fit_transform(features.values)
# features = pd.DataFrame(features_scaled, index=features.index).reset_index()

# #########################################     CALCULATE ANOMALY   #########################################  
# features['yield_mt'] = np.log10(features.yield_mt + 1)
# features.set_index(['year', 'district'], inplace=True)
# var_cols = features.columns
# features = features[var_cols] - features.groupby(['district'], as_index=True)[var_cols].transform('mean')
# features.reset_index(drop=False, inplace=True)

# #########################################     K-FOLD SPLIT    #########################################    
# x_all = features.drop(drop_cols, axis=1)
# y_all = features.yield_mt
# x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

# #########################################     K-FOLD CV    ###########################################
# kfold = KFold()
# ridge = Ridge()
# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
# kfold_ridge_reg.fit(x_train, y_train)
# kfold_best_model = kfold_ridge_reg.best_estimator_
# ### VALIDATION PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)   
# ### TRAIN AND TEST PREDICT
# y_pred_train_k = kfold_best_model.predict(x_train)
# y_pred_test_k  = kfold_best_model.predict(x_test)

# #########################################     LOGO SPLIT   ###########################################
# x_train_g = features[features.year < max(features.year)].drop(drop_cols, axis=1)
# y_train_g = features[features.year < max(features.year)].yield_mt
# g_train_g = features[features.year < max(features.year)].year.ravel()

# x_test_g = features[features.year == max(features.year)].drop(drop_cols, axis=1)
# y_test_g = features[features.year == max(features.year)].yield_mt
# g_test_g = features[features.year == max(features.year)].year

# #########################################     LOGO CV    ###########################################
# logo = LeaveOneGroupOut()
# ridge = Ridge()
# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
# logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
# logo_best_model = logo_ridge_reg.best_estimator_
# ### VALIDATION PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo)   
# ### TRAIN AND TEST PREDICT
# logo_train_pred = logo_best_model.predict(x_train_g)
# logo_test_pred  = logo_best_model.predict(x_test_g)

In [None]:
# r2_score(y_train_g, logo_val_predictions)

In [None]:
# r2_score(y_test_g, logo_test_pred)

In [6]:
# %%time
# #########################################     SET PARAMS    #########################################    
# f1 = 'landsat-8-c2-l2_bands-1-2-3-4-5-6-7_ZMB_20k-points_1000-features_yr-2013-2021_mn-4-9_lm-True_cm-True_wa-True_summary.feather'
# f2 = 'sentinel-2-l2a_bands-2-3-4_ZMB_4k-points_1000-features_yr-2016-2022_mn-1-12_lm-False_cm-True_wa-True_summary.feather'

# satellite1, bands1, country_code, points1, yrs1, mns1,\
# num_features1, limit_months1, crop_mask1, weighted_avg1 = split_fn(f1)

# satellite2, bands2, country_code, points2, yrs2, mns2,\
# num_features2, limit_months2, crop_mask2, weighted_avg2 = split_fn(f2)

# alphas = {'alpha': np.logspace(-8, 8, base = 10, num = 17)}
# kfold = KFold()
# logo = LeaveOneGroupOut()
# ridge = Ridge()    

# #########################################     READ DATA    #########################################
# features_1 = pd.read_feather(here('data', 'random_features', 'summary', f1))
# features_2 = pd.read_feather(here('data', 'random_features', 'summary', f2))

# #########################################     CLEAN DATA    #########################################  
# min_year = max(min(features_1.year), min(features_2.year))
# max_year = min(max(features_1.year), max(features_2.year))

# features_1 = features_1[features_1.year >= min_year]
# features_2 = features_2[features_2.year >= min_year]

# features_1 = features_1[features_1.year <= max_year]
# features_2 = features_2[features_2.year <= max_year]

# features_1.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
# features_2.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)

# #########################################     JOIN DATA    #########################################  
# drop_cols = ['district', 'year', 'yield_mt']

# features_1 = features_1.set_index(drop_cols).add_prefix("f1_")
# features_2 = features_2.set_index(drop_cols).add_prefix("f2_")

# features = features_1.join(features_2).reset_index()
# features = features[~features.isna().any(axis = 1)]

# #########################################    STANDARDIZE FEATURES    #########################################    
# features = features.set_index(drop_cols) 
# features_scaled = StandardScaler().fit_transform(features.values)
# features = pd.DataFrame(features_scaled, index=features.index).reset_index()

# #########################################     CALCULATE ANOMALY   #########################################  
# features['yield_mt'] = np.log10(features.yield_mt + 1)
# features.set_index(['year', 'district'], inplace=True)
# var_cols = features.columns
# features = features[var_cols] - features.groupby(['district'], as_index=True)[var_cols].transform('mean')
# features.reset_index(drop=False, inplace=True)

# #########################################     K-FOLD SPLIT    #########################################    
# x_all = features.drop(drop_cols, axis=1)
# y_all = features.yield_mt
# x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=0)

# #########################################     K-FOLD CV   ###########################################
# ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
# kfold_ridge_reg = GridSearchCV(ridge, alphas, scoring = 'r2', cv = kfold)
# kfold_ridge_reg.fit(x_train, y_train)
# kfold_best_model = kfold_ridge_reg.best_estimator_
# ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
# kfold_val_predictions = cross_val_predict(kfold_best_model, X = x_train, y = y_train, cv = kfold)   
# y_pred_train_k = kfold_best_model.predict(x_train)
# y_pred_test_k  = kfold_best_model.predict(x_test)

# #########################################     LOGO ITERATOR   ###########################################
# logo_val_results = []
# logo_train_results = []
# logo_test_results = []

# for year in features.year.unique():
# #########################################     LOGO SPLIT   ###########################################
#     x_train_g = features[features.year != year].drop(drop_cols, axis=1)
#     y_train_g = features[features.year != year].yield_mt.ravel()
#     g_train_g = features[features.year != year].year.ravel()
#     d_train_g = features[features.year != year].district.ravel()

#     x_test_g = features[features.year == year].drop(drop_cols, axis=1)
#     y_test_g = features[features.year == year].yield_mt.ravel()
#     g_test_g = features[features.year == year].year.ravel()
#     d_test_g = features[features.year == year].district.ravel()

# #########################################     LOGO CV   ###########################################
#     ### GRID SEARCH - FINDING BEST REGULARIZATION PARAMETER
#     logo_ridge_reg = GridSearchCV(ridge, alphas, scoring='r2', cv=logo)
#     logo_ridge_reg.fit(x_train_g, y_train_g, groups=g_train_g)
#     logo_best_model = logo_ridge_reg.best_estimator_
#     ### PREDICT - PREDICTING WITH BEST HYPERPARAMETER
#     logo_val_predictions = cross_val_predict(logo_best_model, X=x_train_g, y=y_train_g, groups=g_train_g, cv=logo) 
#     logo_train_pred = logo_best_model.predict(x_train_g)
#     logo_test_pred  = logo_best_model.predict(x_test_g)

# #########################################     LOGO RESULTS   ###########################################
#     val_results = {'year': g_train_g, 'district': d_train_g, 'split': 'val', 
#                    'observed': y_train_g, 'predicted': logo_val_predictions}

#     train_results = {'year': g_train_g, 'district': d_train_g,'split': 'train', 
#                      'observed': y_train_g, 'predicted': logo_train_pred}

#     test_results = {'year': g_test_g, 'district': d_test_g, 'split': 'test', 
#                     'observed': y_test_g, 'predicted': logo_test_pred}

#     logo_val_results.append(val_results)
#     logo_train_results.append(train_results)
#     logo_test_results.append(test_results)

# #########################################     EXPLODE RESULTS   ###########################################
# explode_cols = ['year', 'district', 'observed', 'predicted']
# val_df   = pd.DataFrame(logo_val_results  ).explode(explode_cols) 
# train_df = pd.DataFrame(logo_train_results).explode(explode_cols) 
# test_df  = pd.DataFrame(logo_test_results ).explode(explode_cols)

# group_cols = ['year', 'district', 'split']
# val_summary   =   val_df.groupby(group_cols, as_index=False).mean()
# train_summary = train_df.groupby(group_cols, as_index=False).mean()

# #########################################     SAVE RESULTS    #########################################
# d = {
#     'country': country_code,

#     'satellite_1'   : satellite1[0],
#     'bands_1'       : bands1,
#     'num_features_1': num_features1,
#     'points_1'      : points1, 
#     'month_range_1' : mns1,
#     'limit_months_1': limit_months1,
#     'crop_mask_1'   : crop_mask1,
#     'weighted_avg_1': weighted_avg1,

#     'satellite_2'   : satellite2[0],
#     'bands_2'       : bands2,
#     'num_features_2': num_features2,
#     'points_2'      : points2, 
#     'month_range_2' : mns2,
#     'limit_months_2': limit_months2,
#     'crop_mask_2'   : crop_mask2,
#     'weighted_avg_2': weighted_avg2,

#     'kfold_total_n': len(x_all),
#     'kfold_train_n': len(x_train),
#     'kfold_test_n' : len(x_test),

#     'kfold_best_reg_param': list(kfold_ridge_reg.best_params_.values())[0],
#     'kfold_mean_of_val_R2s': kfold_ridge_reg.best_score_,
#     'kfold_val_R2': r2_score(y_train, kfold_val_predictions),
#     'kfold_val_r' : pearsonr(kfold_val_predictions, y_train)[0],
#     'kfold_val_r2': pearsonr(kfold_val_predictions, y_train)[0] ** 2,

#     'kfold_train_R2': r2_score(y_train, y_pred_train_k),
#     'kfold_train_r' : pearsonr(y_pred_train_k, y_train)[0],
#     'kfold_train_r2': pearsonr(y_pred_train_k, y_train)[0] ** 2,

#     'kfold_test_R2': r2_score(y_test, y_pred_test_k),
#     'kfold_test_r' : pearsonr(y_pred_test_k, y_test)[0],
#     'kfold_test_r2': pearsonr(y_pred_test_k, y_test)[0] ** 2,

#     'logo_total_n': len(features),
#     'logo_train_n': len(train_df),
#     'logo_test_n' : len(test_df),    

#     'logo_best_reg_param': list(logo_ridge_reg.best_params_.values())[0],      
#     'logo_summary_val_R2': r2_score(val_summary.observed, val_summary.predicted),
#     'logo_summary_val_r' : pearsonr(val_summary.observed, val_summary.predicted)[0],
#     'logo_val_R2' : r2_score(val_df.observed, val_df.predicted),
#     'logo_val_r'  : pearsonr(val_df.predicted, val_df.observed)[0],
#     'logo_val_r2' : pearsonr(val_df.predicted, val_df.observed)[0] ** 2,

#     'logo_summary_train_R2': r2_score(train_summary.observed, train_summary.predicted),
#     'logo_summary_train_r' : pearsonr(train_summary.observed, train_summary.predicted)[0],
#     'logo_train_R2': r2_score(train_df.observed, train_df.predicted),
#     'logo_train_r' : pearsonr(train_df.predicted, train_df.observed)[0],
#     'logo_train_r2': pearsonr(train_df.predicted, train_df.observed)[0] ** 2,

#     'logo_test_R2': r2_score(test_df.observed, test_df.predicted),
#     'logo_test_r' : pearsonr(test_df.predicted, test_df.observed)[0],
#     'logo_test_r2': pearsonr(test_df.predicted, test_df.observed)[0] ** 2,
# }
# print('done')
# df = pd.DataFrame(data=d)

done
CPU times: user 22min 33s, sys: 1h 8min 22s, total: 1h 30min 55s
Wall time: 3min 14s


In [None]:
# print(f'''Val: {r2_score(val_df.observed, val_df.predicted):0.2f}
# Train: {r2_score(train_df.observed, train_df.predicted):0.2f}
# Test: {r2_score(test_df.observed, test_df.predicted):0.2f}

# Val summary: {r2_score(val_summary.observed, val_summary.predicted):0.2f}
# Train summary: {r2_score(train_summary.observed, train_summary.predicted):0.2f}''')

In [None]:
# for year in features.year.unique():
#     a = val_df[val_df.year == year]
#     print(f'{year}: {r2_score(a.observed, a.predicted)}')

In [None]:
# for year in features.year.unique():
#     a = train_df[train_df.year == year]
#     print(f'{year}: {r2_score(a.observed, a.predicted)}')

In [None]:
# for year in features.year.unique():
#     a = test_df[test_df.year == year]
#     print(f'{year}: {r2_score(a.observed, a.predicted)}')

In [None]:
# train = x_train_g.copy()
# test = x_test_g.copy()
# train['logo_cv_prediction'] = logo_val_predictions
# train['split'], test['split'] = 'train', 'test'
# train_test = pd.concat([train, test])[['split','logo_cv_prediction']].sort_index()
# train_test['district'] = features.district
# train_test['year'] = features.year
# train_test['yield_mt'] = y_all
# train_test['logo_prediction'] = logo_best_model.predict(x_all)
# train_test = train_test[['district', 'year', 'split', 'yield_mt', 'logo_prediction', 'logo_cv_prediction']]
# train_test

In [None]:
# import matplotlib.pyplot as plt
# train_test[train_test.split == 'train'].plot.scatter(x = 'yield_mt', y = 'logo_cv_prediction')

In [None]:
# plt.scatter(train_test.yield_mt, train_test.logo_prediction)

In [None]:
# train_test[train_test.split == 'train'].plot.scatter(x = 'yield_mt', y = 'logo_prediction')

In [None]:
# train_test[train_test.split == 'test'].plot.scatter(x = 'yield_mt', y = 'logo_prediction')