# Modeling Crop Yield: Landsat + Sentinel
## Python modules

In [1]:
import warnings
import time
import os
import glob
from datetime import date

import multiprocessing
import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

import itertools

from pyhere import here

import math
import seaborn as sns

from pyhere import here

In [2]:
def split_fn(file_name):
    f            = file_name.split(sep="_")
    satellite    = f[0],
    bands        = f[1].replace("bands-", "")
    country_code = f[2],
    points       = f[3].replace("k-points", "")
    num_features = f[4].replace("-features", "")
    yrs          = f[5].replace("yr-", "")
    mns          = f[6].replace("mn-", "")
    limit_months = f[7].replace("lm-", "")
    crop_mask    = f[8].replace("cm-", "")
    weighted_avg = f[9].replace("wa-", "")
    
    return satellite, bands, country_code, points, yrs, mns, num_features, limit_months, crop_mask, weighted_avg

def merge(x, bases = (tuple, list)):
    for e in x:
        if type(e) in bases:
            for e in merge(e, bases):
                yield e
        else:
            yield e

In [3]:
files = os.listdir(here("data", "random_features", 'summary_2'))
files = [f for f in files if f not in ('.gitkeep', '.ipynb_checkpoints')]
# files = files[0:8]
paramlist = list(itertools.product(files, files))
paramlist = [tuple(set(paramlist[i])) for i in range(len(paramlist))]
paramlist = [x for x in paramlist if len(x) > 1] 
paramlist = list(itertools.product(paramlist, [True, False]))
for i in range(len(paramlist)):
    paramlist[i] = tuple(merge(paramlist[i]))
len(paramlist)

6160

In [4]:
# for params in paramlist[0:1]:
def model_2_sensors(params):
    
    f1 = params[0]
    f2 = params[1]
    hot_encode = params[2]
        
    # print(f'''
    # {f1}
    # {f2}
    # ''')
    
    satellite1, bands1, country_code, points1, yrs1, mns1,\
    num_features1, limit_months1, crop_mask1, weighted_avg1 = split_fn(f1)
    
    satellite2, bands2, country_code, points2, yrs2, mns2,\
    num_features2, limit_months2, crop_mask2, weighted_avg2 = split_fn(f2)
    
    features_1 = pd.read_feather(here('data', 'random_features', 'summary_2', f1))
    features_2 = pd.read_feather(here('data', 'random_features', 'summary_2', f2))
    
    min_year = max(min(features_1.year), min(features_2.year))
    max_year = min(max(features_1.year), max(features_2.year))
    
    features_1 = features_1[features_1.year >= min_year]
    features_2 = features_2[features_2.year >= min_year]
    
    features_1 = features_1[features_1.year <= max_year]
    features_2 = features_2[features_2.year <= max_year]
    
    features_1.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
    features_2.drop(['crop_perc'], axis=1, errors='ignore', inplace=True)
    
    index_cols = ['district', 'year', 'yield_mt']
    
    features_1 = features_1.set_index(index_cols).add_prefix("f1_")
    features_2 = features_2.set_index(index_cols).add_prefix("f2_")
    
    features = features_1.join(features_2).reset_index()
    
    features = features[~features.isna().any(axis = 1)]
    
    crop_yield = features.copy().loc[:, tuple(index_cols)]
    
    if hot_encode:
        index_cols.remove('district')
        features = pd.get_dummies(features, columns = ["district"], drop_first = False)
    else:
        pass

    x_all = features.drop(index_cols, axis=1)
    x_all = StandardScaler().fit_transform(x_all)
    y_all = np.log10(features.yield_mt.to_numpy() + 1)
    
    x_train, x_test, y_train, y_test = train_test_split(
        x_all, y_all, test_size = 0.2, random_state = 0
    )

    ridge_cv_random = RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17))
    ridge_cv_random.fit(x_train, y_train)
    
    y_pred_train = np.maximum(ridge_cv_random.predict(x_train), 0)
    r2_train = r2_score(y_train, y_pred_train)
    pearson_train = pearsonr(y_pred_train, y_train)[0]

    y_pred_test = np.maximum(ridge_cv_random.predict(x_test), 0)
    r2_test = r2_score(y_test, y_pred_test)
    pearson_test = pearsonr(y_pred_test, y_test)[0]
    

    crop_yield["log_yield"] = np.log10(crop_yield.yield_mt.to_numpy() + 1)
    crop_yield["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
    crop_yield["residual"] = crop_yield["log_yield"] - crop_yield["prediction"]
    crop_yield["district_yield_mean"] = crop_yield.groupby('district')['log_yield'].transform('mean')
    crop_yield["district_prediction_mean"] = crop_yield.groupby('district')['prediction'].transform('mean')
    crop_yield["demean_yield"] = crop_yield["log_yield"] - crop_yield["district_yield_mean"]
    crop_yield["demean_prediction"] = crop_yield["prediction"] - crop_yield["district_prediction_mean"]

    r_squared = r2_score(crop_yield["demean_yield"], crop_yield["demean_prediction"])
    pearson_r = pearsonr(crop_yield["demean_yield"], crop_yield["demean_prediction"])[0]

    d = {
        'country': country_code,
        'satellite_1': satellite1[0],
        'bands_1': bands1,
        'num_features_1': num_features1,
        'points_1': points1, 
        'month_range_1': mns1,
        'limit_months_1': limit_months1,
        'crop_mask_1': crop_mask1,
        'weighted_avg_1': weighted_avg1,
        
        'satellite_2': satellite2[0],
        'bands_2': bands2,
        'num_features_2': num_features2,
        'points_2': points2, 
        'month_range_2': mns2,
        'limit_months_2': limit_months2,
        'crop_mask_2': crop_mask2,
        'weighted_avg_2': weighted_avg2,
        
        'hot_encode': hot_encode,
        'total_n': len(x_all),
        'train_n': len(x_train),
        'test_n': len(x_test),
        'reg_param': ridge_cv_random.alpha_,
        'val_R2': ridge_cv_random.best_score_,
        'train_R2': r2_train,
        'train_r': pearson_train,
        'train_r2': pearson_train ** 2,
        'test_R2': r2_test,
        'test_r': pearson_test,
        'test_r2': pearson_test ** 2,
        'demean_R2': r_squared,
        'demean_r': pearson_r,
        'demean_r2': pearson_r ** 2,
    }
    df = pd.DataFrame(data=d)
    return df

In [6]:
%%time
workers = os.cpu_count()
if __name__ == "__main__":
    with multiprocessing.Pool(processes=workers) as pool:
        output = []
        for result in pool.imap_unordered(model_2_sensors, paramlist):
            output.append(result)
    results = pd.concat(output).reset_index(drop=True)
    today = date.today().strftime("%Y-%m-%d")
    file_name = f'2_sensor_results_{today}.csv'
    print(f"Saving results as: {file_name}\n\n")           
    results.to_csv(here("data","results", file_name))

Saving results as: 2_sensor_results_2022-10-26.csv


CPU times: user 39.9 s, sys: 5.63 s, total: 45.5 s
Wall time: 9h 36min 20s


In [7]:
results

Unnamed: 0,country,satellite_1,bands_1,num_features_1,points_1,month_range_1,limit_months_1,crop_mask_1,weighted_avg_1,satellite_2,...,val_R2,train_R2,train_r,train_r2,test_R2,test_r,test_r2,demean_R2,demean_r,demean_r2
0,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,0.555296,0.850442,0.928068,0.861310,0.653117,0.808811,0.654174,0.415556,0.666821,0.444651
1,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,0.549944,0.968851,0.985438,0.971087,0.690824,0.835215,0.697584,0.685374,0.834361,0.696159
2,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,False,True,sentinel-2-l2a,...,0.594501,0.842691,0.923179,0.852259,0.620472,0.799279,0.638847,0.537641,0.733957,0.538693
3,ZMB,sentinel-2-l2a,2-3-4,1000,4,4-9,True,True,True,sentinel-2-l2a,...,0.635487,0.901394,0.952138,0.906566,0.723401,0.856204,0.733085,0.613126,0.783429,0.613761
4,ZMB,landsat-c2-l2,r-g-b-nir-swir16-swir22,1024,20,1-12,False,True,False,sentinel-2-l2a,...,0.597158,0.965785,0.984182,0.968615,0.671178,0.830883,0.690366,0.749087,0.866002,0.749959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6155,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,sentinel-2-l2a,...,0.576397,0.845502,0.922442,0.850900,0.609676,0.785831,0.617530,0.474061,0.698992,0.488590
6156,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,sentinel-2-l2a,...,0.622032,0.924731,0.964605,0.930463,0.737015,0.858882,0.737679,0.616076,0.788622,0.621925
6157,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,sentinel-2-l2a,...,0.611735,0.870703,0.936511,0.877053,0.656675,0.811395,0.658362,0.502312,0.714181,0.510055
6158,ZMB,sentinel-2-l2a,2-3-4,1000,20,4-9,True,False,True,sentinel-2-l2a,...,0.614134,0.872751,0.937658,0.879203,0.660979,0.814372,0.663202,0.508951,0.718287,0.515936


In [7]:
# df

In [77]:
# ridge_cv_random.best_score_

In [82]:
# crop_yield

In [83]:
# fig, ax = plt.subplots()
# ax.axline([0, 0], [1, 1])
# plt.scatter(crop_yield.log_yield, crop_yield.prediction, alpha=1, s=4)

In [84]:
# fig, ax = plt.subplots()
# ax.axline([-.4, -.4], [.4, .4])
# plt.scatter(crop_yield.demean_yield, crop_yield.demean_prediction, alpha=1, s=4)