# prediction of the increase in crop yield due to precipitation

This notebook predicts the crop yield for a given German district from crop, weather, climate and soil data covering the years 1999 to 2020.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import numpy.polynomial.polynomial as poly

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingRegressor
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from modelling_functions import compute_fict

In [2]:
# Preprocessing objects used by the feature pipeline and the ensemble model.
polyfeat = PolynomialFeatures(degree=2)   # degree-2 polynomial expansion of the predictors
scaler   = StandardScaler()               # standardisation of the expanded features

# The encoder must return a dense array because its output is concatenated
# with the dense feature matrix via np.concatenate.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older versions.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)

## define crop and region, prepare data set

In [3]:
# Analysis target: the crop (key into the crop data sets) and the district code.
crop, dc = 'Zuckerrüben', '03151'

In [4]:
# Read the pickled crop data sets and keep a private copy of the entry for `crop`,
# so later cells never modify the loaded object.
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# it on a trusted data/crop_sets.pkl.
with open('data/crop_sets.pkl', mode='rb') as pkl_file:
    crop_sets = pickle.load(pkl_file)

dx = crop_sets[crop].copy()

In [5]:
# District codes as a single-column 2-D array (one row per observation).
dcode = dx[['d.code']].to_numpy()

# Fit the encoder on all district codes, then encode them:
# `ohe_` is the fitted encoder, `ohe` the one-hot encoded matrix.
ohe_ = onehot.fit(dcode)
ohe  = ohe_.transform(dcode)

In [6]:
# Predictors and target (yield, 'Ertrag'), copied so `dx` stays untouched.
predictor_cols = ['TT', 'SD', 'RR', 'SMI_OB', 'SMI_GB']
df_X = dx[predictor_cols].copy()
y    = dx['Ertrag'].copy()

# Degree-2 polynomial expansion of the predictors ...
X_poly_ = polyfeat.fit(df_X)
X_poly  = X_poly_.transform(df_X)

# ... followed by standardisation of every expanded column.
X_scal_ = scaler.fit(X_poly)
X_scal  = X_scal_.transform(X_poly)

# Full design matrix: scaled polynomial features plus one-hot encoded districts.
X_full = np.hstack((X_scal, ohe))

# Box-Cox transform of the target; the fitted lambda is needed later to invert it.
y_bxcx, bxcxlmbd = boxcox(y)

# Mean and standard deviation of the unscaled polynomial columns that contain
# x2 (the third predictor, 'RR'). Column positions in X_poly:
#   3 -> x2, 15 -> x2^2, 8 -> x0*x2, 12 -> x1*x2, 16 -> x2*x3, 17 -> x2*x4
rr_columns = (3, 15, 8, 12, 16, 17)
X_mean = [np.mean(X_poly[:, col]) for col in rr_columns]
X_sd   = [np.std(X_poly[:, col]) for col in rr_columns]

# Individual names kept alongside the lists, in the same order as `rr_columns`.
x2_mn, x22_mn, x0x2_mn, x1x2_mn, x2x3_mn, x2x4_mn = X_mean
x2_sd, x22_sd, x0x2_sd, x1x2_sd, x2x3_sd, x2x4_sd = X_sd

## bagging regressor on crop yield

In [None]:
# Fit `iterations` bagging ensembles on the full data set and, for the chosen
# district, predict the yield of every observed year under fictional
# precipitation values 0 .. max_precip-1. Per iteration the 10th/50th/90th
# percentile across years is stored; the curves are then averaged over iterations.

# number of iterations
iterations = 10
# maximum of precipitation for predictions (rows 0 .. max_precip-1)
max_precip = 550
# base random seed; iteration i uses seed + i, so a re-run reproduces the curves
seed = 42

# ---------------------------------------------------------------------------
# District-specific inputs. None of this depends on the fitted model, so it is
# built once instead of once per iteration.
# ---------------------------------------------------------------------------

# Boolean mask -> positional row indices. The NumPy feature matrices are
# positional, so index *labels* of `dx` must not be used to address them.
district_mask = (dx['d.code'] == dc).to_numpy()
if not district_mask.any():
    raise ValueError(f'district code {dc!r} not found in the data set for {crop!r}')
idx_dstrct = np.flatnonzero(district_mask)

X_dstrct      = X_full[idx_dstrct]
district_poly = X_poly[idx_dstrct]
district_scal = X_scal[idx_dstrct]

# observed year / precipitation / yield of the district (used by the result cells)
dx_ = dx.loc[district_mask, ['year', 'RR', 'Ertrag']]

# one-hot vector of the district: 1 at the district's position, 0 elsewhere
feature_names = ohe_.get_feature_names_out()
dc_pos = np.where(feature_names == 'x0_' + dc)[0][0]
dc_len = len(feature_names)
oh = np.zeros(dc_len)
oh[dc_pos] = 1

# Fictional data: one (max_precip x n_features) matrix per observed year, where
# row `newrr` is that year's feature vector with precipitation replaced by `newrr`.
# NOTE(review): building this once assumes compute_fict is a pure function of
# its arguments -- confirm in modelling_functions.
district_fict = []
for year in np.arange(0, len(district_poly)):
    # same width as the training matrix (polynomial features + one-hot columns)
    fict_data = np.empty((max_precip, X_full.shape[1]))

    for newrr in np.arange(0, max_precip):
        fict = compute_fict(X_poly=district_poly[year], X_scal=district_scal[year],
                            X_mean=X_mean, X_sd=X_sd, newRR=newrr)
        fict_data[newrr] = np.append(fict, oh)

    district_fict.append(fict_data)

n_years = len(district_fict)

# ---------------------------------------------------------------------------
# Repeated model fitting
# ---------------------------------------------------------------------------
min_val_it  = np.zeros((max_precip, iterations))
max_val_it  = np.zeros((max_precip, iterations))
mean_val_it = np.zeros((max_precip, iterations))

for iteration in np.arange(0, iterations):

    print(f'iteration: {iteration+1}/{iterations}')

    # bagging model on the Box-Cox transformed target
    bagg_reg = BaggingRegressor(n_estimators=100, n_jobs=-1,
                                random_state=seed + int(iteration)).fit(X_full, y_bxcx)
    y_bagg = bagg_reg.predict(X_full)

    # back to the original yield scale
    y_pred = inv_boxcox(y_bagg, bxcxlmbd)

    # residual (true - predicted), modelled as a quadratic function of the prediction
    y_diff = np.asarray(y) - y_pred
    pf_coef = poly.polyfit(y_pred, y_diff, deg=2)

    # corrected in-sample predictions: prediction + modelled residual
    y_corr = y_pred + poly.polyval(y_pred, pf_coef)

    # predicted yield per precipitation value (rows) and observed year (columns);
    # after the loop this holds the values of the last iteration
    values = np.zeros((max_precip, n_years))

    for j in range(n_years):
        # predict, invert Box-Cox, then apply the same residual correction
        t = inv_boxcox(bagg_reg.predict(district_fict[j]), bxcxlmbd)
        values[:, j] = t + poly.polyval(t, pf_coef)

    # percentiles across years for every precipitation value
    # (`mean_val` is the 50th percentile, i.e. the median)
    min_val, mean_val, max_val = np.percentile(values, q=[10, 50, 90], axis=1)

    min_val_it[:, iteration]  = min_val
    max_val_it[:, iteration]  = max_val
    mean_val_it[:, iteration] = mean_val

# average every percentile curve over all iterations
min_val_  = list(min_val_it.mean(axis=1))
max_val_  = list(max_val_it.mean(axis=1))
mean_val_ = list(mean_val_it.mean(axis=1))

## results

Look up the years whose predicted yield most often falls below the 10th percentile (bad years) or above the 90th percentile (good years).


In [None]:
# Bad years (q=10): for every precipitation value inside the observed range,
# find the years whose predicted yield lies below that row's 10th percentile,
# then count how often each year occurs.
# `values` holds the predictions of the last bagging iteration
# (row i = precipitation i, one column per observed year of the district).

# Clip the observed range to the rows that exist in `values`; an unclipped
# range raises IndexError when the observed maximum exceeds the simulated range.
rr_lo = max(int(dx_['RR'].min()), 0)
rr_hi = min(int(dx_['RR'].max()), values.shape[0])
rows = values[rr_lo:rr_hi]

# positions (within the district's years) of all below-percentile hits
below = rows < np.percentile(rows, q=10, axis=1, keepdims=True)
years_bad = pd.Series(np.nonzero(below)[1]).value_counts()

# the three years that are flagged most often
dx['year'][dx['d.code']==dc].iloc[years_bad.index[:3].tolist()]

In [None]:
# Good years (q=90): for every precipitation value inside the observed range,
# find the years whose predicted yield lies above that row's 90th percentile,
# then count how often each year occurs.
# `values` holds the predictions of the last bagging iteration
# (row i = precipitation i, one column per observed year of the district).

# Clip the observed range to the rows that exist in `values`; an unclipped
# range raises IndexError when the observed maximum exceeds the simulated range.
rr_lo = max(int(dx_['RR'].min()), 0)
rr_hi = min(int(dx_['RR'].max()), values.shape[0])
rows = values[rr_lo:rr_hi]

# positions (within the district's years) of all above-percentile hits
above = rows > np.percentile(rows, q=90, axis=1, keepdims=True)
years_good = pd.Series(np.nonzero(above)[1]).value_counts()

# the three years that are flagged most often
dx['year'][dx['d.code']==dc].iloc[years_good.index[:3].tolist()]

In [None]:
# Percentile curves of the predicted yield over precipitation, together with
# the observed yields and the observed precipitation range of the district.
fig, ax = plt.subplots(figsize=(16,10))

ax.axvspan(dx_['RR'].min(), dx_['RR'].max(), color='aliceblue', zorder=0, label='range of real precipitation')
ax.set_xlabel('precipitation (mm)')
ax.set_ylabel('predicted crop yield (dt/ha)')
ax.set_title(f'predicted crop yield of {crop} for a given precipitation')

precip_axis = np.arange(0, max_precip)
sns.lineplot(x=precip_axis, y=min_val_[0:max_precip], color='red', linewidth=3, ax=ax, label='10% Percentile')
sns.lineplot(x=precip_axis, y=mean_val_[0:max_precip], color='green', linewidth=3, ax=ax, label='50% Percentile')
sns.lineplot(x=precip_axis, y=max_val_[0:max_precip], color='orange', linewidth=3, ax=ax, label='90% Percentile')
sns.scatterplot(x=dx_['RR'], y=dx_['Ertrag'], ax=ax, label='real crop yield')

# The y-limits must be read before they are used for the vertical line;
# reading them after the vlines call fails with a NameError on a fresh kernel.
y0, y1 = ax.get_ylim()
ax.vlines(x=dx_['RR'].mean(), ymin=(y0*1.05), ymax=(y1*0.95), linewidth=1, label='mean precipitation')

# rebuild the legend so the entry of the vertical line is included
ax.legend()

#plt.savefig(f'results/{crop} in {dc}.jpg', dpi=300)
plt.show()