# Checking the Error of our models

This notebook is intended to measure how effective our models are at prediction of ripeness. We will also compare our approach with Taylor et. al. 

In [2]:
import model_helpers

import cfgrib
import xarray as xr

import pandas as pd
import numpy as np

from pyPhenology import models, utils

from tqdm import trange, tqdm

import matplotlib.pyplot as plt

from warnings import warn

In [136]:
large_prediction_df_euro = pd.read_csv("../data/model_training_data/daily_european_site_specific_ripeness_sci_names.csv")
monthly_prediction_df = pd.read_csv("../data/model_training_data/site_specific_ripeness_sci_names.csv")


In [138]:
print(np.mean(np.abs(large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy'])))
print(np.mean(np.abs(monthly_prediction_df['flowering_day'] - monthly_prediction_df['doy'])))
print(np.median(np.abs(large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy'])))
print(np.median(np.abs(monthly_prediction_df['flowering_day'] - monthly_prediction_df['doy'])))

22.675085218469167
31.975759166885446
20.0
25.0


In [140]:
def error_fractions(dataframe, error_col):
    median_error = np.median(np.abs(large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy']))
    std_error = np.std(np.abs(large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy']))
    
    print(len(dataframe[dataframe[error_col] < median_error]) / len(dataframe))
    print(len(dataframe[dataframe[error_col] < std_error]) / len(dataframe))

#len(full_predictions.query('corrected_error < 26')) / len(full_predictions)

Daily predictions have a much lower MAE.  

In [10]:
## RMSE:

print(np.sqrt(np.mean(large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy']) ** 2))

0.3010536101642392


In [16]:
print(np.sqrt(np.mean((large_prediction_df_euro['flowering_day'] - large_prediction_df_euro['doy']) ** 2)))
print(np.sqrt(np.mean((monthly_prediction_df['flowering_day'] - monthly_prediction_df['doy']) ** 2)))

28.17208562440632
45.33204991259524


Taylor RMSE: 18.8 days.

In [17]:
def rmse(y1, y2):
    return np.sqrt(np.mean((y1 - y2) ** 2))

def mae(y1, y2):
    return np.mean(np.abs(y1 - y2))

In [20]:
monthly_prediction_df['formatted_sci_name'].unique()

array(['Rubus occidentalis', 'Rubus idaeus', 'Rubus hayata',
       'Rubus phoenicolasius', 'Rubus rolfei', 'Rubus ellipticus',
       'Rubus odoratus', 'Morus nigra', 'Morus alba', 'Morus rubra',
       'Morus australis', 'Morus macroura', 'Amelanchier arborea',
       'Amelanchier canadensis', 'Amelanchier alnifolia',
       'Amelanchier laevis', 'Amelanchier grandiflora',
       'Amelanchier utahensis', 'Prunus americana', 'Prunus virginiana',
       'Prunus serotina', 'Prunus nigra', 'Prunus maritima',
       'Prunus mexicana', 'Prunus domestica', 'Prunus avium',
       'Prunus cerasus', 'Prunus maackii', 'Prunus subhirtella',
       'Prunus tomentosa', 'Prunus angustifolia', 'Prunus salicina',
       'Prunus ilicifolia', 'Prunus cerasifera', 'Prunus',
       'Prunus yedoensis', 'Prunus cocomilia', 'Prunus laurocerasus',
       'Prunus pensylvanica', 'Prunus emarginata', 'Prunus serrulata',
       'Prunus mahaleb', 'Prunus fruticosa', 'Prunus padus',
       'Prunus rivularis', 'Mal

In [144]:
monthly_prediction_df['abs_error'] = np.abs(monthly_prediction_df['doy'] - monthly_prediction_df['flowering_day'])
large_prediction_df_euro['abs_error'] = np.abs(large_prediction_df_euro['doy'] - large_prediction_df_euro['flowering_day'])


print(np.median(monthly_prediction_df['abs_error']))
print(np.median(large_prediction_df_euro['abs_error']))


25.0
20.0


In [145]:
error_fractions(monthly_prediction_df, 'abs_error')
error_fractions(large_prediction_df_euro, 'abs_error')

0.4028178874595257
0.3432221930515446
0.49209792376820577
0.4291911992562752


### Validity by Species

In [39]:
species_errors = []

for s in monthly_prediction_df['formatted_sci_name'].unique():
    species_df_euro = large_prediction_df_euro[large_prediction_df_euro['formatted_sci_name'] == s]
    species_df_month = monthly_prediction_df[monthly_prediction_df['formatted_sci_name'] == s]
    
    species_error_dict = {
        'species': s,
        'euro_mae': mae(species_df_euro['flowering_day'], species_df_euro['doy']),
        'month_mae': mae(species_df_month['flowering_day'], species_df_month['doy']),
        'euro_rmse': rmse(species_df_euro['flowering_day'], species_df_euro['doy']),
        'month_rmse': rmse(species_df_month['flowering_day'], species_df_month['doy']),
        'euro_n': len(species_df_euro),
        'month_n': len(species_df_month)
    }
    
    species_errors.append(species_error_dict)
    
    #print('Species: ', s)
    #print(species_error_dict)
    
error_df = pd.DataFrame(species_errors)

In [51]:
error_df.sort_values('month_rmse', ascending=False).head(20)

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
75,Ficus macrophylla,,224.0,,224.0,0,1
81,Citrus jambhiri,,166.5,,167.046401,0,2
80,Citrus sinensis,22.0,159.666667,22.0,161.198635,1,3
77,Ehretia tinifolia,,126.0,,126.0,0,1
82,Olea europaea,,109.244318,,120.791871,0,176
78,Citrus,142.5,88.386207,163.176285,110.148865,2,145
33,Prunus cerasifera,63.714286,79.240506,82.250488,109.730741,7,79
79,Citrus limon,,81.0,,105.309069,0,4
42,Prunus fruticosa,,105.0,,105.0,0,1
32,Prunus ilicifolia,,84.229665,,96.784375,0,209


In [54]:
error_df.sort_values('month_n', ascending=False).head(20)

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
52,Malus domestica,31.584845,31.789648,34.962878,35.346538,3088,3632
25,Prunus avium,10.91736,15.825492,14.289328,20.250631,1682,1828
66,Pyrus communis,17.264207,20.167136,21.082075,26.70402,1355,1777
24,Prunus domestica,16.318182,48.207547,21.130547,54.782288,264,424
46,Malus,,49.114458,,67.111847,0,332
20,Prunus serotina,0.0,28.875776,0.0,38.56945,1,322
47,Malus pumila,22.952381,32.062305,31.41883,47.593947,21,321
19,Prunus virginiana,,28.520833,,40.067287,0,240
7,Morus nigra,15.0,31.894068,17.161002,47.974702,4,236
32,Prunus ilicifolia,,84.229665,,96.784375,0,209


In [118]:
error_df[error_df['month_n'] >= 10].sort_values('month_rmse', ascending=True).head(20)

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
3,Rubus phoenicolasius,,12.391304,,14.698418,0,23
0,Rubus occidentalis,,12.697674,,16.910159,0,43
25,Prunus avium,10.91736,15.825492,14.289328,20.250631,1682,1828
15,Amelanchier laevis,,22.240741,,25.640318,0,54
66,Pyrus communis,17.264207,20.167136,21.082075,26.70402,1355,1777
16,Amelanchier grandiflora,,19.916667,,27.520699,0,72
8,Morus alba,12.5,19.759036,14.089003,28.0752,2,83
70,Cornus mas,104.857143,20.8125,132.194662,28.804297,7,32
14,Amelanchier alnifolia,,23.203883,,30.873371,0,103
9,Morus rubra,,25.62963,,33.360544,0,54


In [41]:
euro_errors = error_df.dropna()

In [115]:
len(euro_errors)

19

In [45]:
euro_errors[euro_errors['euro_mae'] >= euro_errors['month_mae']]

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
10,Morus australis,45.333333,33.0,51.78803,34.34385,3,4
60,Juglans regia,47.666667,31.6875,48.891035,54.492545,3,16
70,Cornus mas,104.857143,20.8125,132.194662,28.804297,7,32
78,Citrus,142.5,88.386207,163.176285,110.148865,2,145


In [46]:
euro_errors[euro_errors['euro_rmse'] >= euro_errors['month_rmse']]

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
10,Morus australis,45.333333,33.0,51.78803,34.34385,3,4
70,Cornus mas,104.857143,20.8125,132.194662,28.804297,7,32
78,Citrus,142.5,88.386207,163.176285,110.148865,2,145


In [47]:
np.mean(euro_errors['euro_mae'])

34.968111427091166

In [50]:
np.mean(euro_errors['euro_rmse'])

41.505741114357214

In [48]:
np.mean(error_df['month_mae'])

39.11731790090093

In [49]:
np.mean(error_df['month_rmse'])

46.20526311485827

In [67]:
site_errors = []

for s in monthly_prediction_df['site_id'].unique():
    site_df_euro = large_prediction_df_euro[large_prediction_df_euro['site_id'] == s]
    site_df_month = monthly_prediction_df[monthly_prediction_df['site_id'] == s]
    
    site_error_dict = {
        'site': s,
        'euro_mae': mae(site_df_euro['flowering_day'], site_df_euro['doy']),
        'month_mae': mae(site_df_month['flowering_day'], site_df_month['doy']),
        'euro_rmse': rmse(site_df_euro['flowering_day'], site_df_euro['doy']),
        'month_rmse': rmse(site_df_month['flowering_day'], site_df_month['doy']),
        'euro_n': len(site_df_euro),
        'month_n': len(site_df_month),
        'lat': list(site_df_month['latitude'])[0],
        'lon': list(site_df_month['longitude'])[0]
    }
    
    site_errors.append(site_error_dict)
    
    #print('Species: ', s)
    #print(species_error_dict)
    
error_df_sites = pd.DataFrame(site_errors)

In [71]:
np.mean(error_df_sites.sort_values('month_n', ascending=False).head(50)['month_rmse'])

33.45956350164461

In [72]:
error_df_sites

Unnamed: 0,site,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n,lat,lon
0,0,,25.333333,,27.688746,0,6,42,268
1,1,,22.151515,,28.708567,0,66,43,286
2,2,,21.777778,,31.944396,0,9,43,279
3,3,,34.263158,,49.365340,0,95,45,267
4,4,,41.314286,,68.061317,0,35,40,274
...,...,...,...,...,...,...,...,...,...
510,80,,101.500000,,101.560327,0,2,42,0
511,87,,107.000000,,107.000000,0,1,38,24
512,88,,76.000000,,76.000000,0,1,40,16
513,96,,16.500000,,16.507574,0,2,-35,143


In [109]:
big_sites = error_df_sites[error_df_sites['month_n'] > 10]['site']

In [111]:
many_obs_df = monthly_prediction_df[monthly_prediction_df['site_id'].isin(big_sites)]
many_obs_df_euro = large_prediction_df_euro[large_prediction_df_euro['site_id'].isin(big_sites)]


print(rmse(many_obs_df['doy'], many_obs_df['flowering_day']))
print(rmse(many_obs_df_euro['doy'], many_obs_df_euro['flowering_day']))

42.921619042872855
27.698545642130124


In [114]:
many_obs_df_euro.groupby('site_id').size()

site_id
41      15
83      54
150     22
281    104
282     38
      ... 
455     63
456    190
457     37
461     10
551      8
Length: 66, dtype: int64

In [112]:
many_obs_df.groupby('site_id').size()

site_id
1      66
3      95
4      35
5      12
7      82
       ..
529    13
535    28
536    30
543    24
551    13
Length: 164, dtype: int64

**Report for Ethan**

- Our RMSE (root mean standard error) for the daily european data is 28.2; for the monthly data, it's 45.3. 
- This means that the daily european data can predict ripeness within 28 days of observed ripeness, whereas the monthly data can predict ripeness within 45.3 days of observed ripeness. 
- Taylor et. al reports an RMSE of 18.8 – we're close to this with the european data. 
- With higher-resolution time and space data, we can get very close to the two-week threshold you mentioned. 

With only the northern hemisphere, the monthly RMSE drops to 42.5. PyPhenology likely does not account for hemisphere, since it was developed for American data. I'll check the documentation to make sure. One solution is to scale the days of the southern hemisphere sites back by 180 days, but the challenge of mapping this to northern hemisphere data is still present. 

The minimum RMSE for any species is Rubus phoenicolasius, with an RMSE of 14.7 and 23 observations. This falls very close to your estimation that "two weeks is the best we can reasonably do". 

The european data consistently outperforms the monthly data – there were only three species out of 19 for which the monthly RMSE was higher than the european RMSE. 

(note - the 19 species appeared in both datasets, many of the species in the worldwide sites did not appear in the european dataset) 

When sorting by site, it appears that much of the error is due to sites with relatively few observations. Your prediction that higher-resolution weather data will increase site-specific precision seems to be correct. 
For monthly data, the sites with less than 10 observations have an RMSE of 63.6, whereas the sites with more than 10 observations have an RMSE of 42.9. 

Number of observations is negatively correlated with RMSE for both species and sites. This means that as observations increase, our RMSE decreases. 

It seems that, as of now, there are two explanations for the high RMSE. The first is that resolution of time and space data is too low, as we can see in the comparison between the european RMSE and the monthly RMSE. This is easily rectified by increasing the resolution of climate data.  

The second is that number of observations per site is the main driver of error, which indicates overfitting. Since sites and species with more observations are sampled more in the training data, it stands to reason that the model parameters favor the temperature conditions in those sites. Thus, my assumption that species would have similar growing patterns in response to temperature would be wrong. However, there is little way to rectify this, as no matter how we slice it, sites and species with more observations will be disproportionately represented in both the training and test data. 

To correct for overfitting of specific sites in the training data, I'll instead try to train and test the models on all sites. This means that I need to separate the training data from the test data by some other means than sites. For these purposes, years would work. Isolating 2022 as our "test year" would allow us to train the model on all years up until 2022, and test on the year 2022. This will also make our results and error metrics more interpretable. 

So, to summarize, 
- Higher temporal resolution data is more accurate. 
- Number of observations decreases error, but we should be careful of overfitting. 
- There is a slight confounding effect of hemisphere. 
- Future steps: 
    - Separate data by years instead of sites. 
    - Train data using higher-resolution data. Higher spatial resolution is possible now, as is high temporal resolution for european data. American / worldwide ERA-5 data is still coming. 

In [133]:
np.corrcoef(error_df['month_n'], error_df['month_rmse'])

array([[ 1.        , -0.04643115],
       [-0.04643115,  1.        ]])

In [134]:
np.corrcoef(error_df_sites['month_n'], error_df_sites['month_rmse'])

array([[ 1.        , -0.09757546],
       [-0.09757546,  1.        ]])

**Filter out southern hemisphere**

In [119]:
northern = monthly_prediction_df[monthly_prediction_df['latitude'] > 0]

In [120]:
rmse(northern['doy'], northern['flowering_day'])

42.544434668011796

In [125]:
species_errors_north = []

for s in northern['formatted_sci_name'].unique():
    species_df_euro = large_prediction_df_euro[large_prediction_df_euro['formatted_sci_name'] == s]
    species_df_month = northern[northern['formatted_sci_name'] == s]
    
    species_error_dict = {
        'species': s,
        'euro_mae': mae(species_df_euro['flowering_day'], species_df_euro['doy']),
        'month_mae': mae(species_df_month['flowering_day'], species_df_month['doy']),
        'euro_rmse': rmse(species_df_euro['flowering_day'], species_df_euro['doy']),
        'month_rmse': rmse(species_df_month['flowering_day'], species_df_month['doy']),
        'euro_n': len(species_df_euro),
        'month_n': len(species_df_month)
    }
    
    species_errors_north.append(species_error_dict)
    
    #print('Species: ', s)
    #print(species_error_dict)
    
error_df_north = pd.DataFrame(species_errors_north)

In [126]:
error_df_north

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
0,Rubus occidentalis,,12.697674,,16.910159,0,43
1,Rubus idaeus,,26.180000,,38.449707,0,50
2,Rubus hayata,,11.166667,,12.322067,0,6
3,Rubus phoenicolasius,,12.391304,,14.698418,0,23
4,Rubus rolfei,,11.000000,,11.000000,0,1
...,...,...,...,...,...,...,...
78,Citrus sinensis,22.000000,159.666667,22.000000,161.198635,1,3
79,Citrus jambhiri,,180.000000,,180.000000,0,1
80,Olea europaea,,110.771605,,122.134590,0,162
81,Olea europea,39.285714,87.033333,52.070008,90.411836,7,30


In [129]:
error_df_north[error_df_north['month_n'] > 10].sort_values('month_rmse', ascending=True).head(20)

Unnamed: 0,species,euro_mae,month_mae,euro_rmse,month_rmse,euro_n,month_n
3,Rubus phoenicolasius,,12.391304,,14.698418,0,23
0,Rubus occidentalis,,12.697674,,16.910159,0,43
25,Prunus avium,10.91736,15.825492,14.289328,20.250631,1682,1828
8,Morus alba,12.5,17.8625,14.089003,24.966227,2,80
15,Amelanchier laevis,,22.240741,,25.640318,0,54
9,Morus rubra,,21.82,,25.732858,0,50
66,Pyrus communis,17.264207,20.072072,21.082075,26.332357,1355,1776
16,Amelanchier grandiflora,,19.916667,,27.520699,0,72
70,Cornus mas,104.857143,20.8125,132.194662,28.804297,7,32
14,Amelanchier alnifolia,,23.203883,,30.873371,0,103
