In [1]:
from model_helpers import *

import cfgrib
import xarray as xr

import pandas as pd
import numpy as np

from pyPhenology import models, utils

from tqdm import trange, tqdm

import matplotlib.pyplot as plt

from warnings import warn
import warnings

In [2]:
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
# Year boundaries. high_cutoff_year separates training years (before it)
# from test years (it and later) in the cells below.
low_cutoff_year, high_cutoff_year = 2010, 2022

In this notebook, we implement two methods for increasing the accuracy of our ripeness predictions:

1. Higher spatial resolution weather data.
2. Train on all years before 2022; test on 2022.

In [4]:
# Load in high-res weather data
# open_datasets returns an indexable collection of datasets; only the first
# entry is used as the weather source in the following cells.
grib_data = cfgrib.open_datasets('../data/monthly_weather_data.grib')

core_data = grib_data[0]

In [5]:
## Load observations
# The first CSV column is used as the DataFrame index.
formatted_plants = pd.read_csv("../data/model_training_data/all_plants_formatted.csv", index_col=0)


In [6]:
# Round each observation's coordinates to one decimal place; the longitude
# comes from the `lon_360` column.
formatted_plants['rounded_lat'] = formatted_plants['latitude'].round(1)
formatted_plants['rounded_lon'] = formatted_plants['lon_360'].round(1)

# One row per distinct (site_id, rounded latitude, rounded longitude) triple.
site_columns = ['site_id', 'rounded_lat', 'rounded_lon']
rounded_sites = formatted_plants[site_columns].drop_duplicates()

# Coordinate selectors that share a single 'site' dimension.
site_x_vals = xr.DataArray(rounded_sites['rounded_lat'], dims=['site'])
site_y_vals = xr.DataArray(rounded_sites['rounded_lon'], dims=['site'])

In [7]:
# Select the grid cell nearest to each site, flatten the selection into a
# table, and drop rows that contain missing values.
nearest_cells = core_data.sel(
    latitude=site_x_vals,
    longitude=site_y_vals,
    method='nearest',
)
full_weather_data = nearest_cells.to_dataframe().dropna()

In [8]:
formatted_weather = format_weather_data(full_weather_data)

# Round both coordinate columns to one decimal place.
for coord in ('latitude', 'longitude'):
    formatted_weather[coord] = formatted_weather[coord].round(1)

In [9]:
# Build a text join key on both frames: the latitude string immediately
# followed by the longitude string.
rounded_sites['coordstring'] = (
    rounded_sites['rounded_lat'].astype(str)
    .str.cat(rounded_sites['rounded_lon'].astype(str))
)
formatted_weather['coordstring'] = (
    formatted_weather['latitude'].astype(str)
    .str.cat(formatted_weather['longitude'].astype(str))
)



In [10]:
## Add Site ID to the weather data
site_keys = rounded_sites[['coordstring', 'site_id']]
weather_with_sites = (
    formatted_weather
    .merge(site_keys, on='coordstring')
    .drop(columns='coordstring')
)
## Separate into training data and testing data

# filter out current year
weather_with_sites = weather_with_sites[weather_with_sites['year'].ne(2023)]

# Years before the cutoff go to training; the cutoff year onward goes to test.
weather_training = weather_with_sites[weather_with_sites['year'].lt(high_cutoff_year)]
weather_test = weather_with_sites[weather_with_sites['year'].ge(high_cutoff_year)]



In [11]:
# Distinct species names, in order of first appearance.
species_list = pd.unique(formatted_plants['formatted_sci_name'])

In [12]:
# Drop the 'species' column. Rebinding the name (rather than inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second
# execution no longer raises KeyError because the column is already gone.
formatted_plants = formatted_plants.drop(columns='species', errors='ignore')

In [13]:
# Site ids that received weather data in the merge.
weather_sites = weather_with_sites['site_id'].unique()

# Keep observations at those sites, outside 2023, and with positive latitude.
site_has_weather = formatted_plants['site_id'].isin(weather_sites)
outside_2023 = formatted_plants['year'].ne(2023)
positive_latitude = formatted_plants['latitude'].gt(0)
filtered_plants = formatted_plants[site_has_weather & outside_2023 & positive_latitude]



In [14]:
def make_test_df(train_df, test_year=None):
    """Build a stand-in test set from the mean 'doy' observed at each site.

    Used when a group has no real observations in the test period: each site
    in the training data contributes one row holding its mean 'doy'.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training observations; must contain 'site_id' and 'doy' columns.
    test_year : int, optional
        Value written to the 'year' column. Defaults to the notebook-level
        ``high_cutoff_year`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns 'site_id', 'doy' (per-site mean) and 'year', one row per
        site in order of first appearance. An empty ``train_df`` yields an
        empty frame that still has these three columns.

    Notes
    -----
    Rows whose 'site_id' is missing are left out, because groupby drops
    missing keys by default.
    """
    if test_year is None:
        test_year = high_cutoff_year

    # A single grouped aggregation replaces the per-site filtering loop;
    # sort=False keeps sites in the order they first appear.
    species_test_df = (
        train_df.groupby('site_id', sort=False)['doy']
        .mean()
        .reset_index()
    )
    species_test_df['year'] = test_year

    return species_test_df

In [15]:
## Train models

# Maps each species name to whatever train_ripeness_small returns for it.
species_prediction_dict = {}

for s in tqdm(species_list):
    print(s)
    # Boolean masks replace string-formatted query() calls, which break on
    # names that contain a double quote.
    is_species = filtered_plants['formatted_sci_name'] == s
    before_cutoff = filtered_plants['year'] < high_cutoff_year
    from_cutoff = filtered_plants['year'] >= high_cutoff_year

    species_train_df = filtered_plants[is_species & before_cutoff]

    if len(species_train_df) == 0:
        continue

    species_test_df = filtered_plants[is_species & from_cutoff]

    if len(species_test_df) == 0:
        # make predictions and compare to the mean ripeness day at each site
        species_test_df = make_test_df(species_train_df)

    if len(species_test_df) == 0:
        # Nothing to evaluate against: skip instead of calling the trainer
        # with an empty test set.
        print("No test data for {}".format(s))
        continue

    predictions = train_ripeness_small(species_train_df, weather_training,
                                       species_test_df, weather_test)

    species_prediction_dict[s] = predictions

  0%|                                                                                                                                                                              | 0/97 [00:00<?, ?it/s]

Rubus
running model ThermalTime
making predictions for model ThermalTime


  1%|█▋                                                                                                                                                                    | 1/97 [00:04<06:56,  4.34s/it]

model ThermalTime got a MAE of 28.6
model ThermalTime got an RMSE of 49.24970389081881
model ThermalTime's median error is: 3.0
Ripeness Day: 187.13333333333333
Rubus occidentalis
running model ThermalTime
making predictions for model ThermalTime


  2%|███▍                                                                                                                                                                  | 2/97 [00:07<05:31,  3.49s/it]

model ThermalTime got a MAE of 7.2
model ThermalTime got an RMSE of 8.449852069711044
model ThermalTime's median error is: 6.0
Ripeness Day: 183.0
Ficus
running model ThermalTime


  3%|█████▏                                                                                                                                                                | 3/97 [00:10<05:06,  3.26s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 50.125
model ThermalTime got an RMSE of 59.21465190305521
model ThermalTime's median error is: 15.0
Ripeness Day: 214.0
Ficus auriculata
running model ThermalTime


  4%|██████▊                                                                                                                                                               | 4/97 [00:12<04:45,  3.07s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 2.0
model ThermalTime got an RMSE of 2.0
model ThermalTime's median error is: -2.0
Ripeness Day: 245.0
Ficus carica
running model ThermalTime


  5%|████████▌                                                                                                                                                             | 5/97 [00:15<04:31,  2.95s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 38.916666666666664
model ThermalTime got an RMSE of 48.479377058704046
model ThermalTime's median error is: 34.0
Ripeness Day: 214.0
Ficus citrifolia
running model ThermalTime


  6%|██████████▎                                                                                                                                                           | 6/97 [00:18<04:21,  2.88s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 39.5
model ThermalTime got an RMSE of 39.75550276376844
model ThermalTime's median error is: -4.5
Ripeness Day: 245.0
Ficus macrophylla
running model ThermalTime
making predictions for model ThermalTime
model ThermalTime got a MAE of 104.0
model ThermalTime got an RMSE of 104.58011283222064
model ThermalTime's median error is: 11.0
Ripeness Day: 214.0


  7%|███████████▉                                                                                                                                                          | 7/97 [00:21<04:10,  2.78s/it]

Ficus sycomorus
Olea europaea
running model ThermalTime
making predictions for model ThermalTime
model ThermalTime got a MAE of 128.42
model ThermalTime got an RMSE of 134.21229451879586
model ThermalTime's median error is: 123.5
Ripeness Day: 162.0


  9%|███████████████▍                                                                                                                                                      | 9/97 [00:23<02:59,  2.04s/it]

Olea europea
running model ThermalTime


 10%|█████████████████                                                                                                                                                    | 10/97 [00:28<03:58,  2.75s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 25.333333333333336
model ThermalTime got an RMSE of 32.59959996884402
model ThermalTime's median error is: 17.0
Ripeness Day: 283.1578947368421
Olea
running model ThermalTime
making predictions for model ThermalTime


 11%|██████████████████▋                                                                                                                                                  | 11/97 [00:31<04:10,  2.91s/it]

model ThermalTime got a MAE of 119.0
model ThermalTime got an RMSE of 133.7098849499667
model ThermalTime's median error is: 121.0
Ripeness Day: 154.42857142857142
Morus rubra
running model ThermalTime
making predictions for model ThermalTime


 12%|████████████████████▍                                                                                                                                                | 12/97 [00:35<04:29,  3.18s/it]

model ThermalTime got a MAE of 25.692307692307693
model ThermalTime got an RMSE of 45.346528999557314
model ThermalTime's median error is: 7.0
Ripeness Day: 166.84615384615384
Morus
running model ThermalTime


 13%|██████████████████████                                                                                                                                               | 13/97 [00:38<04:17,  3.06s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 27.526315789473685
model ThermalTime got an RMSE of 42.65837364580679
model ThermalTime's median error is: -4.5
Ripeness Day: 175.10526315789474
Morus macroura
running model ThermalTime


 14%|███████████████████████▊                                                                                                                                             | 14/97 [00:42<04:38,  3.36s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 35.0
model ThermalTime got an RMSE of 35.0
model ThermalTime's median error is: -35.0
Ripeness Day: 183.0
Morus alba
running model ThermalTime


 15%|█████████████████████████▌                                                                                                                                           | 15/97 [00:46<04:45,  3.48s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 27.161290322580644
model ThermalTime got an RMSE of 47.51773862328644
model ThermalTime's median error is: 1.0
Ripeness Day: 172.3548387096774
Morus nigra
running model ThermalTime


 16%|███████████████████████████▏                                                                                                                                         | 16/97 [00:49<04:48,  3.56s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 31.125
model ThermalTime got an RMSE of 46.99228660110082
model ThermalTime's median error is: -12.5
Ripeness Day: 186.1
Morus australis
running model ThermalTime


 18%|████████████████████████████▉                                                                                                                                        | 17/97 [00:52<04:26,  3.33s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 49.0
model ThermalTime got an RMSE of 49.0
model ThermalTime's median error is: -49.0
Ripeness Day: 183.0
Amelanchier alnifolia
running model ThermalTime
making predictions for model ThermalTime


 19%|██████████████████████████████▌                                                                                                                                      | 18/97 [00:55<04:12,  3.20s/it]

model ThermalTime got a MAE of 29.09090909090909
model ThermalTime got an RMSE of 33.24290765096984
model ThermalTime's median error is: -29.0
Ripeness Day: 242.1818181818182
Amelanchier arborea
running model ThermalTime
making predictions for model ThermalTime
model ThermalTime got a MAE of 41.25
model ThermalTime got an RMSE of 57.17079674099356
model ThermalTime's median error is: 0.0
Ripeness Day: 169.25


 20%|████████████████████████████████▎                                                                                                                                    | 19/97 [00:59<04:16,  3.29s/it]

Amelanchier canadensis
running model ThermalTime


 21%|██████████████████████████████████                                                                                                                                   | 20/97 [01:03<04:46,  3.72s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 16.4
model ThermalTime got an RMSE of 19.979989989987484
model ThermalTime's median error is: -7.0
Ripeness Day: 185.06666666666666
Amelanchier grandiflora
running model ThermalTime


 22%|███████████████████████████████████▋                                                                                                                                 | 21/97 [01:07<04:31,  3.58s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 13.222222222222221
model ThermalTime got an RMSE of 15.0
model ThermalTime's median error is: -12.0
Ripeness Day: 183.0
Amelanchier laevis
running model ThermalTime


 23%|█████████████████████████████████████▍                                                                                                                               | 22/97 [01:10<04:17,  3.43s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 14.32
model ThermalTime got an RMSE of 17.24876807195227
model ThermalTime's median error is: -10.0
Ripeness Day: 183.0
Amelanchier utahensis
running model ThermalTime


 24%|███████████████████████████████████████                                                                                                                              | 23/97 [01:13<04:02,  3.28s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 30.875
model ThermalTime got an RMSE of 30.875253035400373
model ThermalTime's median error is: 0.125
Ripeness Day: 245.0
Amelanchier
running model ThermalTime
making predictions for model ThermalTime


 25%|████████████████████████████████████████▊                                                                                                                            | 24/97 [01:16<04:03,  3.34s/it]

model ThermalTime got a MAE of 12.25
model ThermalTime got an RMSE of 15.15490193407991
model ThermalTime's median error is: -1.0
Ripeness Day: 169.57894736842104
Citrus
running model ThermalTime


 26%|██████████████████████████████████████████▌                                                                                                                          | 25/97 [01:19<03:44,  3.12s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 95.5859375
model ThermalTime got an RMSE of 113.82947086541341
model ThermalTime's median error is: 27.5
Ripeness Day: 173.390625
Citrus jambhiri
running model ThermalTime


 27%|████████████████████████████████████████████▏                                                                                                                        | 26/97 [01:22<03:34,  3.02s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 3.0
model ThermalTime got an RMSE of 3.0
model ThermalTime's median error is: -3.0
Ripeness Day: 336.0
Citrus limon
running model ThermalTime
making predictions for model ThermalTime


 28%|█████████████████████████████████████████████▉                                                                                                                       | 27/97 [01:25<03:42,  3.18s/it]

model ThermalTime got a MAE of 44.0
model ThermalTime got an RMSE of 55.75541827182957
model ThermalTime's median error is: 37.0
Ripeness Day: 203.66666666666666
Rubus idaeus
running model ThermalTime


 29%|███████████████████████████████████████████████▋                                                                                                                     | 28/97 [01:29<03:48,  3.32s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 25.125
model ThermalTime got an RMSE of 37.750827805493216
model ThermalTime's median error is: -1.5
Ripeness Day: 217.875
Rubus ellipticus
Rubus hayata
running model ThermalTime
making predictions for model ThermalTime


 31%|███████████████████████████████████████████████████                                                                                                                  | 30/97 [01:31<02:42,  2.42s/it]

model ThermalTime got a MAE of 11.166666666666666
model ThermalTime got an RMSE of 12.322066926183016
model ThermalTime's median error is: 2.5
Ripeness Day: 214.0
Rubus odoratus
Rubus phoenicolasius
running model ThermalTime


 33%|██████████████████████████████████████████████████████▍                                                                                                              | 32/97 [01:34<02:09,  2.00s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 9.6
model ThermalTime got an RMSE of 10.881176406988354
model ThermalTime's median error is: 5.0
Ripeness Day: 183.0
Rubus rolfei
running model ThermalTime


 34%|████████████████████████████████████████████████████████▏                                                                                                            | 33/97 [01:37<02:18,  2.17s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 11.0
model ThermalTime got an RMSE of 11.0
model ThermalTime's median error is: -11.0
Ripeness Day: 214.0
Citrus aurantium
running model ThermalTime


 35%|█████████████████████████████████████████████████████████▊                                                                                                           | 34/97 [01:40<02:24,  2.30s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 13.5
model ThermalTime got an RMSE of 13.5
model ThermalTime's median error is: -13.5
Ripeness Day: 336.0
Citrus sinensis
running model ThermalTime


 36%|███████████████████████████████████████████████████████████▌                                                                                                         | 35/97 [01:44<03:00,  2.92s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 28.0
model ThermalTime got an RMSE of 34.45770354120154
model ThermalTime's median error is: 26.0
Ripeness Day: 275.0
Prunus americana
running model ThermalTime
making predictions for model ThermalTime


 37%|█████████████████████████████████████████████████████████████▏                                                                                                       | 36/97 [01:48<03:08,  3.09s/it]

model ThermalTime got a MAE of 40.77777777777778
model ThermalTime got an RMSE of 48.546197928708416
model ThermalTime's median error is: -20.0
Ripeness Day: 218.13333333333333
Prunus domestica
running model ThermalTime


 38%|██████████████████████████████████████████████████████████████▉                                                                                                      | 37/97 [01:51<03:11,  3.19s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 22.05263157894737
model ThermalTime got an RMSE of 28.36695631410455
model ThermalTime's median error is: -15.0
Ripeness Day: 243.3684210526316
Prunus maritima
running model ThermalTime


 39%|████████████████████████████████████████████████████████████████▋                                                                                                    | 38/97 [01:54<03:03,  3.10s/it]

making predictions for model ThermalTime
model ThermalTime got a MAE of 19.625
model ThermalTime got an RMSE of 24.902309129877896
model ThermalTime's median error is: -14.5
Ripeness Day: 245.0
Prunus nigra
running model ThermalTime
making predictions for model ThermalTime


 40%|██████████████████████████████████████████████████████████████████▎                                                                                                  | 39/97 [01:59<03:19,  3.45s/it]

model ThermalTime got a MAE of 55.0
model ThermalTime got an RMSE of 55.0
model ThermalTime's median error is: -55.0
Ripeness Day: 275.0
Prunus cerasifera
running model ThermalTime


 40%|██████████████████████████████████████████████████████████████████▎                                                                                                  | 39/97 [02:00<02:59,  3.09s/it]


KeyboardInterrupt: 

In [None]:
## Train models

# Maps each genus name to whatever train_ripeness_small returns for it.
genus_prediction_dict = {}

for s in tqdm(filtered_plants['genus'].unique()):
    print(s)
    # Boolean masks replace string-formatted query() calls, which break on
    # names that contain a double quote.
    is_genus = filtered_plants['genus'] == s
    before_cutoff = filtered_plants['year'] < high_cutoff_year
    from_cutoff = filtered_plants['year'] >= high_cutoff_year

    species_train_df = filtered_plants[is_genus & before_cutoff]

    if len(species_train_df) == 0:
        continue

    species_test_df = filtered_plants[is_genus & from_cutoff]

    if len(species_test_df) == 0:
        # make predictions and compare to the mean ripeness day at each site
        species_test_df = make_test_df(species_train_df)

    if len(species_test_df) == 0:
        # Nothing to evaluate against: skip instead of calling the trainer
        # with an empty test set.
        print("No test data for {}".format(s))
        continue

    predictions = train_ripeness_small(species_train_df, weather_training,
                                       species_test_df, weather_test)

    genus_prediction_dict[s] = predictions

In [23]:
# Observation counts per species, keeping species with more than 10 rows.
(
    filtered_plants
    .groupby('formatted_sci_name')
    .size()
    .to_frame('n')
    .query('n > 10')
)

Unnamed: 0_level_0,n
formatted_sci_name,Unnamed: 1_level_1
Amelanchier,318
Amelanchier alnifolia,189
Amelanchier arborea,127
Amelanchier canadensis,199
Amelanchier grandiflora,251
Amelanchier laevis,86
Citrus,571
Cornus mas,45
Diospyros,63
Diospyros kaki,33


In [25]:
# Observation counts per genus, keeping genera with more than one row.
(
    filtered_plants
    .groupby('genus')
    .size()
    .to_frame('n')
    .query('n > 1')
)

Unnamed: 0_level_0,n
genus,Unnamed: 1_level_1
Amelanchier,1176
Citrus,588
Cornus,45
Diospyros,445
Eriobotrya,223
Ficus,229
Juglans,349
Malus,13553
Morus,1090
Olea,485


## Model Validation

In [None]:
# Species-level predictions; this is the frame the validation cells analyse.
full_predictions = pd.concat(list(species_prediction_dict.values()))
# Genus-level predictions are taken from the genus training loop's dictionary
# (previously this read the species dictionary by mistake).
genus_predictions = pd.concat(list(genus_prediction_dict.values()))

In [None]:
# Display the combined predictions table.
full_predictions

In [None]:
# Absolute and squared difference between the 'doy' and 'flowering_day' columns.
day_gap = full_predictions['doy'] - full_predictions['flowering_day']
full_predictions['abs_error'] = day_gap.abs()
full_predictions['square_error'] = full_predictions['abs_error'].pow(2)

In [None]:
# The 50 rows with the largest absolute error.
full_predictions.sort_values('abs_error', ascending=False).head(50)

In [None]:
# Mean absolute error over all predictions.
full_predictions['abs_error'].mean()

In [None]:
# Median absolute error, computed on the raw values of the column.
np.median(full_predictions['abs_error'].to_numpy())

In [None]:
# Share of predictions whose absolute error is below 30.6 days.
n_below = len(full_predictions[full_predictions['abs_error'] < 30.6])
n_below / len(full_predictions)

In [None]:
# Population standard deviation (ddof=0) of the absolute error.
full_predictions['abs_error'].std(ddof=0)

In [None]:
# Share of predictions whose absolute error is below 43.09 days.
n_below = len(full_predictions[full_predictions['abs_error'] < 43.09])
n_below / len(full_predictions)

In [None]:
# Scatter of the two day-of-year columns behind the error metrics. Axes are
# labelled with the column names so the figure can be read on its own, and the
# trailing semicolon suppresses the artist repr.
fig, ax = plt.subplots()
ax.scatter(full_predictions['doy'], full_predictions['flowering_day'])
ax.set_xlabel('doy')
ax.set_ylabel('flowering_day');

Idea for corrections: take the lower error between the base error and the year-transformed error.

NEW TRAINING PARADIGM:
Specific data if possible, more generalized data if not.
Ex. use European high-time-res data if possible, otherwise use monthly data. 
Use species if possible, otherwise use genus. 

High error is driven by southern hemisphere and "date wrapping". 

Removing southern hemisphere data and dates that are either very early in the year or very late works a little better.

OK, median error seems to be a better error metric. The old (coarse) method had a median absolute error of 25, while this one has a median absolute error of 14. 

After doing some more analysis, 71% of observations have an absolute error below the mean absolute error (30 days), and 81% have an absolute error below the standard deviation of the absolute error (43 days). 

**Trying with no southern hemisphere / date wrapping**

In [None]:
# Keep predictions at positive latitudes whose 'doy' is later than day 60.
north_mask = full_predictions['latitude'] > 0
late_mask = full_predictions['doy'] > 60
unwrapped_northern_preds = full_predictions[north_mask & late_mask]

In [None]:
# The 50 filtered rows with the largest absolute error.
unwrapped_northern_preds.sort_values('abs_error', ascending=False).head(50)

In [None]:
# Histogram of absolute errors in 50 bins.
# NOTE(review): this is drawn from full_predictions, not from the filtered
# unwrapped_northern_preds built earlier in this section -- confirm intent.
plt.hist(full_predictions['abs_error'], bins=50)

In [None]:
# NOTE(review): identical to the earlier median over full_predictions; the
# filtered unwrapped_northern_preds may have been intended here -- confirm.
np.median(full_predictions['abs_error'])

In [None]:
print(unwrapped_northern_preds)

print(rmse(unwrapped_northern_preds['doy'], unwrapped_northern_preds['flowering_day']))
print(mae(unwrapped_northern_preds['doy'], unwrapped_northern_preds['flowering_day']))

In [None]:
def rmse(y1, y2):
    return np.sqrt(np.mean((y1 - y2) ** 2))

def mae(y1, y2):
    return np.mean(np.abs(y1 - y2))

In [None]:
# Predictions with every row that contains a missing value removed.
small_preds = full_predictions.dropna(axis=0, how='any')

In [None]:
# RMSE for the full and NaN-free frames, then MAE for the same two frames.
for metric in (rmse, mae):
    for frame in (full_predictions, small_preds):
        print(metric(frame['doy'], frame['flowering_day']))

Old MAE: 31.9; new MAE: 30.6.