In [191]:
import cfgrib
import xarray as xr

import pandas as pd
import numpy as np

from pyPhenology import models, utils

from tqdm import trange, tqdm

import matplotlib.pyplot as plt

from warnings import warn

from daylength import daylength


default_models = [models.ThermalTime(), models.FallCooling(), models.M1(), models.MSB()]
default_model_names = ['ThermalTime', "FallCooling", "M1", "MSB"]

# Turns a dataframe containing predictions of flowering day to a dict? Not sure what this does. 
def ripeness_data_to_dict(ripeness_data):    
    
    mean_maturation = np.mean(ripeness_data['flowering_day'])
    
    prediction_dict = {
        "full_flowering_data": ripeness_data,
        #"species_site_flowering days": list(ripeness_data['flowering_day']),
        "mean_flowering_day": np.mean(ripeness_data['flowering_day']),
        "best_model": ripeness_data['model'][0]
    }
    
    return prediction_dict

def get_ripeness_days(model_output):
    ripeness_days = {}
    
    for k in model_output:
        ripeness_days[k] = model_output[k]['mean_flowering_day']
    
    return ripeness_days


def aic(obs, pred, n_param):
    return len(obs) * np.log(np.mean((obs - pred)**2)) + 2*(n_param + 1)


# Trains a model with a given set of test observations and test predictors. 
def train_ripeness(observations, predictors, test_observations, test_predictors, models=['ThermalTime']):
    # set up model comparisons
    best_aic=np.inf
    best_model = None
    best_model_name = None

    # iterate through all models
    for model_name in models:
        print("running model {m}".format(m=model_name))
        
        Model = utils.load_model(model_name)
        model = Model()
        model.fit(observations, predictors, optimizer_params='practical')
        
        # predict from test observations
        print("making predictions for model {m}".format(m=model_name))        
        preds = model.predict(test_observations, test_predictors)
        
        #print(preds)
        test_days = test_observations.doy.values
        #print(test_days)
        # this isn't valid - need to filter by site IDs
        
        # THIS IS REALLY BAD:
        test_days = test_days[0:len(preds)]
        #print(test_days)
        
        # score model
        model_aic = aic(obs = test_days,
                        pred=preds,
                        n_param = len(model.get_params()))

        if model_aic < best_aic:
            best_model = model
            best_model_name = model_name
            best_aic = model_aic

        print('model {m} got an aic of {a}'.format(m=model_name,a=model_aic))

    print('Best model: {m}'.format(m=best_model_name))
    print('Best model paramters:')
    print(best_model.get_params())
    print("Ripeness Day: {}".format(np.mean(preds)))
    
    ripeness_data = test_observations
    ripeness_data['flowering_day'] = preds
    ripeness_data['model'] = best_model_name
    
    return ripeness_data

# Trains a model and uses a portion of the training data for testing. 
def train_ripeness_percent(observations, predictors, test_percent, models=['ThermalTime'], suppress_output=False):
    test_observations = observations.sample(frac=test_percent)
    observations_train = observations.drop(test_observations.index)
    
    # set up model comparisons
    best_aic=np.inf
    best_model = None
    best_model_name = None

    # iterate through all models
    for model_name in models:
        if not suppress_output:
            print("running model {m}".format(m=model_name))
        
        Model = utils.load_model(model_name)
        model = Model()
        model.fit(observations_train, predictors, optimizer_params='practical')
        
        # predict from test observations
        if not suppress_output:
            print("making predictions for model {m}".format(m=model_name))        
        preds = model.predict(test_observations, predictors)
    
        #print(preds)
        test_days = test_observations.doy.values
        #print(test_days)
        
        # THIS IS REALLY BAD:
        test_days = test_days[0:len(preds)]
        #print(test_days)
        
        # score model
        model_aic = aic(obs = test_days,
                        pred=preds,
                        n_param = len(model.get_params()))
        if not suppress_output:
            print(model_aic)

        if model_aic < best_aic:
            best_model = model
            best_model_name = model_name
            best_aic = model_aic
            
        if not suppress_output:
            print('model {m} got an aic of {a}'.format(m=model_name,a=model_aic))

    print('Best model: {m}'.format(m=best_model_name))
    print('Best model paramters:')
    print(best_model.get_params())
    print("Ripeness Day: {}".format(np.mean(preds)))
    
    ripeness_data = test_observations
    ripeness_data['flowering_day'] = preds
    ripeness_data['model'] = best_model_name

    prediction_dict = {
        "trained_model": best_model,
        "model_aic": best_aic,
        "model_name": best_model_name
        "full_flowering_data": ripeness_data,
        "species_site_flowering days": list(ripeness_data['flowering_day']),
        "mean_flowering_day": np.mean(ripeness_data['flowering_day'])
    }
    
    return prediction_dict


# Gets the weather history for a specific site. 
def get_site_history(weather_array, site_id, site_lat, site_lon):
    filtered = weather_array.where((abs(weather_array.latitude - site_lat) <= 0.05) & (abs(weather_array.longitude - site_lon) <= 0.05), drop=True)
    
    #print("Converting GRIB to dataframe")
    site_df = filtered.to_dataframe().drop(["number", "step", "surface"], axis=1).reset_index().rename(columns={"skt":"temperature"})
    
    site_df['site_id'] = site_id
    
    site_df['year'] = site_df.time.dt.to_period('Y')
    site_df['doy'] = site_df.time.dt.strftime('%j').astype(int)
    
    site_df = site_df[['site_id', 'temperature', 'year', 'doy', 'latitude', 'longitude']]
    
    return(site_df)

def get_site_history_coarse(weather_array, site_id, site_lat, site_lon):
    filtered = weather_array.where((abs(weather_array.latitude - site_lat) <= 0.5) & (abs(weather_array.longitude - site_lon) <= 0.5), drop=True)
    
    #print("Converting GRIB to dataframe")
    site_df = filtered.to_dataframe().drop(["number", "step", "surface"], axis=1).reset_index().rename(columns={"skt":"temperature"})
    
    site_df['site_id'] = site_id
    
    site_df['year'] = site_df.time.dt.to_period('Y')
    site_df['doy'] = site_df.time.dt.strftime('%j').astype(int)
    
    site_df = site_df[['site_id', 'temperature', 'year', 'doy', 'latitude', 'longitude']]
    
    return(site_df)

def correct_leap_years(weather_df):
    leap_year_key = {60: 61, 
                 91: 92, 
                 121: 122, 
                 152: 153, 
                 182: 183, 
                 213: 214, 
                 244: 245, 
                 274: 275, 
                 305: 306, 
                 335: 336}
    
    return weather_df.replace({'doy': leap_year_key})


# Format Claudia's Data
def claudia_observations_to_pyphenology(claudia_obs):
    new_observations = claudia_obs.copy(deep=True)
    
    new_observations['species_actual'] = new_observations['specificEpithet']
    
    new_observations.rename(columns={'YEAR': 'year',
                            'DAY': 'doy',
                            'genus': 'species',
                            'LAT': 'latitude'}, inplace=True)
    
    new_observations.drop(['specificEpithet', 'eventRemarks', 'LON'], axis=1, inplace=True)
    
    new_observations['phenophase'] = 516
    
    return new_observations

SyntaxError: invalid syntax (1399079571.py, line 159)

In [57]:

import glob
import os

cutoff_year = 2010
species_data_cutoff = 10

In [3]:
# import weather data

grib_data = cfgrib.open_datasets('../data/weather_data.grib')

core_data = grib_data[0]

In [67]:
# Filter weather data resolution to just degrees
coarse_weather_data = core_data.coarsen(latitude=10, boundary="trim").mean().coarsen(longitude=10).mean()
coarse_weather_data = coarse_weather_data.dropna("latitude", how="all")

In [68]:
coarse_weather_data

In [76]:
### Load all plant csvs
path = os.getcwd()
parent_dir = os.path.dirname(path)
#print(parent_dir)

final_path = os.path.join(parent_dir, "data/plant phenology/final fruit datasets/*.csv")
#print(final_path)

csv_files = glob.glob(final_path)
#print(csv_files)

# Merge plant data
plant_data_list = []

for f in csv_files:
    df = pd.read_csv(f)
    
    plant_data_list.append(df)
    
final_plant_data = pd.concat(plant_data_list)

# format plant data
final_plant_data["lon_360"] = final_plant_data["LON"] % 360
formatted_plants = claudia_observations_to_pyphenology(final_plant_data)
formatted_plants = formatted_plants[formatted_plants['year'] >= cutoff_year].drop_duplicates()

In [110]:
site_histories = []

# get full site histories from the plant data
for index, row in tqdm(formatted_plants.iterrows()):
    
    site_histories.append(get_site_history_coarse(coarse_weather_data, row['site_id'], row['latitude'], row['lon_360']))

# create site history df, process a bit
full_site_histories = pd.concat(site_histories).dropna()

full_site_histories['year'] = full_site_histories['year'].astype(str).astype(int)
full_site_histories['site_id'] = full_site_histories['site_id'].astype(int)

32137it [10:50, 49.39it/s]


In [None]:
## Daylength and Leap Year corrections

# Correct for leap years
leap_year_key = {60: 61, 
                 91: 92, 
                 121: 122, 
                 152: 153, 
                 182: 183, 
                 213: 214, 
                 244: 245, 
                 274: 275, 
                 305: 306, 
                 335: 336}

corrected_leap_year_histories = full_site_histories.replace({'doy': leap_year_key})

# Day Length Correction
corrected_leap_year_histories['daylength'] = corrected_leap_year_histories.apply(lambda row: daylength(row['doy'], row['latitude']), axis=1)

In [122]:
# last filtering step – drop NAs and make sure the sites match. 
filtered_observations = formatted_plants[formatted_plants['site_id'].isin(corrected_leap_year_histories['site_id'])]
filtered_observations.dropna(inplace=True)
filtered_observations = filtered_observations[filtered_observations['year'] < 2023]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_observations.dropna(inplace=True)


In [181]:
# output formatted data to CSV

filtered_observations.to_csv("../data/model_training_data/all_plants_formatted.csv")
corrected_leap_year_histories.to_csv("../data/model_training_data/all_weather_coarse_formatted.csv")

In [182]:
# Create column of combined genus and species

filtered_observations['sci_name'] = filtered_observations['species'] + " " + filtered_observations['species_actual']

In [188]:
# Suppress Warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Train species-specific models
using_model_names = ['ThermalTime', "Linear", "Sequential", "Unichill", 'Alternating']

model_outputs = {}

for species in tqdm(filtered_observations['sci_name'].unique()):
    print(species)
    
    species_df = filtered_observations[filtered_observations['sci_name'] == species]
    #print(species_df)
    
    
    if len(species_df) < species_data_cutoff:
        print("Not enough data, skipping species. ")
        continue
    
    model_outputs[species] = train_ripeness_percent(species_df, corrected_leap_year_histories, 0.5, models=default_model_names, suppress_output=True)


  0%|                                                                                                                                                                             | 0/228 [00:00<?, ?it/s]

Rubus occidentalis


In [152]:
model_output_df = pd.DataFrame.from_dict(get_ripeness_days(model_outputs), "index").reset_index()
model_output_df.columns = ['species', 'mean_ripeness_day']
model_output_df['formatted_date'] = pd.to_datetime(model_output_df['mean_ripeness_day'], format='%j').dt.strftime('%m-%d')

In [153]:
model_output_df

Unnamed: 0,species,mean_ripeness_day
0,Rubus occidentalis,187.275862
1,Ficus carica,208.833333
2,Olea europaea,214.000000
3,Olea europea,305.088235
4,Morus rubra,161.297872
...,...,...
70,Pyrus communis 'Early cultivar',245.000000
71,Pyrus communis 'Late cultivar',269.624765
72,Pyrus communis 'Williams',255.000000
73,Pyrus communis 'Bartlett',239.833333


In [160]:
model_output_df.to_csv('../data/model_training_data/all_species_model_outputs')

# Apply Ripeness Curves?

In [None]:
# Method 1: 