In [1]:
from pyPhenology import models, utils
import numpy as np

observations, predictors = utils.load_test_data(name='vaccinium', phenophase='flowers')


In [2]:
observations

Unnamed: 0,species,site_id,year,doy,phenophase
48,vaccinium corymbosum,1,1998,122,501
49,vaccinium corymbosum,1,1998,122,501
50,vaccinium corymbosum,1,1991,124,501
51,vaccinium corymbosum,1,1991,124,501
52,vaccinium corymbosum,1,1998,126,501
53,vaccinium corymbosum,1,2000,128,501
54,vaccinium corymbosum,1,2000,128,501
55,vaccinium corymbosum,1,1991,128,501
56,vaccinium corymbosum,1,2001,128,501
57,vaccinium corymbosum,1,2001,128,501


In [3]:
predictors

Unnamed: 0,site_id,temperature,year,doy,latitude,longitude,daylength
0,1,13.10,1990,-65,42.5429,-72.2011,10.24
1,1,13.26,1990,-64,42.5429,-72.2011,10.20
2,1,12.30,1990,-63,42.5429,-72.2011,10.16
3,1,12.15,1990,-62,42.5429,-72.2011,10.11
4,1,13.00,1990,-61,42.5429,-72.2011,10.07
...,...,...,...,...,...,...,...
4351,1,7.93,2001,293,42.5429,-72.2011,10.55
4352,1,10.76,2001,294,42.5429,-72.2011,10.51
4353,1,8.45,2001,295,42.5429,-72.2011,10.46
4354,1,9.43,2001,296,42.5429,-72.2011,10.42


In [2]:
model = models.ThermalTime()

In [3]:
model.fit(observations, predictors)

In [4]:
model.get_params()

{'t1': 28.339538732409196, 'T': 5.4653630887115945, 'F': 357.0335766940111}

In [5]:
model.predict(observations, predictors)

array([126, 126, 127, 127, 126, 129, 129, 127, 132, 132, 133, 133, 132,
       132, 130, 130, 130, 129, 127, 126, 132, 130, 129, 132, 132, 133,
       133, 137, 137, 141, 141, 142, 132, 141, 141, 139, 139, 139, 139,
       137, 137, 141, 141, 141, 141, 142, 142, 142])

In [6]:
observations[observations['year'] == 2001]

Unnamed: 0,species,site_id,year,doy,phenophase
56,vaccinium corymbosum,1,2001,128,501
57,vaccinium corymbosum,1,2001,128,501
71,vaccinium corymbosum,1,2001,134,501
72,vaccinium corymbosum,1,2001,134,501


In [35]:
predictors[4300:]

Unnamed: 0,site_id,temperature,year,doy,latitude,longitude,daylength
4300,1,19.26,2001,242,42.5429,-72.2011,13.02
4301,1,23.08,2001,243,42.5429,-72.2011,12.97
4302,1,14.26,2001,244,42.5429,-72.2011,12.93
4303,1,13.98,2001,245,42.5429,-72.2011,12.88
4304,1,16.25,2001,246,42.5429,-72.2011,12.83
4305,1,16.85,2001,247,42.5429,-72.2011,12.78
4306,1,12.89,2001,248,42.5429,-72.2011,12.73
4307,1,15.45,2001,249,42.5429,-72.2011,12.68
4308,1,19.12,2001,250,42.5429,-72.2011,12.64
4309,1,21.33,2001,251,42.5429,-72.2011,12.59


In [34]:
model.score()

2.9154759474226504

In [35]:
model.save_params(filename='trained_models/blueberry_model.json')

RuntimeWarning: File trained_models/blueberry_model.json exists. User overwrite=True to overwite

In [11]:
model = utils.load_saved_model(filename='trained_models/blueberry_model.json')

In [12]:
model.predict(to_predict=observations, predictors=predictors)

array([126, 126, 127, 127, 126, 129, 129, 127, 132, 132, 133, 133, 132,
       132, 130, 130, 130, 129, 127, 126, 132, 130, 129, 132, 132, 133,
       133, 138, 138, 141, 141, 142, 132, 141, 141, 139, 139, 139, 139,
       138, 138, 141, 141, 141, 141, 142, 142, 142])

In [22]:
species_dict = {
    "apple": [0, 1, 2]
}

current_doy = 2
current_doy2 = 4

def ripeness(species, doy):
    if species not in species_dict:
        return False
    elif doy in species_dict[species]:
        return True
    else:
        return False

In [16]:
ripeness("apple", 4)

False

In [18]:
ripeness("apple", 2)

True

In [23]:
ripeness("peach", 4)

False

**Compare Multiple Models**

In [3]:
m1 = models.ThermalTime()
m2 = models.FallCooling()
m3 = models.M1()
m4 = models.MSB()

ensemble = models.Ensemble(core_models=[m1,m2,m3,m4])
ensemble.fit(observations, predictors)

In [4]:
model_preds = ensemble.predict(observations, predictors, aggregation="none")

In [5]:
ensemble.score()

12.849894616947902

In [11]:
len(model_preds[0])

48

In [7]:
len(observations)

48

Model predictions seem to be for each site. figure out a way to average per site / species? 

Model predicts day of flowering. 

In [13]:
## This is the species/site model prediction function. 

def aic(obs, pred, n_param):
        return len(obs) * np.log(np.mean((obs - pred)**2)) + 2*(n_param + 1)

observations, predictors = utils.load_test_data(name='vaccinium',
                                                phenophase='budburst')

default_models = [models.ThermalTime(), models.FallCooling(), models.M1(), models.MSB()]

default_model_names = ['ThermalTime', "FallCooling", "M1", "MSB"]

def get_site_ripeness(observations, predictors, test_percent, site_id, species, models=['ThermalTime']):
    # filter out train / test
    observations_test = observations.sample(frac=test_percent)
    observations_train = observations.drop(observations_test.index)
    
    # set up model comparisons
    best_aic=np.inf
    best_base_model = None
    best_base_model_name = None

    # iterate through all models
    for model_name in models:
        print("running model {m}".format(m=model_name))
        
        Model = utils.load_model(model_name)
        model = Model()
        model.fit(observations_train, predictors, optimizer_params='practical')
        
        # predict from test observations
        print("making predictions for model {m}".format(m=model_name))        
        preds = model.predict(observations_test, predictors)
        
        # score model
        model_aic = aic(obs = observations_test.doy.values,
                        pred=preds,
                        n_param = len(model.get_params()))

        if model_aic < best_aic:
            best_model = model
            best_model_name = model_name
            best_aic = model_aic

        print('model {m} got an aic of {a}'.format(m=model_name,a=model_aic))

    print('Best model: {m}'.format(m=best_model_name))
    print('Best model paramters:')
    print(best_model.get_params())
    
    ripeness_data = observations_test
    ripeness_data['flowering_day'] = preds
    final_ripeness_data = ripeness_data[(ripeness_data.species == species) & (ripeness_data.site_id == site_id)]
    
    mean_maturation = np.mean(final_ripeness_data['flowering_day'])
    
    prediction_dict = {
        "trained_model": best_model,
        "model_aic": best_aic,
        "full_flowering_data": final_ripeness_data,
        "species_site_flowering days": list(final_ripeness_data['flowering_day']),
        "mean_flowering_day": np.mean(final_ripeness_data['flowering_day'])
    }
    
    print(prediction_dict)
    
    return prediction_dict
    
    

In [14]:
def predict_ripeness(prediction_dict, doy):
    if doy >= prediction_dict['mean_flowering_day']:
        return True
    else:
        return False

In [15]:
all_models = ['ThermalTime']
blueberry_models = get_site_ripeness(observations, predictors, 0.5, 1, "vaccinium corymbosum", models=all_models)

running model ThermalTime
making predictions for model ThermalTime
model ThermalTime got an aic of 71.90210784861087
Best model: ThermalTime
Best model paramters:
{'t1': 86.75113903147067, 'T': 8.623959750612043, 'F': 92.12511077659104}
{'trained_model': <pyPhenology.models.thermaltime.ThermalTime object at 0x107bb0b90>, 'model_aic': 71.90210784861087, 'full_flowering_data':                  species  site_id  year  doy  phenophase  flowering_day
15  vaccinium corymbosum        1  1993  116         371            119
42  vaccinium corymbosum        1  1995  126         371            124
13  vaccinium corymbosum        1  1993  116         371            119
31  vaccinium corymbosum        1  1999  120         371            120
38  vaccinium corymbosum        1  1997  122         371            125
4   vaccinium corymbosum        1  1998  106         371            106
28  vaccinium corymbosum        1  1999  120         371            120
7   vaccinium corymbosum        1  1991  108  

In [16]:
model_output_df = blueberry_models['full_flowering_data']

In [17]:
predict_ripeness(blueberry_models, 200)

True

TODO: implement basic gaussian curve for a few days after the ripening date. ask claudia about this. 

## Experiment with Predictions

In [10]:
observations.groupby('year').size()

year
1990    4
1991    4
1992    4
1993    4
1994    4
1995    4
1996    4
1997    4
1998    4
1999    4
2000    4
2001    4
dtype: int64

In [22]:
model_output_df[['year', 'flowering_day']].drop_duplicates().sort_values('year')

Unnamed: 0,year,flowering_day
10,1990,118
7,1991,99
45,1992,123
15,1993,119
32,1994,120
42,1995,124
19,1996,121
38,1997,125
4,1998,106
31,1999,120


In [28]:
predictors.groupby('year')[['year', 'temperature']].mean()


Unnamed: 0_level_0,year,temperature
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1990,1990.0,7.577989
1991,1991.0,8.576529
1992,1992.0,6.858457
1993,1993.0,7.083196
1994,1994.0,6.755317
1995,1995.0,8.114105
1996,1996.0,6.699366
1997,1997.0,7.208595
1998,1998.0,8.504766
1999,1999.0,8.440551


I think converting things to C instead of keeping in K is a better approach – more variance between years. 

In [30]:
observations

Unnamed: 0,species,site_id,year,doy,phenophase
0,vaccinium corymbosum,1,1991,100,371
1,vaccinium corymbosum,1,1991,100,371
2,vaccinium corymbosum,1,1991,104,371
3,vaccinium corymbosum,1,1998,106,371
4,vaccinium corymbosum,1,1998,106,371
5,vaccinium corymbosum,1,1998,106,371
6,vaccinium corymbosum,1,1998,106,371
7,vaccinium corymbosum,1,1991,108,371
8,vaccinium corymbosum,1,1996,114,371
9,vaccinium corymbosum,1,1990,116,371


In [31]:
test_obs_no_obs = observations[['species', 'site_id', 'year']]

In [32]:
model.predict(to_predict=test_obs_no_obs, predictors=predictors)

array([127, 127, 127, 126, 126, 126, 126, 127, 141, 133, 133, 133, 133,
       130, 130, 130, 130, 141, 141, 141, 141, 129, 129, 129, 132, 132,
       132, 139, 132, 132, 132, 132, 137, 137, 137, 137, 142, 142, 142,
       142, 132, 129, 139, 139, 139, 141, 141, 141])

As predicted, to make predictions you only need species, year, site ID.