In [73]:
import pandas as pd
import numpy as np
from skimpy import clean_columns
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

### Count Data

In [40]:
# Read the 2023 count sheet and pass it straight through clean_columns so the
# column names used by later cells (e.g. 'site', 'n_scm_i_m') are available.
scm_counts = pd.read_csv('Data/2023_scm_counts.csv').pipe(clean_columns)
scm_counts

Unnamed: 0,card_id,data_collector,site,date,n_scm_i_m,n_scm_i_f,n_scm_o_m,n_scm_o_f,n_d_florilega_i,n_d_florilega_o,initials,notes
0,2245,Janice Degni,DEG_DIE_HILL,2023-04-21,,,,,,,,
1,2246,Janice Degni,DEG_DIE_HILL,2023-04-21,16.0,10.0,7.0,5.0,0.0,0.0,SN,
2,2265,Janice Degni,DEG_DIE_HILL,2023-04-27,,,,,,,,
3,2266,Janice Degni,DEG_DIE_HILL,2023-04-27,,,,,,,,
4,2373,Janice Degni,DEG_DIE_HILL,2023-05-04,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
421,2272,Marion Zuefle,ZUE_REE,2023-04-26,44.0,34.0,21.0,34.0,0.0,0.0,AS,
422,2383,Marion Zuefle,ZUE_REE,2023-05-02,,,,,,,,
423,2384,Marion Zuefle,ZUE_REE,2023-05-02,,,,,,,,
424,2417,Marion Zuefle,ZUE_REE,2023-05-09,,,,,,,,


### GDD Data

In [44]:
# Read the 2023 GDD table and pass it straight through clean_columns so the
# column names used by later cells (e.g. 'record_id', 'date') are available.
gdd = pd.read_csv("Data/2023_scm_gdd.csv").pipe(clean_columns)
gdd

Unnamed: 0,record_id,date,temp_max_f,temp_min_f,temp_mean_f,gdd_fahrenheit_simple,gdd_fahrenheit_sine,gdd_fahrenheit_simple_cumsum,gdd_fahrenheit_sine_cum_sum,temp_max_c,temp_min_c,temp_mean_c,gdd_celsius_simple,gdd_celsius_sine,gdd_celsius_simple_cum_sum,gdd_celsius_sine_cum_sum
0,POV_DUN,2023-01-01,51,36,43.5,4.5,5.083043,4.5,5.083043,10.555556,2.222222,6.388889,2.500000,2.823913,2.500000,2.823913
1,POV_DUN,2023-01-02,43,37,40.0,1.0,1.509001,5.5,6.592044,6.111111,2.777778,4.444444,0.555556,0.838334,3.055556,3.662247
2,POV_DUN,2023-01-03,48,29,38.5,0.0,2.779669,5.5,9.371713,8.888889,-1.666667,3.611111,0.000000,1.544261,3.055556,5.206507
3,POV_DUN,2023-01-04,52,38,45.0,6.0,6.115838,11.5,15.487552,11.111111,3.333333,7.222222,3.333333,3.397688,6.388889,8.604195
4,POV_DUN,2023-01-05,57,37,47.0,8.0,8.273364,19.5,23.760915,13.888889,2.777778,8.333333,4.444444,4.596313,10.833333,13.200508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10131,ONE_STA_1,2023-06-26,80,65,72.5,33.5,33.500000,1438.0,1496.852715,26.666667,18.333333,22.500000,18.611111,18.611111,798.888889,831.584842
10132,ONE_STA_1,2023-06-27,85,65,75.0,36.0,36.000000,1474.0,1532.852715,29.444444,18.333333,23.888889,20.000000,20.000000,818.888889,851.584842
10133,ONE_STA_1,2023-06-28,80,62,71.0,32.0,32.000000,1506.0,1564.852715,26.666667,16.666667,21.666667,17.777778,17.777778,836.666667,869.362619
10134,ONE_STA_1,2023-06-29,67,59,63.0,24.0,24.000000,1530.0,1588.852715,19.444444,15.000000,17.222222,13.333333,13.333333,850.000000,882.695953


Farms with first emergence data:

- DIP_CUR
- DIP_FLE
- POV_DUN
- GAB_STE

In [57]:
# Site codes of the farms that have first-emergence observations; these are
# matched against scm_counts['site'] and gdd['record_id'] in the comparison loop.
first_emergence_farms = [
    'DIP_CUR',
    'DIP_FLE',
    'POV_DUN',
    'GAB_STE',
]
first_emergence_farms

['DIP_CUR', 'DIP_FLE', 'POV_DUN', 'GAB_STE']

In [105]:
# Compare, farm by farm, the observed first-emergence date with the dates at
# which each cumulative-GDD model crosses its prediction threshold.

# Count columns inspected for a first catch: a value >= 1 in ANY of them marks
# emergence. (The previous filter listed 'n_scm_o_f' twice and never looked at
# 'n_scm_i_f', so a catch recorded only in that column was missed.)
scm_count_cols = ['n_scm_i_m', 'n_scm_i_f', 'n_scm_o_m', 'n_scm_o_f']

# Cumulative GDD (Fahrenheit) values at which each model predicts emergence.
# NOTE(review): the source of these two thresholds is not recorded in this notebook - confirm.
simple_avg_gdd_threshold = 301
sine_wave_gdd_threshold = 222

for farm in first_emergence_farms:
    # Restrict both tables to the rows belonging to this farm.
    farm_counts = scm_counts.loc[scm_counts['site'] == farm]
    farm_gdd = gdd.loc[gdd['record_id'] == farm]

    # Dates are 'YYYY-MM-DD' strings, so min() yields the earliest qualifying
    # date without relying on the rows already being in chronological order.
    pred_avg = farm_gdd.loc[
        farm_gdd['gdd_fahrenheit_simple_cumsum'] >= simple_avg_gdd_threshold, 'date'
    ].min()
    pred_sin = farm_gdd.loc[
        farm_gdd['gdd_fahrenheit_sine_cum_sum'] >= sine_wave_gdd_threshold, 'date'
    ].min()

    # Missing counts (NaN) compare as False, so cards with no counts are ignored.
    caught_any = farm_counts[scm_count_cols].ge(1).any(axis=1)
    actual = farm_counts.loc[caught_any, 'date'].min()

    # Positive offsets mean the model predicted emergence after it was observed;
    # a negative number would mean the prediction was early.
    actual_day = datetime.strptime(actual, '%Y-%m-%d').date()
    avg_offset_days = (datetime.strptime(pred_avg, '%Y-%m-%d').date() - actual_day).days
    sin_offset_days = (datetime.strptime(pred_sin, '%Y-%m-%d').date() - actual_day).days

    print('The actual first emergence date at ' + farm + ' is ' + actual)

    print('The predicted first emergence date using the simple average model at ' + farm + ' is ' + pred_avg)
    print('The predicted date using the simple average model was ' + str(avg_offset_days) + ' days late')

    print('The predicted first emergence date using the sine wave model at ' + farm + ' is ' + pred_sin)
    print('The predicted date using the sine wave model was ' + str(sin_offset_days) + ' days late')

    # Blank separator line between farms.
    print(' ')
    

The actual first emergence date at DIP_CUR is 2023-04-06
The predicted first emergence date using the simple average model at DIP_CUR is 2023-04-23
The predicted date using the simple average model was 17 days late
The predicted first emergence date using the sine wave model at DIP_CUR is 2023-04-14
The predicted date using the sine wave model was 8 days late
 
The actual first emergence date at DIP_FLE is 2023-03-28
The predicted first emergence date using the simple average model at DIP_FLE is 2023-04-21
The predicted date using the simple average model was 24 days late
The predicted first emergence date using the sine wave model at DIP_FLE is 2023-04-12
The predicted date using the sine wave model was 15 days late
 
The actual first emergence date at POV_DUN is 2023-03-28
The predicted first emergence date using the simple average model at POV_DUN is 2023-04-22
The predicted date using the simple average model was 25 days late
The predicted first emergence date using the sine wave m

Simple Average Model - GDD Average for Actual Emergence

In [109]:
# Hand-entered GDD values, averaged for the simple average model.
# NOTE(review): presumably one value per farm in first_emergence_farms, read off
# at each farm's actual emergence date - confirm and ideally derive from `gdd`.
simple_model_gdd = np.array([54, 62.5, 94, 58.5])
simple_model_gdd.mean()

67.25

Sine Wave Model - GDD Average for Actual Emergence

In [110]:
# Hand-entered GDD values, averaged for the sine wave model.
# NOTE(review): presumably one value per farm in first_emergence_farms, read off
# at each farm's actual emergence date - confirm and ideally derive from `gdd`.
sine_model_gdd = np.array([109.4494181, 118.5207495, 147.4329546, 130.9217006])
sine_model_gdd.mean()

126.5812057