## Time Series
Columns
* Sitename
* Range
* Column
* Date
* Canopy Height
* Cumulative GDD
* Environmental Factors
    * Min / Max / Mean Temp
    * Humidity
    * others...

In [1]:
import datetime
import numpy as np
import pandas as pd
import sqlalchemy
import sqlite3

In [4]:
# Load the raw season 4 trait export. low_memory=False makes pandas parse the
# file in a single pass instead of inferring dtypes chunk by chunk.
df_0 = pd.read_csv('data/raw/mac_season_4.csv', low_memory=False)
df_0.shape

(372363, 39)

In [5]:
# List the raw column names to decide which ones to keep.
df_0.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [13]:
# Columns of the raw export that are not carried into the time series table.
cols_to_drop = [
    'Unnamed: 0',
    'checked',
    'result_type',
    'id',
    'citation_id',
    'treatment_id',
    'city',
    'scientificname',
    'commonname',
    'genus',
    'species_id',
    'author',
    'citation_year',
    'treatment',
    'time',
    'raw_date',
    'month',
    'year',
    'dateloc',
    'trait_description',
    'units',
    'n',
    'statname',
    'stat',
    'notes',
    'access_level',
    'entity',
    'method_name',
    'view_url',
    'edit_url',
]

In [14]:
# Work on a trimmed copy; df_0 itself is left unchanged.
df_1 = df_0.drop(columns=cols_to_drop)
# df_1.head()

Unnamed: 0,site_id,sitename,lat,lon,cultivar_id,date,trait,mean,cultivar
0,6000005673,MAC Field Scanner Season 4 Range 11 Column 5,33.074907,-111.974982,6000000730,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI181083
1,6000005676,MAC Field Scanner Season 4 Range 11 Column 6,33.074907,-111.974966,6000000231,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI564163
2,6000005685,MAC Field Scanner Season 4 Range 11 Column 9,33.074907,-111.974917,6000000860,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI52606
3,6000005691,MAC Field Scanner Season 4 Range 11 Column 11,33.074907,-111.974884,6000000863,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI533792
4,6000005700,MAC Field Scanner Season 4 Range 11 Column 14,33.074907,-111.974835,6000000869,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI535794


In [15]:
# df_1.shape

(372363, 9)

In [16]:
# print(df_1.date.min())
# print(df_1.date.max())

2017 Apr 25 (America/Phoenix)
2017 Sep 15 (America/Phoenix)


In [17]:
# Do all dates in the df contain the string (America/Phoenix)?
# Tally the date strings that carry the timezone label.

count = sum(1 for d in df_1.date.values if 'America/Phoenix' in d)

count

16482

In [18]:
# Do all the dates start with 2017?
# Tally the date strings whose first four characters are the year 2017.

count = sum(1 for d in df_1.date.values if d.startswith('2017'))

count

372363

#### I. Change date values to ISO date format for time series index

In [21]:
# Remove the trailing timezone label from date strings so they can be parsed
# as plain dates, e.g. '2017 Jun 14 (America/Phoenix)' -> '2017 Jun 14'.
# The suffix is matched explicitly (instead of slicing a fixed 18 characters
# off any string that merely contains 'Phoenix') so that only strings that
# really end with the label are shortened; all others pass through unchanged.
tz_suffix = ' (America/Phoenix)'

new_dates = [
    d[:-len(tz_suffix)] if d.endswith(tz_suffix) else d
    for d in df_1.date.values
]

print(len(new_dates))

372363


In [26]:
# Parse the cleaned date strings (e.g. '2017 Jun 14') into pandas datetimes.
# No explicit format is passed, so pandas infers it from the strings.
iso_format_dates = pd.to_datetime(new_dates)

In [27]:
# Add the parsed dates as a new column; the original string column `date`
# stays in place for now.
df_1['date_1'] = iso_format_dates
df_1.head()

Unnamed: 0,site_id,sitename,lat,lon,cultivar_id,date,trait,mean,cultivar,date_1
0,6000005673,MAC Field Scanner Season 4 Range 11 Column 5,33.074907,-111.974982,6000000730,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI181083,2017-06-14
1,6000005676,MAC Field Scanner Season 4 Range 11 Column 6,33.074907,-111.974966,6000000231,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI564163,2017-06-14
2,6000005685,MAC Field Scanner Season 4 Range 11 Column 9,33.074907,-111.974917,6000000860,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI52606,2017-06-14
3,6000005691,MAC Field Scanner Season 4 Range 11 Column 11,33.074907,-111.974884,6000000863,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI533792,2017-06-14
4,6000005700,MAC Field Scanner Season 4 Range 11 Column 14,33.074907,-111.974835,6000000869,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI535794,2017-06-14


#### II. Change sitenames to strip E and W subplot designations

In [28]:
# Build a list of plot names with a trailing ' W' / ' E' subplot designation
# removed; names without such a suffix are kept as they are.
# str.endswith accepts a tuple of suffixes, which replaces the bitwise `|`
# between two boolean results.
subplot_suffixes = (' W', ' E')

sitename_values = df_1.sitename.values
no_e_w_names = [
    name[:-2] if name.endswith(subplot_suffixes) else name
    for name in sitename_values
]

print(len(no_e_w_names))

372363


In [29]:
# Store the suffix-free plot names next to the original `sitename` column.
df_1['sitename_1'] = no_e_w_names
# df_1.sample(n=7)

Unnamed: 0,site_id,sitename,lat,lon,cultivar_id,date,trait,mean,cultivar,date_1,sitename_1
63726,6000005240,MAC Field Scanner Season 4 Range 2 Column 6,33.074584,-111.974966,6000001055,2017 Jun 3,surface_temperature,37.518488,Big_Kahuna,2017-06-03,MAC Field Scanner Season 4 Range 2 Column 6
111475,6000005482,MAC Field Scanner Season 4 Range 32 Column 14,33.075662,-111.974835,6000000837,2017 Jul 20,leaf_angle_alpha,3.51972,PI455217,2017-07-20,MAC Field Scanner Season 4 Range 32 Column 14
370963,6000005802,MAC Field Scanner Season 4 Range 35 Column 15,33.07577,-111.974819,6000000462,2017 Jun 5,leaf_angle_chi,2.089622,PI145626,2017-06-05,MAC Field Scanner Season 4 Range 35 Column 15
46496,6000005585,MAC Field Scanner Season 4 Range 9 Column 2,33.074835,-111.975031,6000000916,2017 Apr 29,surface_temperature,26.08938,PI569416,2017-04-29,MAC Field Scanner Season 4 Range 9 Column 2
145566,6000005728,MAC Field Scanner Season 4 Range 28 Column 7,33.075518,-111.97495,6000000742,2017 Jun 8,leaf_angle_mean,0.397646,PI221651,2017-06-08,MAC Field Scanner Season 4 Range 28 Column 7
350560,6000005947,MAC Field Scanner Season 4 Range 45 Column 4,33.076129,-111.974999,6000000962,2017 Jul 6,canopy_height,231.0,PI570254,2017-07-06,MAC Field Scanner Season 4 Range 45 Column 4
2528,6000005333,MAC Field Scanner Season 4 Range 50 Column 5,33.076309,-111.974983,6000000862,2017 Jul 11 (America/Phoenix),lodging_present,0.0,PI527045,2017-07-11,MAC Field Scanner Season 4 Range 50 Column 5


In [31]:
# df_1.loc[df_1.sitename.str.endswith(' E')].iloc[0]

site_id                                           6000007204
sitename       MAC Field Scanner Season 4 Range 9 Column 3 E
lat                                                  33.0748
lon                                                 -111.975
cultivar_id                                       6000000935
date                                             2017 Jun 14
trait                                       leaf_temperature
mean                                                  282.25
cultivar                                            PI569457
date_1                                   2017-06-14 00:00:00
sitename_1       MAC Field Scanner Season 4 Range 9 Column 3
Name: 2888, dtype: object

#### III. Extract Range and Column Values

In [32]:
# Pull the integer range and column numbers out of the plot name, e.g.
# 'MAC Field Scanner Season 4 Range 11 Column 5' -> range 11, column 5.
# Raw strings keep `\d` a regex token instead of an invalid string escape, and
# expand=False makes str.extract return a Series for the single capture group,
# so a Series (not a one-column DataFrame) is assigned to each new column.
df_1['range'] = df_1['sitename'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_1['column'] = df_1['sitename'].str.extract(r"Column (\d+)", expand=False).astype(int)

# df_1.sample(n=7)

Unnamed: 0,site_id,sitename,lat,lon,cultivar_id,date,trait,mean,cultivar,date_1,sitename_1,range,column
16840,6000006066,MAC Field Scanner Season 4 Range 54 Column 12,33.076453,-111.974868,6000001055,2017 Jun 19,leaf_angle_chi,2.10951,Big_Kahuna,2017-06-19,MAC Field Scanner Season 4 Range 54 Column 12,54,12
141943,6000005461,MAC Field Scanner Season 4 Range 19 Column 5,33.075195,-111.974982,6000000977,2017 May 19,surface_temperature,37.241571,PI585461,2017-05-19,MAC Field Scanner Season 4 Range 19 Column 5,19,5
331191,6000005954,MAC Field Scanner Season 4 Range 48 Column 8,33.076237,-111.974934,6000000712,2017 Sep 11 (America/Phoenix),aboveground_fresh_biomass,57130.0,PI156326,2017-09-11,MAC Field Scanner Season 4 Range 48 Column 8,48,8
196187,6000005985,MAC Field Scanner Season 4 Range 49 Column 2,33.076273,-111.975032,6000000904,2017 Jul 3,surface_temperature,35.705957,PI563338,2017-07-03,MAC Field Scanner Season 4 Range 49 Column 2,49,2
280295,6000005944,MAC Field Scanner Season 4 Range 45 Column 3,33.076129,-111.975016,6000000964,2017 Aug 21,absorbance_850,0.413,PI570373,2017-08-21,MAC Field Scanner Season 4 Range 45 Column 3,45,3
313182,6000005945,MAC Field Scanner Season 4 Range 48 Column 6,33.076237,-111.974966,6000000893,2017 Aug 21,leaf_angle_alpha,1.271984,PI563020,2017-08-21,MAC Field Scanner Season 4 Range 48 Column 6,48,6
102753,6000006057,MAC Field Scanner Season 4 Range 53 Column 12,33.076417,-111.974868,6000001055,2017 Jun 27,canopy_height,236.0,Big_Kahuna,2017-06-27,MAC Field Scanner Season 4 Range 53 Column 12,53,12


#### Drop duplicated Columns, Reorder, Rename, & Sort by Date
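* `site_id` is dropped because the E and W subplots carry different ids, so it no longer matches the sitename once the subplot designation is stripped.
* `sitename` and `date` are dropped because `sitename_1` and `date_1` replace them.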

In [33]:
# Remove the id column and the two original columns that now have cleaned
# counterparts (`sitename_1`, `date_1`).
df_2 = df_1.drop(columns=['site_id', 'sitename', 'date'])
df_2.shape

(372363, 10)

In [34]:
# Show the remaining column names before reordering them.
df_2.columns

Index(['lat', 'lon', 'cultivar_id', 'trait', 'mean', 'cultivar', 'date_1',
       'sitename_1', 'range', 'column'],
      dtype='object')

In [35]:
# Target column order: date and plot identifiers first, then location,
# cultivar, and finally the trait name with its value.
col_reorder = [
    'date_1',
    'sitename_1',
    'range',
    'column',
    'lat',
    'lon',
    'cultivar',
    'cultivar_id',
    'trait',
    'mean',
]

In [37]:
# Reorder the columns by selecting them in the desired order. Unlike
# pd.DataFrame(data=df_2, columns=...), which silently creates an all-NaN
# column for any name that df_2 does not have, this raises a KeyError on a
# misspelled or missing column. The row index of df_2 is kept as is.
df_3 = df_2[col_reorder]
# df_3.head()

Unnamed: 0,date_1,sitename_1,range,column,lat,lon,cultivar,cultivar_id,trait,mean
0,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 5,11,5,33.074907,-111.974982,PI181083,6000000730,leaf_desiccation_present,0.0
1,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 6,11,6,33.074907,-111.974966,PI564163,6000000231,leaf_desiccation_present,0.0
2,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 9,11,9,33.074907,-111.974917,PI52606,6000000860,leaf_desiccation_present,0.0
3,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 11,11,11,33.074907,-111.974884,PI533792,6000000863,leaf_desiccation_present,0.0
4,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 14,11,14,33.074907,-111.974835,PI535794,6000000869,leaf_desiccation_present,0.0


In [38]:
# Give the cleaned columns their final names.
final_column_names = {
    'date_1': 'date',
    'sitename_1': 'sitename',
    'mean': 'value',
}
df_4 = df_3.rename(columns=final_column_names)
# df_4.tail()

Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,cultivar_id,trait,value
372358,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 9,44,9,33.076093,-111.974917,PI329286,6000000555,leaf_angle_beta,2.052416
372359,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 9,44,9,33.076093,-111.974917,PI329286,6000000555,leaf_angle_chi,2.140986
372360,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,6000000813,leaf_angle_mean,0.401801
372361,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,6000000813,leaf_angle_alpha,3.217374
372362,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,6000000813,leaf_angle_beta,1.980157


In [39]:
# Order rows chronologically, then print both shapes to confirm that sorting
# did not change the number of rows or columns.
df_5 = df_4.sort_values('date')
print(df_5.shape, df_4.shape, sep='\n')
# df_5.head()

(372363, 10)
(372363, 10)


Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,cultivar_id,trait,value
4945,2017-04-25,MAC Field Scanner Season 4 Range 7 Column 2,7,2,33.074763,-111.975035,PI303658,6000000553,planter_seed_drop,66.0
23043,2017-04-25,MAC Field Scanner Season 4 Range 10 Column 3,10,3,33.074871,-111.975011,PI175919,6000000725,planter_seed_drop,67.0
23042,2017-04-25,MAC Field Scanner Season 4 Range 10 Column 3,10,3,33.074871,-111.975019,PI175919,6000000725,planter_seed_drop,81.0
23041,2017-04-25,MAC Field Scanner Season 4 Range 10 Column 2,10,2,33.074871,-111.975027,PI641807,6000000993,planter_seed_drop,75.0
23040,2017-04-25,MAC Field Scanner Season 4 Range 10 Column 2,10,2,33.074871,-111.975035,PI641807,6000000993,planter_seed_drop,71.0


In [40]:
# df_5.tail()

Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,cultivar_id,trait,value
330905,2017-09-15,MAC Field Scanner Season 4 Range 18 Column 14,18,14,33.075159,-111.974835,PI569425,6000000923,aboveground_biomass_moisture,73.98
330904,2017-09-15,MAC Field Scanner Season 4 Range 18 Column 13,18,13,33.075159,-111.974851,PI641817,6000000996,aboveground_biomass_moisture,67.35
24879,2017-09-15,MAC Field Scanner Season 4 Range 35 Column 14,35,14,33.07577,-111.974835,PI619838,6000000989,dry_matter_fraction,0.3696
330911,2017-09-15,MAC Field Scanner Season 4 Range 21 Column 15,21,15,33.075267,-111.974819,PI329632,6000000802,aboveground_biomass_moisture,52.0
267601,2017-09-15,MAC Field Scanner Season 4 Range 12 Column 13,12,13,33.074943,-111.974851,PI641824,6000000215,aboveground_biomass_moisture,69.51


#### III. Add Cumulative GDD

In [42]:
# Load the previously processed table of daily temperatures and GDD values.
gdd_df = pd.read_csv('data/processed/daily_temps_gdd_2019-12-16T120237.csv')
# gdd_df.head()

Unnamed: 0.1,Unnamed: 0,date,day_of_year,air_temp_min,air_temp_max,air_temp_mean,gdd
0,108,2017-04-19,109,18.6,32.9,26.0,16.0
1,109,2017-04-20,110,14.1,33.3,23.5,29.0
2,110,2017-04-21,111,11.1,34.4,24.0,42.0
3,111,2017-04-22,112,14.5,35.5,25.0,57.0
4,112,2017-04-23,113,12.6,37.0,26.5,72.0


In [46]:
# Check for any days that would have a daily value of less than zero
# Daily value = midpoint of the min and max air temperature, minus 10
# (presumably the GDD base temperature -- confirm).

daily_value_list = gdd_df['air_temp_min'].add(gdd_df['air_temp_max']).div(2).sub(10)
print(len(gdd_df), len(daily_value_list), sep='\n')

150
150


In [47]:
# should have no output
# Print every daily value that falls below zero.

negative_daily_values = daily_value_list[daily_value_list < 0]

for value in negative_daily_values:
    print(value)

In [48]:
# Map each date in the GDD table to the value in its `gdd` column.
date_list, gdd_list = gdd_df['date'].values, gdd_df['gdd'].values

date_gdd_dict = {day: gdd for day, gdd in zip(date_list, gdd_list)}
# date_gdd_dict

{'2017-04-19': 16.0,
 '2017-04-20': 29.0,
 '2017-04-21': 42.0,
 '2017-04-22': 57.0,
 '2017-04-23': 72.0,
 '2017-04-24': 86.0,
 '2017-04-25': 100.0,
 '2017-04-26': 113.0,
 '2017-04-27': 127.0,
 '2017-04-28': 140.0,
 '2017-04-29': 148.0,
 '2017-04-30': 158.0,
 '2017-05-01': 170.0,
 '2017-05-02': 186.0,
 '2017-05-03': 202.0,
 '2017-05-04': 220.0,
 '2017-05-05': 238.0,
 '2017-05-06': 256.0,
 '2017-05-07': 264.0,
 '2017-05-08': 273.0,
 '2017-05-09': 280.0,
 '2017-05-10': 289.0,
 '2017-05-11': 302.0,
 '2017-05-12': 319.0,
 '2017-05-13': 336.0,
 '2017-05-14': 352.0,
 '2017-05-15': 365.0,
 '2017-05-16': 374.0,
 '2017-05-17': 385.0,
 '2017-05-18': 398.0,
 '2017-05-19': 410.0,
 '2017-05-20': 422.0,
 '2017-05-21': 438.0,
 '2017-05-22': 456.0,
 '2017-05-23': 476.0,
 '2017-05-24': 498.0,
 '2017-05-25': 519.0,
 '2017-05-26': 537.0,
 '2017-05-27': 552.0,
 '2017-05-28': 570.0,
 '2017-05-29': 588.0,
 '2017-05-30': 606.0,
 '2017-05-31': 624.0,
 '2017-06-01': 643.0,
 '2017-06-02': 661.0,
 '2017-06-03': 6

In [50]:
# NumPy array of the row dates in df_5 (already sorted by date), one per row.
df_date_list = df_5.date.values
len(df_date_list)

372363

In [55]:
# If date in df matches date in date:gdd dict, append gdd value to list
#
# df_date_list holds datetime64 values while date_gdd_dict is keyed by
# 'YYYY-MM-DD' strings, so each row date is rendered as an ISO date string
# before the lookup. A date with no entry in the dict yields NaN, which keeps
# cum_gdd the same length (and order) as df_date_list.
df_date_keys = pd.to_datetime(df_date_list).strftime('%Y-%m-%d')

cum_gdd = [date_gdd_dict.get(key, np.nan) for key in df_date_keys]

In [56]:
# First three dates from the GDD table, used as a small sample to inspect.
sample_dates = date_list[:3]
sample_dates

array(['2017-04-19', '2017-04-20', '2017-04-21'], dtype=object)

In [64]:
# For every date in the GDD table that also occurs in the trait data, print
# the GDD date key, its gdd value, and the first matching trait-data date.
#
# date_gdd_dict is keyed by 'YYYY-MM-DD' strings whereas df_date_list holds
# datetime64 values, so comparing the two directly never matches. The trait
# dates are therefore rendered as ISO date strings once, and matches are found
# with a dict lookup instead of rescanning every row for every key.
df_date_keys = pd.to_datetime(df_date_list).strftime('%Y-%m-%d')

# First trait-data date seen for each ISO key (mirrors stopping at the first match).
first_date_by_key = {}
for key, d in zip(df_date_keys, df_date_list):
    first_date_by_key.setdefault(key, d)

for k, v in date_gdd_dict.items():

    if k in first_date_by_key:
        print(k)
        print(v)
        print(first_date_by_key[k])

#### IV. Add TO Mappings

#### Final Steps
* Create `.csv`

In [66]:
# Switch for the export below: nothing is written while this is False.
need_to_create_csv = False

if need_to_create_csv:

    # Current local time truncated to whole seconds, in ISO format.
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    # Colons are removed from the file name (e.g. ...T120237.csv).
    output_filename = f'tall_format_traits_{timestamp}.csv'.replace(':', '')
    # NOTE(review): to_csv writes the DataFrame index as an unnamed first
    # column by default -- confirm that is wanted in the output file.
    df_5.to_csv(f'data/processed/{output_filename}')