## Phenotype Table I
December 2019
#### Traits
* Season 4 biomass at harvest
    * `aboveground_dry_biomass`
    * `aboveground_fresh_biomass`
* Season 4 `growing_degree_days`

#### Notes
* one plot per row
* 2017 daily weather station data downloaded from [MAC Weather Station](https://cals.arizona.edu/azmet/06.htm)
    * raw data [description](https://cals.arizona.edu/azmet/raw2003.htm)
    * `.csv` [gist](https://gist.github.com/MagicMilly/92785c81acdb703c40298230a7b84140)
* [Metadata](https://terraref.ncsa.illinois.edu/bety/api/v1/managements) dates in UTC
* All curated data will use Arizona datetimes

In [1]:
import datetime
import pandas as pd
import numpy as np

In [2]:
df_0 = pd.read_csv('../data/raw/mac_season_4.csv', low_memory=False)
print(df_0.shape)
# df_0.head()

(372363, 39)


### I. Set index to sitename 
* Maintain shape of dataframe
* No grouping
* Strip `E` and `W` from sitenames

In [3]:
e_w_subplots = df_0.loc[(df_0.sitename.str.endswith(' E')) | (df_0.sitename.str.endswith(' W'))]
e_w_subplots.shape

(9460, 39)

In [4]:
new_sitenames = []
e_w_counter = 0

for site in df_0.sitename.values:
    
    if (site.endswith(' W')) | (site.endswith(' E')):
        new_name = site[:-2]
        new_sitenames.append(new_name)
        e_w_counter += 1
        
    else:
        new_sitenames.append(site)
        
print(f'Length of original sitename values list: {len(df_0.sitename.values)}')
print(f'Length new sitename list: {len(new_sitenames)}')
print(f'Number of E and W subplots: {e_w_counter}')

Length of original sitename values list: 372363
Length new sitename list: 372363
Number of E and W subplots: 9460


In [5]:
df_0['plot_name'] = new_sitenames
print(df_0.shape)
# df_0.head()

(372363, 40)


In [6]:
df_1 = df_0.set_index(keys='plot_name')
print(df_1.shape)
# df_1.head()

(372363, 39)


In [7]:
print(f'Number of unique plot names: {df_1.index.nunique()}')

Number of unique plot names: 847


### II. Biomass at harvest

#### A. Aboveground dry biomass

In [8]:
dry_bio_values_only = df_1.loc[df_1.trait == 'aboveground_dry_biomass']
print(f'Number of aboveground dry biomass values in dataset: {dry_bio_values_only.shape[0]}')
print(f'Number of unique plots in biomass df: {dry_bio_values_only.index.nunique()}')

Number of aboveground dry biomass values in dataset: 200
Number of unique plots in biomass df: 199


In [9]:
# Identify the duplicate plot

duplicates = dry_bio_values_only[dry_bio_values_only.index.duplicated(keep=False)]
# duplicates

In [10]:
print(duplicates.index.unique())
print(duplicates['mean'].unique())

Index(['MAC Field Scanner Season 4 Range 20 Column 11'], dtype='object', name='plot_name')
[10950.]


In [11]:
non_duplicate_columns = []

for col in duplicates.columns:
    
    if duplicates.iloc[0][col] != duplicates.iloc[1][col]:
        non_duplicate_columns.append(col)
        
# print(non_duplicate_columns)

for col in non_duplicate_columns:
    print(f'{col}: {duplicates.iloc[0][col]}')
    print(f'{col}: {duplicates.iloc[1][col]}')

Unnamed: 0: 72499
Unnamed: 0: 279789
n: nan
n: nan
statname: nan
statname: nan
stat: nan
stat: nan
notes: nan
notes: nan
entity: nan
entity: nan


#### Create new column `aboveground_dry_biomass` and populate with values
* Null values will be represented by an empty string (after calculations)

In [12]:
# Create new empty column

df_1['aboveground_dry_biomass'] = np.nan

In [13]:
df_2 = df_1.copy()
# df_2.head()

In [14]:
# for rows where trait == aboveground_dry_biomass, use that value to populate new aboveground_dry_biomass column

for index, row in df_2.iterrows():            
    
        if row['trait'] == 'aboveground_dry_biomass':
            df_2.loc[index, ['aboveground_dry_biomass']] = row['mean']

#### B. Aboveground fresh biomass

In [15]:
fresh_bio_values_only = df_1.loc[df_1.trait == 'aboveground_fresh_biomass']
print(f'Number of aboveground fresh biomass values in dataset: {fresh_bio_values_only.shape[0]}')
print(f'Number of unique plots in biomass df: {fresh_bio_values_only.index.nunique()}')

Number of aboveground fresh biomass values in dataset: 543
Number of unique plots in biomass df: 543


#### Create new column `aboveground_fresh_biomass` and populate with values
* Null values will be represented by an empty string (after calculations)

In [16]:
# Create new empty column

df_2['aboveground_fresh_biomass'] = np.nan

In [17]:
df_3 = df_2.copy()

In [18]:
# for rows where trait == aboveground_fresh_biomass, use that value to populate new aboveground_fresh_biomass column

for index, row in df_3.iterrows():            
    
        if row['trait'] == 'aboveground_fresh_biomass':
            df_3.loc[index, ['aboveground_fresh_biomass']] = row['mean']

### III. Drop columns

In [19]:
df_3.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url', 'aboveground_dry_biomass',
       'aboveground_fresh_biomass'],
      dtype='object')

In [20]:
cols_to_drop = ['Unnamed: 0', 'checked', 'site_id', 'result_type', 'id', 'citation_id', 'treatment_id', 'sitename', 'city', 
                'lat', 'lon', 'scientificname', 'commonname', 'genus', 'species_id', 'author', 'citation_year', 
                'treatment', 'time', 'month', 'year', 'dateloc', 'trait', 'trait_description', 'mean', 
                'units', 'n', 'statname', 'stat', 'notes', 'access_level', 'entity', 'method_name', 'view_url', 
                'edit_url',]

In [21]:
df_4 = df_3.drop(labels=cols_to_drop, axis=1)
# df_4.head()

#### A. Sort by raw date so that only the first duplicate `plot_name` (latest date) will be kept
* Can be changed to Arizona time later if needed

In [22]:
df_4.dtypes

cultivar_id                    int64
date                          object
raw_date                      object
cultivar                      object
aboveground_dry_biomass      float64
aboveground_fresh_biomass    float64
dtype: object

In [23]:
df_5 = df_4.sort_values(by=['raw_date'], ascending=False)
# df_5.head()

Unnamed: 0_level_0,cultivar_id,date,raw_date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI534165,,38370.0
MAC Field Scanner Season 4 Range 28 Column 14,6000000804,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI329645,,31550.0
MAC Field Scanner Season 4 Range 32 Column 15,6000000224,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI297130,,43480.0
MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI534165,,38370.0
MAC Field Scanner Season 4 Range 11 Column 13,6000000868,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI535793,25330.0,94640.0


In [24]:
print(f'Number of unique plot names: {df_5.index.nunique()}')
print(' ')
print(f'Shape of original data: {df_0.shape}')
print(f'Shape of current processed data with dropped columns: {df_5.shape}')

Number of unique plot names: 847
 
Shape of original data: (372363, 40)
Shape of current processed data with dropped columns: (372363, 6)


#### B. Drop duplicates by
* `plot_name`
* `cultivar_id`
* `cultivar`
* `aboveground_dry_biomass`
* `aboveground_fresh_biomass`

#### Notes
* Only the latest (harvest) date should be used since df sorted by descending date
* reset index so that duplicates can be checked in axis 1 columns only, rather than having to check for both axes

In [25]:
df_6 = df_5.reset_index()
# df_6.head()

Unnamed: 0,plot_name,cultivar_id,date,raw_date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI534165,,38370.0
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI329645,,31550.0
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI297130,,43480.0
3,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI534165,,38370.0
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI535793,25330.0,94640.0


In [26]:
df_7 = df_6.drop_duplicates(subset=['plot_name', 'cultivar_id', 'cultivar', 'aboveground_dry_biomass',
                                   'aboveground_fresh_biomass'], keep='first')
# df_7.head()

Unnamed: 0,plot_name,cultivar_id,date,raw_date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI534165,,38370.0
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI329645,,31550.0
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI297130,,43480.0
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI535793,25330.0,94640.0
5,MAC Field Scanner Season 4 Range 31 Column 15,6000001007,2017 Sep 15 (America/Phoenix),2017-09-16 00:00:00 -0500,PI641909,,62240.0


In [27]:
print(f'Number of unique plot names: {df_4.index.nunique()}')
print(f'Number of rows after duplicates dropped: {df_7.shape[0]}')

Number of unique plot names: 847
Number of rows after duplicates dropped: 847


In [28]:
# check number of null values

df_7.isnull().sum()

plot_name                      0
cultivar_id                    0
date                           0
raw_date                       0
cultivar                       0
aboveground_dry_biomass      648
aboveground_fresh_biomass    304
dtype: int64

In [29]:
# check proper dates - harvest dates for non-null values should be between 2017-9-10 to 2017-9-15 (Arizona dates)
# the raw harvest UTC dates in the metadata are between 2017-09-11 to 2017-9-16

non_nulls = df_7.loc[(df_6.aboveground_dry_biomass.notna()) | (df_7.aboveground_fresh_biomass.notna())]
non_nulls.shape

(547, 7)

#### C. Drop `raw_date` and use only Arizona datetime for curated dataset

In [33]:
print(f'Arizona harvest dates: {non_nulls.date.unique()}')

Arizona harvest dates: ['2017 Sep 15 (America/Phoenix)' '2017 Sep 14 (America/Phoenix)'
 '2017 Sep 11 (America/Phoenix)' '2017 Sep 10 (America/Phoenix)']


In [39]:
df_8 = df_7.drop(labels='raw_date', axis=1)
# df_8.head()

Unnamed: 0,plot_name,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),PI534165,,38370.0
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,2017 Sep 15 (America/Phoenix),PI329645,,31550.0
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,2017 Sep 15 (America/Phoenix),PI297130,,43480.0
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,2017 Sep 15 (America/Phoenix),PI535793,25330.0,94640.0
5,MAC Field Scanner Season 4 Range 31 Column 15,6000001007,2017 Sep 15 (America/Phoenix),PI641909,,62240.0


#### Convert date to iso format
* Strip date strings of trailing `(America/Phoenix)`
* Convert string to datetime

In [40]:
df_9 = df_8.copy()

In [60]:
date_values = df_9.date.values
new_date_values = []

In [61]:
for d in date_values:
    if d.endswith('(America/Phoenix)'):
        new_date = d[:-18]
        new_date_values.append(new_date)
    else:
        new_date = d
        new_date_values.append(new_date)    

In [65]:
df_9['new_string_date'] = new_date_values
# df_9.head()

Unnamed: 0,plot_name,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass,new_string_date
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,2017 Sep 15 (America/Phoenix),PI534165,,38370.0,2017 Sep 15
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,2017 Sep 15 (America/Phoenix),PI329645,,31550.0,2017 Sep 15
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,2017 Sep 15 (America/Phoenix),PI297130,,43480.0,2017 Sep 15
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,2017 Sep 15 (America/Phoenix),PI535793,25330.0,94640.0,2017 Sep 15
5,MAC Field Scanner Season 4 Range 31 Column 15,6000001007,2017 Sep 15 (America/Phoenix),PI641909,,62240.0,2017 Sep 15


In [67]:
df_10 = df_9.drop(labels='date', axis=1)
# df_10.head()

Unnamed: 0,plot_name,cultivar_id,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass,new_string_date
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,PI534165,,38370.0,2017 Sep 15
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,PI329645,,31550.0,2017 Sep 15
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,PI297130,,43480.0,2017 Sep 15
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,PI535793,25330.0,94640.0,2017 Sep 15
5,MAC Field Scanner Season 4 Range 31 Column 15,6000001007,PI641909,,62240.0,2017 Sep 15


In [70]:
df_10['date'] = pd.to_datetime(df_10['new_string_date'])
# df_10.sample(n=15)

In [72]:
df_11 = df_10.drop(labels='new_string_date', axis=1)
# df_11.head()

Unnamed: 0,plot_name,cultivar_id,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass,date
0,MAC Field Scanner Season 4 Range 50 Column 14,6000001057,PI534165,,38370.0,2017-09-15
1,MAC Field Scanner Season 4 Range 28 Column 14,6000000804,PI329645,,31550.0,2017-09-15
2,MAC Field Scanner Season 4 Range 32 Column 15,6000000224,PI297130,,43480.0,2017-09-15
4,MAC Field Scanner Season 4 Range 11 Column 13,6000000868,PI535793,25330.0,94640.0,2017-09-15
5,MAC Field Scanner Season 4 Range 31 Column 15,6000001007,PI641909,,62240.0,2017-09-15


#### D. Re-order columns / Set index to plot name
* Uncomment line based on index as `plot_name` or not

In [73]:
# new_col_order = ['plot_name', 'cultivar', 'cultivar_id', 'date', 'aboveground_dry_biomass',
#                  'aboveground_fresh_biomass']

# to set plot_name as index
# df_8 = pd.DataFrame(data=df_7, columns=new_col_order, index=df_7.plot_name)

# to keep working with columns without setting index
# df_8 = pd.DataFrame(data=df_7, columns=new_col_order)

# to only set index to plot names without re-ordering columns
df_12 = df_11.set_index(keys='plot_name')
df_12.head()

Unnamed: 0_level_0,cultivar_id,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass,date
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MAC Field Scanner Season 4 Range 50 Column 14,6000001057,PI534165,,38370.0,2017-09-15
MAC Field Scanner Season 4 Range 28 Column 14,6000000804,PI329645,,31550.0,2017-09-15
MAC Field Scanner Season 4 Range 32 Column 15,6000000224,PI297130,,43480.0,2017-09-15
MAC Field Scanner Season 4 Range 11 Column 13,6000000868,PI535793,25330.0,94640.0,2017-09-15
MAC Field Scanner Season 4 Range 31 Column 15,6000001007,PI641909,,62240.0,2017-09-15


In [74]:
df_12.shape

(847, 5)

### Go to final steps before converting to `.csv`

Dates for plots without biomass values?

### IV. Growing Degree Days
* Metadata contains UTC dates, but Arizona datetimes (MST, GMT-7) will be used in curated dataset
* Weather station data in Arizona time
* Air temperature values are in [Celsius](https://cals.arizona.edu/azmet/raw2003.htm)
* Day of Year [Conversion](https://www.esrl.noaa.gov/gmd/grad/neubrew/Calendar.jsp?view=DOY&year=2017&col=4)
* Planting date for Season 4: `2017-04-19`
* Planting date as Day of Year: `109` 
* Harvest Dates for Season 4
    * 1st Day of Harvest
        * `2017-09-10`
        * Day of Year `253`
    * 2nd Day of Harvest
        * `2017-09-11`
        * Day of Year `254`
    * 3rd Day of Harvest
        * `2017-09-14`
        * Day of Year `257`
    * 4th Day of Harvest
        * `2017-09-15`
        * Day of Year `258`

In [42]:
weather_df_0 = pd.read_csv('data/raw/mac_weather_station_raw_daily_2017.csv')
weather_df_0.head()

Unnamed: 0,year,day_of_year,station_number,air_temp_max,air_temp_min,air_temp_mean,rh_max,rh_min,rh_mean,vpd_mean,...,wind_speed_mean,wind_vector_magnitude,wind_vector_direction,wind_direction_std,max_wind_speed,heat_units,eto_azmet,eto_p_m,vapor_pressure_mean,dewpoint_mean
0,2017,1,6,13.6,9.3,11.8,92.7,69.2,83.5,0.23,...,3.5,2.6,188,43,10.9,0.2,1.0,1.2,1.16,9.0
1,2017,2,6,14.9,7.2,10.5,87.7,44.7,71.4,0.39,...,2.2,1.5,129,44,5.8,0.5,1.0,1.6,0.89,5.3
2,2017,3,6,13.9,3.2,9.0,97.0,60.6,81.9,0.24,...,1.0,0.1,349,78,3.3,0.2,0.6,0.9,0.93,5.8
3,2017,4,6,20.4,3.0,11.0,97.8,31.4,73.2,0.46,...,0.9,0.3,76,68,3.6,2.3,1.8,1.5,0.92,5.7
4,2017,5,6,20.9,4.0,12.3,95.5,39.8,71.0,0.53,...,1.5,0.8,253,54,7.1,2.5,2.2,1.8,0.98,6.6


In [43]:
air_temps = weather_df_0[['day_of_year', 'air_temp_max', 'air_temp_min', 'air_temp_mean']]
air_temps.tail()

Unnamed: 0,day_of_year,air_temp_max,air_temp_min,air_temp_mean
360,361,23.2,1.6,10.8
361,362,23.4,-0.6,10.5
362,363,24.4,-0.4,10.9
363,364,25.4,0.2,10.8
364,365,18.9,0.3,8.7


#### Slice dataframe from April to September
UTC Time

* Planting date: 2017-04-20, Day of Year 110
* Last Harvest Date: 2017-09-16, Day of Year 259

Arizona Time
* assume Days of Year are the same?

In [61]:
air_temps_1 = air_temps.loc[(air_temps.day_of_year >= 110) & (air_temps.day_of_year <= 259)]
# air_temps_1.head()

In [None]:
# air_temps_1.tail()

In [None]:
# GDD are cumulative - the below code only calculates a daily value
# sample_df['gdd'] = (((sample_df['air_temp_max'] + sample_df['air_temp_min']) / 2) - 10)

### Growing Degree Days Table - for Reference
* #### A. Weather Station Data - AZ time

In [57]:
air_temps_1.head()

Unnamed: 0,day_of_year,air_temp_max,air_temp_min,air_temp_mean
90,91,22.2,6.6,14.5
91,92,27.5,5.4,17.6
92,93,29.9,13.5,21.1
93,94,25.7,8.9,18.0
94,95,29.0,6.8,18.6


In [63]:
air_temps_2 = air_temps_1.copy()
air_temps_2['gdd'] = np.cumsum((((air_temps_2['air_temp_max'] + air_temps_2['air_temp_min']) / 2) - 10))

In [64]:
air_temps_2.tail()

Unnamed: 0,day_of_year,air_temp_max,air_temp_min,air_temp_mean,gdd
254,255,42.8,24.2,34.0,2955.2
255,256,41.3,24.3,33.9,2978.0
256,257,39.5,22.8,31.4,2999.15
257,258,36.2,21.4,28.5,3017.95
258,259,36.3,18.2,27.6,3035.2


In [71]:
season_dates = pd.date_range(start='4/20/2017', end='9/16/2017').tolist()
# season_dates

In [76]:
air_temps_3 = air_temps_2.copy()
air_temps_3['date'] = season_dates

In [77]:
reorder_cols = ['date', 'day_of_year', 'gdd', 'air_temp_min', 'air_temp_max', 'air_temp_mean']

air_temps_4 = pd.DataFrame(data=air_temps_3, columns=reorder_cols, index=air_temps_3.index)
air_temps_4.head()

Unnamed: 0,date,day_of_year,gdd,air_temp_min,air_temp_max,air_temp_mean
109,2017-04-20,110,13.7,14.1,33.3,23.5
110,2017-04-21,111,26.45,11.1,34.4,24.0
111,2017-04-22,112,41.45,14.5,35.5,25.0
112,2017-04-23,113,56.25,12.6,37.0,26.5
113,2017-04-24,114,70.55,14.9,33.7,25.7


In [78]:
air_temps_4.tail()

Unnamed: 0,date,day_of_year,gdd,air_temp_min,air_temp_max,air_temp_mean
254,2017-09-12,255,2955.2,24.2,42.8,34.0
255,2017-09-13,256,2978.0,24.3,41.3,33.9
256,2017-09-14,257,2999.15,22.8,39.5,31.4
257,2017-09-15,258,3017.95,21.4,36.2,28.5
258,2017-09-16,259,3035.2,18.2,36.3,27.6


In [78]:
%pwd

'/Users/ejcain/UA-AG/phenotypes/terraref-datasets'

In [80]:
air_temps_4.to_csv('data/processed/mac_season_4_temp_gdd_2019-12-12.csv')

In [86]:
test_df = air_temps_4.copy()
test_df_slice = test_df.iloc[249:258]

In [88]:
test_df_slice['gdd'] = np.round(test_df_slice.gdd, decimals=2)
test_df_slice

Unnamed: 0,date,day_of_year,gdd,air_temp_min,air_temp_max,air_temp_mean


In [82]:
air_temps_4.loc[air_temps_4.day_of_year == 116]

Unnamed: 0,date,day_of_year,gdd,air_temp_min,air_temp_max,air_temp_mean
115,2017-04-26,116,96.95,14.7,31.5,24.1


### V. Final Steps before `.csv` conversion
* Change `if` statements to `True` to run cells
* Change names of dataframes as needed
* Change name of output file as needed

#### A. Fill NaN's with empty strings

In [79]:
need_to_fill_nas = False

if need_to_fill_nas:
    
    df_13 = df_12.fillna(value=' ', axis=1)
    df_13.isnull().sum()

#### B. Set `plot_name` as index

In [None]:
need_to_set_index = False

if need_to_set_index:
    
    df_9 = df_8.set_index(keys='plot_name')
    df_9.head()

#### C. Convert to `.csv`

In [81]:
need_to_create_csv = True

if need_to_create_csv:

    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = f'aboveground_biomass_harvest_{timestamp}.csv'.replace(':', '')
    df_13.to_csv(f'data/processed/{output_filename}')

### VI. For Future Tests

#### A. Determine number of rows for processed dataset

### VII. Code Keeping

#### A. Extract Range and Column Values

In [None]:
# df_4['range'] = df_4['sitename'].str.extract("Range (\d+)").astype(int)
# df_4['column'] = df_4['sitename'].str.extract("Column (\d+)").astype(int)