### Maricopa Agricultural Center Season 4

### Citation for Input Trait Data

LeBauer, David et al. (2020), Data From: TERRA-REF, An open reference data set from high resolution genomics, phenomics, and imaging sensors, v6, Dryad, Dataset, https://doi.org/10.5061/dryad.4b8gtht99

#### Environmental weather data 
- Raw data downloaded from the MAC weather station [website](https://cals.arizona.edu/azmet/06.htm)
- Processed using the code from [`weather_data_cleaning.py`](/iplant/home/shared/genophenoenvo/scripts/weather_data_cleaning.py)
- Processed MAC Season 4 [weather data](https://de.cyverse.org/dl/d/6D959379-0442-41FE-8BEE-890866ACF037/mac_season_4_weather.csv)

Please email dlebauer@email.arizona.edu or ejcain@arizona.edu with any questions or comments, or create an issue in this [repository](https://github.com/genophenoenvo/terraref-datasets) 

In [1]:
import datetime
import numpy as np
import os
import pandas as pd
import requests
import sqlite3

In [2]:
def download_csv(url, folder_name, file_name):
    """Download the resource at ``url`` and save it as ``folder_name/file_name``.

    Args:
        url: Address passed to ``requests.get``.
        folder_name: Directory the file is written into; it must already exist.
        file_name: Name of the file created inside ``folder_name``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx or 5xx status.
    """
    response = requests.get(url)
    # Fail loudly on an error status instead of saving the error page as a "CSV".
    response.raise_for_status()
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        f.write(response.content)

In [3]:
def read_in_csv(folder_name, file_name):
    """Read ``folder_name/file_name`` into a DataFrame.

    The path is built with ``os.path.join`` so it matches the location that
    ``download_csv`` writes to, including when ``folder_name`` is empty or
    already ends with a separator.

    Args:
        folder_name: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``folder_name``.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with ``low_memory=False``.
    """
    csv_path = os.path.join(folder_name, file_name)
    return pd.read_csv(csv_path, low_memory=False)

In [4]:
def plot_hist(df, value_column, trait_column):
    """Draw a navy histogram of ``value_column`` labelled with the trait name.

    The x-axis label is the first unique value found in ``trait_column``.

    Returns:
        Whatever ``set_xlabel`` returns for the histogram axes.
    """
    x_label = df[trait_column].unique()[0]
    axes = df[value_column].hist(color='navy')
    return axes.set_xlabel(x_label)

In [5]:
def check_for_nulls_duplicates(df):
    """Print the per-column null counts and the duplicated-row value counts of ``df``."""
    null_counts = df.isnull().sum()
    duplicate_counts = df.duplicated().value_counts()

    report = (
        f'Sum of null values:\n{null_counts}\n-----\n'
        f'Value counts for duplicates:\n{duplicate_counts}'
    )
    print(report)

In [6]:
def check_unique_values(df):
    """Print the number of unique values in every column of ``df``.

    Columns with fewer than five unique values also have those values listed.
    """
    for col in df.columns:
        series = df[col]
        n_unique = series.nunique()

        if n_unique >= 5:
            print(f'{n_unique} values for {col} column')
            continue

        print(f'{n_unique} unique value(s) for {col} column: {series.unique()}')

In [7]:
def extract_range_column_values(working_df, plot_column):
    """Return a copy of ``working_df`` with integer ``range`` and ``column`` columns.

    The numbers are parsed from plot names containing ``Range <n>`` and
    ``Column <n>``, e.g. ``'... Range 10 Column 11'``.

    Args:
        working_df: DataFrame holding the plot names; it is not modified.
        plot_column: Name of the column that contains the plot names.

    Returns:
        A new DataFrame with ``range`` and ``column`` added as integers.

    Raises:
        ValueError: If a plot name lacks either number, because the missing
            value cannot be cast to ``int``.
    """
    new_df = working_df.copy()
    plot_names = new_df[plot_column]

    # Raw strings keep ``\d`` from being read as an (invalid) string escape;
    # expand=False returns a Series rather than a one-column DataFrame.
    new_df['range'] = plot_names.str.extract(r"Range (\d+)", expand=False).astype(int)
    new_df['column'] = plot_names.str.extract(r"Column (\d+)", expand=False).astype(int)

    return new_df

In [8]:
def convert_datetime_column(working_df, date_column):
    """Return a new DataFrame whose ``date_column`` is replaced by a parsed ``date`` column.

    ``date_column`` is parsed with ``pandas.to_datetime`` and removed; the
    parsed values are appended as the last column, named ``date``. The input
    DataFrame is left untouched.
    """
    parsed_dates = pd.to_datetime(working_df[date_column])
    without_raw_dates = working_df.drop(columns=date_column)
    return without_raw_dates.assign(date=parsed_dates)

In [9]:
def rename_value_column(working_df, value_column, trait_column):
    """Rename ``value_column`` after the trait it holds and drop ``trait_column``.

    The new name is the first unique value found in ``trait_column``. A new
    DataFrame is returned; the input is not modified.
    """
    trait_label = working_df[trait_column].unique()[0]
    # Rename first, then drop, matching the original order of operations.
    renamed = working_df.rename(columns={value_column: trait_label})
    return renamed.drop(columns=trait_column)

Blocking height experiment description for season 4 can be found [here](https://terraref.ncsa.illinois.edu/bety/api/v1/experiments?name=~MAC+Season+4:+All+BAP+With+Late+Season+Drought)


In [10]:
def add_season_4_blocking_height(working_df, range_column):
    """Return a copy of ``working_df`` with a ``blocking_height`` column added.

    Each value in ``range_column`` is mapped to ``'short'``, ``'medium'``,
    ``'tall'`` or ``'border'`` using the hard-coded range lists below, which
    cover ranges 1 through 54.

    Args:
        working_df: DataFrame containing the field range numbers; not modified.
        range_column: Name of the column holding the range numbers.

    Returns:
        A new DataFrame with the extra ``blocking_height`` column.

    Raises:
        ValueError: If any range value is not in one of the lists. The
            previous version printed each bad value and then failed with an
            unrelated length-mismatch ``ValueError`` on assignment.
    """
    ranges_by_height = {
        'short': [11, 20, 46, 50],
        'medium': [10, 12, 18, 24, 27, 29, 31, 33, 38, 51],
        'tall': [3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26, 28, 30, 32, 34, 35, 36, 37,
                 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 52],
        'border': [1, 2, 53, 54],
    }
    # Invert to range -> height so each row is a single dictionary lookup.
    height_by_range = {
        field_range: height
        for height, field_ranges in ranges_by_height.items()
        for field_range in field_ranges
    }

    range_values = working_df[range_column].tolist()

    unknown = [r for r in range_values if r not in height_by_range]
    if unknown:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        distinct_unknown = list(dict.fromkeys(unknown))
        raise ValueError(
            f'Error with range value(s) {distinct_unknown}: '
            'no season 4 blocking height is defined for them'
        )

    working_df_1 = working_df.copy()
    working_df_1['blocking_height'] = [height_by_range[r] for r in range_values]

    return working_df_1

In [11]:
def reorder_columns(working_df, new_col_order_list):
    """Return a DataFrame containing the columns of ``new_col_order_list``, in that order.

    Columns of ``working_df`` that are not listed are left out. A listed name
    that ``working_df`` lacks does not raise; it appears as an all-NaN column.
    """
    return working_df.reindex(columns=new_col_order_list)

In [12]:
def check_for_subplots(df, plot_col):
    """Report whether any plot name in ``plot_col`` ends with a subplot suffix.

    A subplot designation is a plot name ending in ``' E'`` or ``' W'``.
    Every row is inspected; the previous version returned after looking at
    the first row only, and returned ``None`` for an empty DataFrame.

    Args:
        df: DataFrame containing the plot names.
        plot_col: Name of the column holding the plot names.

    Returns:
        ``'This dataset contains subplot designations'`` if at least one name
        has a suffix, otherwise ``'No subplot designations'``.
    """
    subplot_suffixes = (' E', ' W')
    has_subplots = any(
        # Skip non-string entries such as NaN instead of failing on them.
        isinstance(name, str) and name.endswith(subplot_suffixes)
        for name in df[plot_col].values
    )

    if has_subplots:
        return 'This dataset contains subplot designations'
    return 'No subplot designations'

In [13]:
def strip_subplots(working_df, plot_col, new_plot_col_name):
    """Return a new DataFrame with the ``' E'`` / ``' W'`` suffix removed from plot names.

    ``plot_col`` is dropped and the cleaned names are appended as the last
    column under ``new_plot_col_name``. The input DataFrame is not modified.
    """
    subplot_suffixes = (' E', ' W')
    cleaned_names = [
        name[:-2] if name.endswith(subplot_suffixes) else name
        for name in working_df[plot_col].values
    ]

    result = working_df.drop(columns=plot_col).copy()
    result[new_plot_col_name] = cleaned_names
    return result

In [14]:
def save_to_csv_with_timestamp(df, name_of_dataset, output_dir='data/processed'):
    """Write ``df`` to ``<output_dir>/<name_of_dataset>_<timestamp>.csv``.

    The timestamp is the current local time in ISO format at second
    precision. Colons are removed from the file name, as before.

    Args:
        df: DataFrame to save; the index is not written.
        name_of_dataset: Prefix of the output file name.
        output_dir: Directory to write into. Defaults to the previously
            hard-coded ``data/processed`` and is created if missing, since
            nothing else in the notebook creates it.
    """
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    file_name = f'{name_of_dataset}_{timestamp}.csv'.replace(':', '')

    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, file_name), index=False)

In [15]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each DataFrame to the file name at the same position.

    Args:
        list_of_dfs: DataFrames to save; the index is not written.
        list_of_output_filenames: Output paths, paired by position with
            ``list_of_dfs``.

    Raises:
        ValueError: If the two sequences differ in length. ``zip`` would
            otherwise stop at the shorter one and silently skip datasets.
    """
    dfs = list(list_of_dfs)
    filenames = list(list_of_output_filenames)

    if len(dfs) != len(filenames):
        raise ValueError(
            f'Got {len(dfs)} dataframes but {len(filenames)} output filenames'
        )

    for df, filename in zip(dfs, filenames):
        df.to_csv(filename, index=False)

#### A. Aboveground Dry Biomass

In [16]:
# Directory that receives every downloaded input file.
folder_name = 'data'
# exist_ok=True makes re-running this cell a no-op without a separate existence check.
os.makedirs(folder_name, exist_ok=True)

In [17]:
aboveground_dry_biomass_s4_url = 'https://de.cyverse.org/dl/d/6FB820CF-25A8-4691-AEB4-3B2B17BF3DDE/season_4_aboveground_dry_biomass_manual.csv'
aboveground_dry_biomass_s4_input_filename = 'aboveground_dry_biomass_s4.csv'

In [18]:
download_csv(aboveground_dry_biomass_s4_url, folder_name=folder_name, file_name=aboveground_dry_biomass_s4_input_filename)

In [19]:
adb_0 = read_in_csv(folder_name=folder_name, file_name=aboveground_dry_biomass_s4_input_filename)
# print(adb_0.shape)
# adb_0.head()

In [20]:
# plot_hist(adb_0, 'mean', 'trait')

In [21]:
# check_for_nulls_duplicates(adb_0)

In [22]:
# check_unique_values(adb_0)

In [23]:
adb_1 = extract_range_column_values(adb_0, 'plot')
# print(adb_1.shape)
# adb_1.sample(n=3)

In [24]:
adb_2 = convert_datetime_column(adb_1, 'date')
# print(adb_2.shape)
# adb_2.head()

In [25]:
adb_3 = rename_value_column(adb_2, 'mean', 'trait')
# print(adb_3.shape)
# adb_3.tail()

In [26]:
cols_to_drop = ['checked', 'author', 'season']

adb_4 = adb_3.drop(labels=cols_to_drop, axis=1)
# print(adb_4.shape)
# adb_4.head(3)

In [27]:
adb_5 = add_season_4_blocking_height(adb_4, 'range')
# print(adb_5.shape)
# adb_5.sample(n=3)

##### Add units (kg/ha) column to aboveground dry biomass dataset

In [28]:
adb_6 = adb_5.copy()
adb_6['units'] = 'kg/ha'

# print(adb_6.shape)
# adb_6.tail(3)

In [29]:
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'aboveground_dry_biomass', 'units', 'method_type']

adb_7 = reorder_columns(adb_6, new_col_order)
# print(adb_7.shape)
# adb_7.head(3)

#### B. Canopy Height - Sensor

In [30]:
canopy_height_sensor_s4_url = 'https://de.cyverse.org/dl/d/CD0093FE-7576-4BDC-B309-51C592A528DF/season_4_canopy_height_sensor.csv'
canopy_height_sensor_s4_input_filename = 'canopy_height_sensor_s4.csv'

In [31]:
download_csv(canopy_height_sensor_s4_url, folder_name=folder_name, file_name=canopy_height_sensor_s4_input_filename)

In [32]:
ch_0 = read_in_csv(folder_name=folder_name, file_name=canopy_height_sensor_s4_input_filename)
# print(ch_0.shape)
# ch_0.head()

In [33]:
# check_unique_values(ch_0)

In [34]:
# check_for_nulls_duplicates(ch_0)

#### Drop duplicates

In [35]:
# Drop rows that repeat an earlier row in every column, then renumber the index from 0.
ch_1 = ch_0.drop_duplicates(ignore_index=True)
# print(ch_1.shape)
# check_for_nulls_duplicates(ch_1)

In [36]:
check_for_subplots(ch_1, 'plot')

'No subplot designations'

In [37]:
ch_2 = extract_range_column_values(ch_1, 'plot')
# print(ch_2.shape)
# ch_2.sample(n=3)

In [38]:
ch_3 = convert_datetime_column(ch_2, 'date')
# print(ch_3.shape)
# ch_3.dtypes

In [39]:
ch_4 = rename_value_column(ch_3, 'mean', 'trait')
# print(ch_4.shape)
# ch_4.tail(3)

In [40]:
ch_5 = add_season_4_blocking_height(ch_4, 'range')
# ch_5.sample(n=3)

In [41]:
ch_6 = ch_5.drop(labels=['checked', 'author', 'season'], axis=1)
# print(ch_6.shape)

#### Add units column
- cm

In [42]:
ch_7 = ch_6.copy()
ch_7['units'] = 'cm'
# print(ch_7.shape)
# ch_7.head(3)

In [43]:
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'canopy_height', 'units', 'method_type']

ch_8 = reorder_columns(ch_7, new_col_order)
# print(ch_8.shape)
# ch_8.head(3)

#### C. Canopy Height - Manual
- using SQLite for `groupby`

In [44]:
canopy_height_manual_s4_url = 'https://de.cyverse.org/dl/d/AE10E379-BBBA-428C-AAC6-7B27296316B7/season_4_canopy_height_manual.csv'
canopy_height_manual_s4_input_filename = 'canopy_height_manual_s4.csv'

In [45]:
download_csv(canopy_height_manual_s4_url, folder_name=folder_name, file_name=canopy_height_manual_s4_input_filename)

In [46]:
# Load the downloaded manual canopy height CSV.
chm_0 = read_in_csv(folder_name=folder_name, file_name=canopy_height_manual_s4_input_filename)
# print(chm_0.shape)
# chm_0.head()

In [47]:
# check_for_nulls_duplicates(chm_0)

In [48]:
# check_unique_values(chm_0)

In [49]:
chm_1 = extract_range_column_values(chm_0, 'plot')
# print(chm_1.shape)
# chm_1.sample(n=3)

In [50]:
chm_2 = convert_datetime_column(chm_1, 'date')
# print(chm_2.shape)
# chm_2.head()

#### Identify and Remove Subplot Designations

In [51]:
# check_for_subplots(chm_2, 'plot')
# incorrectly returns 'No subplot designations'

In [52]:
chm_2.loc[(chm_2['plot'].str.endswith(' W')) | (chm_2['plot'].str.endswith(' E'))].shape

(5803, 14)

In [53]:
chm_3 = strip_subplots(chm_2, 'plot', 'plot')
# print(chm_3.shape)
# chm_3.sample(n=3)

In [54]:
# check_for_subplots(chm_3, 'plot')

In [55]:
# check for plot/date/mean/treatment duplicates

# chm_3.duplicated(subset=['plot', 'date', 'mean', 'treatment']).value_counts()

In [56]:
# Drop Duplicates

chm_4 = chm_3.drop_duplicates(ignore_index=True, subset=['plot', 'genotype', 'treatment', 'mean', 'range', 'column',
                                                        'date'])

# print(chm_4.shape)
# chm_4.duplicated().value_counts()

In [57]:
chm_5 = add_season_4_blocking_height(chm_4, 'range')
# print(chm_5.shape)
# chm_5.sample(n=3)

#### Use sqlite database to group by `plot`, `date`, and `mean` 
- rename `mean` to `canopy_height_cm`
- can also drop and reorder columns at this time

In [58]:
# Open the SQLite database file used for the aggregation query below
# (sqlite3.connect creates the file if it does not exist yet).
conn = sqlite3.connect('data/canopy_heights_manual_season_4.sqlite')
# NOTE(review): `cursor` is not used by any visible cell and `conn` is never closed in the
# visible part of the notebook - confirm whether either is needed later.
cursor = conn.cursor()
print("Opened database successfully")

Opened database successfully


In [60]:
# Writes chm_5 into the SQLite table that the query cell below reads from.
# NOTE(review): while the call stays commented out, a fresh run has no table to query.
# Consider `chm_5.to_sql('canopy_heights_manual_season_4.sqlite', conn, if_exists='replace')`
# so the cell can simply be re-run instead of being toggled by hand.
# comment next line out if db has already been created
# chm_5.to_sql('canopy_heights_manual_season_4.sqlite', conn)

In [61]:
# Aggregate the manual canopy heights in SQLite and read the result back into pandas,
# renaming the height to `canopy_height_cm` and ordering rows by date.
# NOTE(review): `[mean]` is part of the GROUP BY key, so each group holds one distinct
# height and ROUND(AVG([mean]), 2) simply returns that height rounded to 2 decimals.
# Rows for the same plot and date with different heights therefore stay separate.
# Confirm this is intended rather than averaging per plot and date.
# The other selected columns are neither grouped nor aggregated; SQLite takes their
# values from one row of each group.
chm_6 = pd.read_sql_query("""
                            SELECT date, plot, range, column, scientificname, genotype, treatment, blocking_height,
                            method, ROUND(AVG([mean]), 2) AS canopy_height_cm, method_type
                            FROM 'canopy_heights_manual_season_4.sqlite'
                            GROUP BY plot, date,[mean]
                            ORDER BY date ASC;
                            """, conn)

print(chm_6.shape)
chm_6.head(3)

(5789, 11)


Unnamed: 0,date,plot,range,column,scientificname,genotype,treatment,blocking_height,method,canopy_height_cm,method_type
0,2017-05-24 00:00:00,MAC Field Scanner Season 4 Range 10 Column 11,10,11,Sorghum bicolor,PI195754,MAC Season 4: BAP water-deficit stress Aug 15-30,medium,Manual canopy height,47.0,manual
1,2017-05-24 00:00:00,MAC Field Scanner Season 4 Range 10 Column 12,10,12,Sorghum bicolor,PI329501,MAC Season 4: BAP water-deficit stress Aug 1-14,medium,Manual canopy height,26.0,manual
2,2017-05-24 00:00:00,MAC Field Scanner Season 4 Range 10 Column 12,10,12,Sorghum bicolor,PI329501,MAC Season 4: BAP water-deficit stress Aug 1-14,medium,Manual canopy height,31.0,manual


#### D. Days & GDD to Flowering

In [62]:
flowering_time_s4_url = 'https://de.cyverse.org/dl/d/106ECC85-52DE-4769-8F3D-CC747C82ECE2/season_4_flowering_time_manual.csv'
flowering_time_s4_input_filename = 'flowering_time_s4.csv'

In [63]:
download_csv(flowering_time_s4_url, folder_name=folder_name, file_name=flowering_time_s4_input_filename)

In [64]:
fl_0 = read_in_csv(folder_name=folder_name, file_name=flowering_time_s4_input_filename)

#### Read in processed weather dataset for season 4

In [65]:
weather_s4_url = 'https://de.cyverse.org/dl/d/6D959379-0442-41FE-8BEE-890866ACF037/mac_season_4_weather.csv'
weather_s4_input_filename = 'weather_s4.csv'

In [66]:
download_csv(weather_s4_url, folder_name=folder_name, file_name=weather_s4_input_filename)

In [67]:
weather_0 = read_in_csv(folder_name=folder_name, file_name=weather_s4_input_filename)
# print(weather_0.shape)
# weather_0.head()

In [68]:
# plot_hist(fl_0, 'mean', 'trait')

In [69]:
# check_for_nulls_duplicates(fl_0)

In [70]:
# check_for_subplots(fl_0, 'plot')

In [71]:
# check_unique_values(fl_0)

#### Add planting date 2017-04-20

In [72]:
day_of_planting = datetime.date(2017,4,20)
flower_df_1 = fl_0.copy()

flower_df_1['date_of_planting'] = day_of_planting
# print(flower_df_1.shape)
# flower_df_1.head(3)

#### Create flowering dates by adding days to flowering (`mean`) to the planting date

In [73]:
# One Timedelta per row, built from the days-to-flowering values stored in `mean`.
timedelta = pd.Series([pd.Timedelta(days=i) for i in flower_df_1['mean'].values])

# Offset the planting date by each row's Timedelta to get its flowering date.
dates_of_flowering = [day_of_planting + td for td in timedelta]

# print(flower_df_1.shape[0])
# print(len(dates_of_flowering))

In [74]:
flower_df_2 = flower_df_1.copy()
flower_df_2['date_of_flowering'] = dates_of_flowering
# print(flower_df_2.shape)
# flower_df_2.head(3)

#### Add GDD to flowering dataframe

In [75]:
# slice weather df for date and cumulative gdd values only

season_4_gdd = weather_0[['date', 'gdd']]
print(season_4_gdd.shape)
season_4_gdd.head(3)

(150, 2)


Unnamed: 0,date,gdd
0,2017-04-20,14.0
1,2017-04-21,26.0
2,2017-04-22,41.0


In [76]:
flower_df_3 = flower_df_2.copy()
flower_df_3.date_of_flowering = pd.to_datetime(flower_df_3.date_of_flowering)
# flower_df_3.dtypes

In [77]:
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
# season_4_gdd_1.dtypes

In [78]:
# Left-join the GDD values onto the flowering records by matching each flowering date to a
# weather date; how='left' keeps every flowering row, with NaN gdd where no date matches.
# NOTE(review): presumably both frames carry a `date` column, which would explain the
# `date_x` / `date_y` columns dropped further down - verify.
flower_df_4 = flower_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flowering', right_on='date')
# print(flower_df_4.shape)
# flower_df_4.head(3)

In [79]:
flower_df_5 = extract_range_column_values(flower_df_4, 'plot')
flower_df_6 = add_season_4_blocking_height(flower_df_5, 'range')

print(flower_df_6.shape)
# flower_df_6.tail(3)

(156, 19)


In [80]:
flower_df_7 = rename_value_column(flower_df_6, 'mean', 'trait')
# flower_df_7.sample(n=3)

In [81]:
flower_df_8 = flower_df_7.rename({'flowering_time': 'days_to_flowering', 'gdd': 'gdd_to_flowering'}, axis=1)
# flower_df_8.head(2)

In [82]:
cols_to_drop = ['date_x', 'checked', 'author', 'season', 'date_of_planting', 'date_y']

flower_df_9 = flower_df_8.drop(labels=cols_to_drop, axis=1)
print(flower_df_9.shape)
# flower_df_9.sample(n=3)

(156, 12)


In [83]:
new_col_order = ['plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'date_of_flowering', 'days_to_flowering', 'gdd_to_flowering', 'method_type']

flower_df_10 = reorder_columns(flower_df_9, new_col_order)
# print(flower_df_10.shape)
# flower_df_10.head(3)

#### E. Days & GDD to Flag Leaf Emergence

In [84]:
flag_leaf_s4_url = 'https://de.cyverse.org/dl/d/DFE10710-4367-4974-91D2-4C6E10DE89D6/season_4_flag_leaf_emergence_time_manual.csv'
flag_leaf_s4_input_filename = 'flag_leaf_s4.csv'

In [85]:
download_csv(flag_leaf_s4_url, folder_name=folder_name, file_name=flag_leaf_s4_input_filename)

In [86]:
fle_0 = read_in_csv(folder_name=folder_name, file_name=flag_leaf_s4_input_filename)

In [87]:
# print(weather_0.shape)
# weather_0.tail()

In [88]:
# plot_hist(fle_0, 'mean', 'trait')

In [89]:
# check_for_nulls_duplicates(fle_0)

In [90]:
# check_for_subplots(fle_0, 'plot')

In [91]:
# check_unique_values(fle_0)

#### Add planting date 2017-04-20

In [92]:
day_of_planting = datetime.date(2017,4,20)
fle_df_1 = fle_0.copy()

fle_df_1['date_of_planting'] = day_of_planting
# print(fle_df_1.shape)
# fle_df_1.head(3)

#### Create timedelta using days to flag leaf emergence (`mean`)

In [93]:
# One Timedelta per row, built from the days-to-flag-leaf-emergence values stored in `mean`.
timedelta = pd.Series([pd.Timedelta(days=i) for i in fle_df_1['mean'].values])

# Offset the planting date by each row's Timedelta to get its flag leaf emergence date.
dates_of_flag_leaf_emergence = [day_of_planting + td for td in timedelta]

# print(fle_df_1.shape[0])
# print(len(dates_of_flag_leaf_emergence))

In [94]:
fle_df_2 = fle_df_1.copy()
fle_df_2['date_of_flag_leaf_emergence'] = dates_of_flag_leaf_emergence
# print(fle_df_2.shape)
# fle_df_2.head(3)

#### Add GDD values to flag leaf emergence dataframe

In [95]:
# slice weather df for date and cumulative gdd values only

season_4_gdd = weather_0[['date', 'gdd']]
# print(season_4_gdd.shape)
# season_4_gdd.head(3)

In [96]:
fle_df_3 = fle_df_2.copy()
fle_df_3.date_of_flag_leaf_emergence = pd.to_datetime(fle_df_3.date_of_flag_leaf_emergence)
# fle_df_3.dtypes

In [97]:
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
# season_4_gdd_1.dtypes

In [98]:
fle_df_4 = fle_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flag_leaf_emergence', right_on='date')
# print(fle_df_4.shape)
# fle_df_4.head(3)

In [99]:
fle_df_5 = extract_range_column_values(fle_df_4, 'plot')
fle_df_6 = add_season_4_blocking_height(fle_df_5, 'range')

# print(fle_df_6.shape)
# fle_df_6.tail(3)

In [100]:
fle_df_7 = rename_value_column(fle_df_6, 'mean', 'trait')
# fle_df_7.sample(n=3)

In [101]:
fle_df_8 = fle_df_7.rename({'flag_leaf_emergence_time': 'days_to_flag_leaf_emergence', 'gdd': 'gdd_to_flag_leaf_emergence'}, axis=1)
# fle_df_8.head(2)

In [102]:
cols_to_drop = ['date_x', 'checked', 'author', 'season', 'date_of_planting', 'date_y']

fle_df_9 = fle_df_8.drop(labels=cols_to_drop, axis=1)
print(fle_df_9.shape)
# fle_df_9.sample(n=3)

(176, 12)


In [103]:
new_col_order = ['plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'date_of_flag_leaf_emergence', 'days_to_flag_leaf_emergence', 
                 'gdd_to_flag_leaf_emergence', 'method_type']

fle_df_10 = reorder_columns(fle_df_9, new_col_order)
# print(fle_df_10.shape)
# fle_df_10.head(3)

### Save all datasets to separate csv files

In [104]:
# Final DataFrame of each dataset section; the order must line up with the file names below.
list_of_dfs = [adb_7, ch_8, chm_6, flower_df_10, fle_df_10]
# Output CSV names, paired by position with list_of_dfs. They are relative names, so the
# files are written to the current working directory.
list_of_output_filenames = ['mac_season_4_aboveground_dry_biomass.csv',
                           'mac_season_4_canopy_height_sensor.csv',
                           'mac_season_4_canopy_height_manual.csv',
                           'mac_season_4_days_gdd_to_flowering.csv',
                           'mac_season_4_days_gdd_to_flag_leaf_emergence.csv']

In [105]:
# Write each DataFrame to its paired CSV file (the index is not written).
save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames)