### Maricopa Agricultural Center Season 6

### Citation for Input Trait Data

LeBauer, David et al. (2020), Data From: TERRA-REF, An open reference data set from high resolution genomics, phenomics, and imaging sensors, v6, Dryad, Dataset, https://doi.org/10.5061/dryad.4b8gtht99

##### Environmental weather data can be downloaded from the MAC weather station [website](https://cals.arizona.edu/azmet/06.htm)

Please email dlebauer@email.arizona.edu or ejcain@email.arizona.edu with any questions or comments, or create an issue in this [repository](https://github.com/genophenoenvo/terraref-datasets) 

In [1]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests

In [2]:
def download_csv(url, folder_name, file_name, timeout=60):
    """Download the resource at ``url`` and save it as ``folder_name/file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    folder_name : str
        Existing directory that receives the file.
    file_name : str
        Name of the file written inside ``folder_name``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60), so an
        unresponsive host cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never saved as if it were CSV data.
    response.raise_for_status()
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        f.write(response.content)

In [3]:
def read_in_csv(folder_name, file_name):
    """Load ``folder_name/file_name`` into a pandas DataFrame.

    The path is assembled with ``os.path.join`` (the same way ``download_csv``
    builds it), so an empty ``folder_name`` resolves to the current directory
    instead of the filesystem root.

    Parameters
    ----------
    folder_name : str
        Directory containing the CSV file.
    file_name : str
        Name of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Contents of the file, parsed with ``low_memory=False``.
    """
    csv_path = os.path.join(folder_name, file_name)
    return pd.read_csv(csv_path, low_memory=False)

In [4]:
def plot_hist(df, value_column, trait_column):
    """Draw a navy histogram of ``df[value_column]`` labelled with the trait name.

    The x-axis label is the first unique entry of ``df[trait_column]``.
    Returns whatever the axes' ``set_xlabel`` call returns.
    """
    x_label = df[trait_column].unique()[0]
    axes = df[value_column].hist(color='navy')
    return axes.set_xlabel(x_label)

In [5]:
def check_for_nulls_duplicates(df):
    """Print the per-column null counts and the tally of duplicated rows of ``df``."""
    null_counts = df.isnull().sum()
    duplicate_tally = df.duplicated().value_counts()
    report = (
        f'Sum of null values:\n{null_counts}\n-----\n'
        f'Value counts for duplicates:\n{duplicate_tally}'
    )
    print(report)

In [6]:
def check_unique_values(df):
    """Print one summary line per column of ``df``.

    Columns with fewer than five distinct values also list those values;
    all other columns report only the count.
    """
    for column_name in df.columns:
        distinct_count = df[column_name].nunique()

        if distinct_count >= 5:
            print(f'{distinct_count} values for {column_name} column')
            continue

        print(f'{distinct_count} unique value(s) for {column_name} column: {df[column_name].unique()}')

In [7]:
def extract_range_column_values(working_df, plot_column):
    """Return a copy of ``working_df`` with integer ``range`` and ``column`` fields.

    The numbers are parsed from the text in ``plot_column``, which is expected
    to contain ``"Range <digits>"`` and ``"Column <digits>"``. The input frame
    is not modified.

    Parameters
    ----------
    working_df : pandas.DataFrame
        Frame holding the plot labels.
    plot_column : str
        Name of the column with the plot label text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``working_df`` with added ``range`` and ``column`` columns.

    Notes
    -----
    A label that lacks either pattern yields a missing value, which the
    integer cast rejects with an error rather than passing through silently.
    """
    new_df = working_df.copy()
    plot_labels = new_df[plot_column].str

    # Raw strings keep ``\d`` a regex token rather than an invalid string escape;
    # expand=False returns a Series for the single capture group.
    new_df['range'] = plot_labels.extract(r"Range (\d+)", expand=False).astype(int)
    new_df['column'] = plot_labels.extract(r"Column (\d+)", expand=False).astype(int)

    return new_df

In [8]:
def convert_datetime_column(working_df, date_column):
    """Return a new frame whose ``date`` column holds ``date_column`` parsed as datetimes.

    The source column is dropped and the parsed values are appended as the
    last column, named ``date``. ``working_df`` itself is left unchanged.
    """
    parsed_dates = pd.to_datetime(working_df[date_column])
    without_source = working_df.drop(columns=date_column)
    return without_source.assign(date=parsed_dates)

In [9]:
def rename_value_column(working_df, value_column, trait_column):
    """Rename ``value_column`` after the trait it holds and drop ``trait_column``.

    The new name is the first unique entry of ``working_df[trait_column]``.
    A new frame is returned; the input is not modified.
    """
    trait_name = working_df[trait_column].unique()[0]
    renamed = working_df.rename(columns={value_column: trait_name})
    return renamed.drop(columns=trait_column)

In [10]:
def reorder_columns(working_df, new_col_order_list):
    """Build a frame from ``working_df`` with columns in ``new_col_order_list`` order.

    Only the listed columns are kept. NOTE(review): the DataFrame constructor
    appears to fill any listed name that is absent from ``working_df`` with
    missing values rather than raising -- confirm the names match the data.
    """
    return pd.DataFrame(working_df, columns=new_col_order_list)

In [11]:
def save_to_csv_with_timestamp(df, name_of_dataset):
    """Write ``df`` to ``data/processed/<name_of_dataset>_<timestamp>.csv``.

    The timestamp is the current local time in ISO format, truncated to whole
    seconds. Colons are stripped from the resulting path so the file name is
    valid on filesystems that forbid them. The target directory is created
    when missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write (the index is not written).
    name_of_dataset : str
        Prefix of the output file name.
    """
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = ('data/processed/' + f'{name_of_dataset}_' + f'{timestamp}.csv').replace(':', '')

    # Nothing else creates data/processed, so make sure it exists before writing.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    df.to_csv(output_filename, index=False)

In [12]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each frame to the path at the same position in the filename list.

    Frames and paths are paired with ``zip``, so pairing stops at the end of
    the shorter list. Indexes are not written.
    """
    for frame, destination in zip(list_of_dfs, list_of_output_filenames):
        frame.to_csv(destination, index=False)

#### A. Aboveground Dry Biomass

In [13]:
# Directory that receives every downloaded input file.
folder_name = 'data'
# exist_ok avoids the check-then-create race and keeps this cell safe to re-run.
os.makedirs(folder_name, exist_ok=True)

In [14]:
# Source URL and local file name for the season 6 aboveground dry biomass data.
aboveground_dry_biomass_s6_url = (
    'https://de.cyverse.org/dl/d/1333BF0F-9462-4F0A-8D35-2B446F0CC989/'
    'season_6_aboveground_dry_biomass_manual.csv'
)
aboveground_dry_biomass_s6_input_filename = 'aboveground_dry_biomass_s6.csv'

In [15]:
# Fetch the aboveground dry biomass CSV into the local data folder.
download_csv(
    url=aboveground_dry_biomass_s6_url,
    folder_name=folder_name,
    file_name=aboveground_dry_biomass_s6_input_filename,
)

In [16]:
# Raw aboveground dry biomass records as downloaded.
adb_0 = read_in_csv(folder_name, aboveground_dry_biomass_s6_input_filename)
# print(adb_0.shape)
# adb_0.head()

In [17]:
# plot_hist(adb_0, 'mean', 'trait')

In [18]:
# check_for_nulls_duplicates(adb_0)

In [19]:
# Parse the field range and column numbers out of each plot label.
adb_1 = extract_range_column_values(working_df=adb_0, plot_column='plot')
# print(adb_1.shape)
# adb_1.sample(n=3)

#### Add Blocking Heights

In [20]:
# Source URL and local file name for the season 6 blocking heights table.
bh_s6_url = (
    'https://de.cyverse.org/dl/d/73900334-1A0F-4C56-8F96-FAC303671431/'
    's6_blocks.csv.txt'
)
bh_s6_input_filename = 'blocking_heights_s6.csv'

In [21]:
# Fetch the blocking heights table into the local data folder.
download_csv(
    url=bh_s6_url,
    folder_name=folder_name,
    file_name=bh_s6_input_filename,
)

In [22]:
# Raw blocking heights table as downloaded.
bh_df = read_in_csv(folder_name, bh_s6_input_filename)
# print(bh_df.shape)
# bh_df.head()

(999, 2)


Unnamed: 0,height_block,plot
0,border,MAC Field Scanner Season 6 Range 2 Column 2
1,border,MAC Field Scanner Season 6 Range 2 Column 3
2,border,MAC Field Scanner Season 6 Range 2 Column 4
3,border,MAC Field Scanner Season 6 Range 2 Column 5
4,border,MAC Field Scanner Season 6 Range 2 Column 6


In [23]:
# bh_df.height_block.value_counts()

In [24]:
# check_for_nulls_duplicates(bh_df)

In [25]:
# Discard rows in which every field is missing.
bh_df_1 = bh_df.dropna(how='all')
# bh_df_1.shape

#### Merge blocking heights with aboveground dry biomass dataframe

In [26]:
# Left join keeps every biomass record, including plots without a height block.
adb_2 = adb_1.merge(bh_df_1, how='left', on='plot')
# print(adb_2.shape)
# adb_2.head(3)

In [27]:
# Parse the date strings into datetimes.
adb_3 = convert_datetime_column(working_df=adb_2, date_column='date')
# print(adb_3.shape)
# adb_3.head(3)

In [28]:
# Name the measurement column after its trait and drop the trait column.
adb_4 = rename_value_column(working_df=adb_3, value_column='mean', trait_column='trait')
# print(adb_4.shape)
# adb_4.tail(3)

In [29]:
# Columns that are not carried into the derived dataset.
cols_to_drop = ['checked', 'author', 'season', 'treatment']

adb_5 = adb_4.drop(columns=cols_to_drop)
# print(adb_5.shape)
# adb_5.head(3)

##### Add units (kg/ha) column to aboveground dry biomass dataset

In [30]:
# assign returns a new frame, so adb_5 is left untouched.
adb_6 = adb_5.assign(units='kg/ha')

# print(adb_6.shape)
# adb_6.tail(3)

In [31]:
# Final column layout for the aboveground dry biomass output.
new_col_order = [
    'date',
    'plot',
    'range',
    'column',
    'scientificname',
    'genotype',
    'height_block',
    'method',
    'aboveground_dry_biomass',
    'units',
    'method_type',
]

adb_7 = reorder_columns(working_df=adb_6, new_col_order_list=new_col_order)
# print(adb_7.shape)
# adb_7.head(3)

#### B. Canopy Height - Sensor

In [32]:
# Source URL and local file name for the season 6 sensor canopy height data.
canopy_height_s6_url = (
    'https://de.cyverse.org/dl/d/D069737A-76F3-4B69-A213-4B8811A357C0/'
    'season_6_canopy_height_sensor.csv'
)
canopy_height_s6_input_filename = 'canopy_height_s6.csv'

In [33]:
# Fetch the canopy height CSV into the local data folder.
download_csv(
    url=canopy_height_s6_url,
    folder_name=folder_name,
    file_name=canopy_height_s6_input_filename,
)

In [34]:
# Raw canopy height records as downloaded.
ch_0 = read_in_csv(folder_name, canopy_height_s6_input_filename)
# print(ch_0.shape)
# ch_0.head()

In [35]:
# check_for_nulls_duplicates(ch_0)

#### Drop duplicates

In [36]:
# Remove fully duplicated rows and renumber the index from zero.
ch_1 = ch_0.drop_duplicates().reset_index(drop=True)
# print(ch_1.shape)
# check_for_nulls_duplicates(ch_1)

In [37]:
# Parse the field range and column numbers out of each plot label.
ch_2 = extract_range_column_values(working_df=ch_1, plot_column='plot')
# print(ch_2.shape)
# ch_2.sample(n=3)

In [38]:
# Parse the date strings into datetimes.
ch_3 = convert_datetime_column(working_df=ch_2, date_column='date')
# print(ch_3.shape)
# ch_3.dtypes

In [39]:
# Name the measurement column after its trait and drop the trait column.
ch_4 = rename_value_column(working_df=ch_3, value_column='mean', trait_column='trait')
# print(ch_4.shape)
# ch_4.tail(3)

In [40]:
# Carry the units (cm) in the column name itself.
# NOTE(review): rename leaves the frame unchanged if no 'canopy_height' column
# exists -- confirm the trait name in the data matches.
ch_5 = ch_4.rename(columns={'canopy_height': 'canopy_height_cm'})
# ch_5.sample(n=3)

#### Add blocking heights

In [41]:
# bh_df_1.head(3)

Unnamed: 0,height_block,plot
0,border,MAC Field Scanner Season 6 Range 2 Column 2
1,border,MAC Field Scanner Season 6 Range 2 Column 3
2,border,MAC Field Scanner Season 6 Range 2 Column 4


In [42]:
# Distinct plot counts: blocking heights table first, then raw canopy height data.
for plot_source in (bh_df_1, ch_0):
    print(plot_source['plot'].nunique())

714
809


There is not a height block provided for every plot, so the final canopy height dataframe will contain some nulls.

In [43]:
# Left join keeps every canopy height record; plots without a height block get nulls.
ch_6 = ch_5.merge(bh_df_1, how='left', on='plot')
# print(ch_6.shape)
# ch_6.tail(3)

In [44]:
# Remove columns that are not carried into the derived dataset.
ch_7 = ch_6.drop(columns=['checked', 'author', 'season', 'treatment'])
# print(ch_7.shape)
# ch_7.tail(3)

In [45]:
# ch_7.isnull().sum()

In [46]:
# Final column layout for the canopy height output.
new_col_order = [
    'date',
    'plot',
    'range',
    'column',
    'scientificname',
    'genotype',
    'method',
    'canopy_height_cm',
    'height_block',
    'method_type',
]

ch_8 = reorder_columns(working_df=ch_7, new_col_order_list=new_col_order)
# print(ch_8.shape)
# ch_8.head(3)

#### C. Write derived data to CSV files

In [47]:
# Each frame is written to the file name at the same position in the second list.
list_of_dfs = [adb_7, ch_8]
list_of_file_output_names = [
    'mac_season_6_aboveground_dry_biomass.csv',
    'mac_season_6_canopy_height_sensor.csv',
]

save_to_csv_without_timestamp(
    list_of_dfs=list_of_dfs,
    list_of_output_filenames=list_of_file_output_names,
)