### Citation for Input Trait Data

LeBauer, David et al. (2020), Data From: TERRA-REF, An open reference data set from high resolution genomics, phenomics, and imaging sensors, v6, Dryad, Dataset, https://doi.org/10.5061/dryad.4b8gtht99

##### Environmental weather data can be downloaded from the MAC weather station [website](https://cals.arizona.edu/azmet/06.htm)

Please email ejcain@arizona.edu with any questions or comments, or create an issue in this [repository](https://github.com/MagicMilly/for-data-publication).

### Table of Contents

#### I. Import Python packages

#### II. Functions

#### III. Read in Datasets
  - Aboveground Dry Biomass
  - Canopy Height - Sensor

#### IV. Write derived data to csv files

### MAC Season 6 Data Cleaning

#### Season Dates
- Planting: 2018-04-25
- Last Day of Harvest: 2018-08-01

### I. Import Python packages

In [None]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### II. Functions

In [None]:
def plot_hist(df, value_column, trait_column):
    """Plot a navy histogram of ``value_column`` labelled with the trait name.

    The x-axis label is the first unique entry found in ``trait_column``.
    Returns the object produced by the axes' ``set_xlabel`` call.
    """
    x_label = df[trait_column].unique()[0]
    axes = df[value_column].hist(color='navy')
    return axes.set_xlabel(x_label)

In [None]:
def check_for_nulls_duplicates(df):
    """Print the per-column null counts and the tally of duplicated rows.

    Nothing is returned; the two summaries are printed in one report,
    separated by a dashed line.
    """
    null_counts = df.isnull().sum()
    duplicate_tally = df.duplicated().value_counts()

    report = '\n-----\n'.join([
        f'Sum of null values:\n{null_counts}',
        f'Value counts for duplicates:\n{duplicate_tally}',
    ])
    print(report)

In [None]:
def check_unique_values(df):
    """Print a one-line uniqueness summary for every column of ``df``.

    Columns with fewer than five distinct values also list those values;
    all other columns only report how many distinct values they hold.
    """
    for col in df.columns:
        n_unique = df[col].nunique()

        if n_unique >= 5:
            print(f'{n_unique} values for {col} column')
            continue

        print(f'{n_unique} unique value(s) for {col} column: {df[col].unique()}')

In [None]:
def extract_range_column_values(working_df, plot_column):
    """Return a copy of ``working_df`` with integer ``range`` and ``column`` columns.

    The numbers are parsed from the text in ``plot_column``: the digits that
    follow ``"Range "`` become ``range`` and the digits that follow
    ``"Column "`` become ``column``. The input dataframe is not modified.

    Every plot name must contain both patterns; a name that does not match
    yields a missing value, which the integer conversion rejects.
    """
    new_df = working_df.copy()
    plot_names = new_df[plot_column]

    # Raw strings keep \d a regex digit class instead of an invalid string
    # escape; expand=False returns a Series for the single capture group.
    new_df['range'] = plot_names.str.extract(r"Range (\d+)", expand=False).astype(int)
    new_df['column'] = plot_names.str.extract(r"Column (\d+)", expand=False).astype(int)

    return new_df

In [None]:
def convert_datetime_column(working_df, date_column):
    """Return a new dataframe whose dates are parsed into datetimes.

    ``date_column`` is removed and its parsed values are stored under the
    name ``date``. When ``date_column`` is itself ``'date'`` the parsed
    column therefore ends up as the last column. The input is not modified.
    """
    parsed_dates = pd.to_datetime(working_df[date_column])
    without_raw_dates = working_df.drop(columns=date_column)
    return without_raw_dates.assign(date=parsed_dates)

In [None]:
def rename_value_column(working_df, value_column, trait_column):
    """Rename ``value_column`` after the trait it holds and drop ``trait_column``.

    The new name is the first unique entry of ``trait_column``. A new
    dataframe is returned; the input is left untouched.
    """
    trait_name = working_df[trait_column].unique()[0]

    return (
        working_df
        .rename(columns={value_column: trait_name})
        .drop(columns=trait_column)
    )

In [None]:
def reorder_columns(working_df, new_col_order_list):
    """Return a dataframe holding the columns named in ``new_col_order_list``, in that order.

    Columns of ``working_df`` that are not listed are left out of the result;
    a listed name that ``working_df`` lacks appears as an all-null column.
    """
    return pd.DataFrame(working_df, columns=new_col_order_list)

In [None]:
def check_for_subplots(df, plot_col):
    """Report whether any plot name in ``plot_col`` carries a subplot designation.

    A plot name is treated as a subplot when it ends with ``' E'`` or ``' W'``.
    Every value in the column is examined, so a designation anywhere in the
    column is detected, not just in the first row. Non-string entries (for
    example missing values) are skipped.

    Returns one of two messages: ``'This dataset contains subplot
    designations.'`` or ``'No subplot designations.'`` (also returned for an
    empty column).
    """
    subplot_suffixes = (' E', ' W')

    has_subplots = any(
        isinstance(name, str) and name.endswith(subplot_suffixes)
        for name in df[plot_col].values
    )

    if has_subplots:
        return 'This dataset contains subplot designations.'
    return 'No subplot designations.'

In [None]:
def save_to_csv_with_timestamp(df, name_of_dataset, output_dir='data/processed/'):
    """Write ``df`` to ``<output_dir>/<name_of_dataset>_<timestamp>.csv`` without the index.

    Parameters
    ----------
    df : dataframe to write.
    name_of_dataset : prefix of the output file name.
    output_dir : directory that receives the file; defaults to the
        ``data/processed/`` directory used throughout this notebook. The
        directory must already exist.

    The timestamp is the current local time in ISO format, to the second.
    """
    import os  # local import so the function does not rely on a file-level import

    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()

    # Colons (from the ISO time) are removed from the file name only, so a
    # directory containing a colon is left intact.
    # NOTE(review): presumably done because colons are not valid in file
    # names on some platforms - confirm.
    file_name = f'{name_of_dataset}_{timestamp}.csv'.replace(':', '')
    output_filename = os.path.join(output_dir, file_name)

    df.to_csv(output_filename, index=False)

In [None]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each dataframe to the filename at the same position, without the index.

    Parameters
    ----------
    list_of_dfs : dataframes to write.
    list_of_output_filenames : destination paths, one per dataframe.

    Raises
    ------
    ValueError
        If the two inputs differ in length. The check runs before anything is
        written, so a mismatch never leaves a partial set of files behind.
    """
    frames = list(list_of_dfs)
    filenames = list(list_of_output_filenames)

    # zip() would silently stop at the shorter input and skip the rest.
    if len(frames) != len(filenames):
        raise ValueError(
            f'Got {len(frames)} dataframe(s) but {len(filenames)} output filename(s); '
            'the two lists must have the same length.'
        )

    for frame, filename in zip(frames, filenames):
        frame.to_csv(filename, index=False)

### III. Read in datasets
- MAC season six trait data can be downloaded from Dryad https://doi.org/10.5061/dryad.4b8gtht99
- Each trait - separated by method, if applicable - can be found in its own `.csv` file
- Changes applied to all datasets
    - Extract range and column values to add to dataframe
    - Convert string date column values to datetime objects
    - Rename values column (usually 'mean') to the trait being measured
    - Add blocking heights
- Columns dropped from all datasets
    - `checked` 
    - `author`
    - `season`
    - `treatment`

#### A. Aboveground Dry Biomass

In [None]:
# Load the raw aboveground dry biomass trait file and print its (rows, columns) shape.
adb_0 = pd.read_csv('data/raw/season_6_traits/season_6_aboveground_dry_biomass_manual.csv')
print(adb_0.shape)
# adb_0.head(3)

In [None]:
# Histogram of the 'mean' values, with the x-axis labelled from the 'trait' column.
plot_hist(adb_0, 'mean', 'trait')

In [None]:
# Print null counts per column and the duplicated-row tally for the raw biomass data.
check_for_nulls_duplicates(adb_0)

In [None]:
# Run the subplot-designation check on the 'plot' column of the raw biomass data.
check_for_subplots(adb_0, 'plot')

In [None]:
# check_unique_values(adb_0)

In [None]:
# Add 'range' and 'column' columns parsed from the 'plot' names.
adb_1 = extract_range_column_values(adb_0, 'plot')
print(adb_1.shape)
# adb_1.sample(n=3)

#### Add Blocking Heights

In [None]:
# Load the blocking-height table, print its shape and preview the first three rows.
bh_df = pd.read_csv('data/raw/s6_blocks.csv.txt')
print(bh_df.shape)
bh_df.head(3)

In [None]:
# Count how many rows carry each height_block value.
bh_df.height_block.value_counts()

In [None]:
# Print null counts per column and the duplicated-row tally for the blocking-height table.
check_for_nulls_duplicates(bh_df)

In [None]:
# Drop rows in which every column is null, then show the resulting shape.
bh_df_1 = bh_df.dropna(axis=0, how='all')
bh_df_1.shape

In [None]:
# Re-run the null/duplicate report after dropping the all-null rows.
check_for_nulls_duplicates(bh_df_1)

In [None]:
# Run the subplot-designation check on the 'plot' column of the blocking-height table.
check_for_subplots(bh_df_1, 'plot')

#### Merge blocking heights with aboveground dry biomass dataframe

In [None]:
# Left join on 'plot': every biomass row is kept, and plots absent from
# bh_df_1 receive nulls in the columns that come from the blocking-height table.
adb_2 = adb_1.merge(bh_df_1, how='left', left_on='plot', right_on='plot')
print(adb_2.shape)
# adb_2.head(3)

In [None]:
# Count how many merged biomass rows carry each height_block value.
adb_2.height_block.value_counts()

In [None]:
# Parse the 'date' strings into datetimes; the converted column is re-added as 'date'.
adb_3 = convert_datetime_column(adb_2, 'date')
print(adb_3.shape)
# adb_3.head(3)

In [None]:
# Rename 'mean' to the trait name found in the 'trait' column, then drop 'trait'.
adb_4 = rename_value_column(adb_3, 'mean', 'trait')
print(adb_4.shape)
# adb_4.tail(3)

In [None]:
# Columns removed from every dataset in this notebook (see the section III notes).
cols_to_drop = ['checked', 'author', 'season', 'treatment']

adb_5 = adb_4.drop(labels=cols_to_drop, axis=1)
print(adb_5.shape)
# adb_5.head(3)

##### Add units (kg/ha) column to aboveground dry biomass dataset

In [None]:
# Work on a copy so adb_5 is left unchanged, then tag every row with the unit string.
adb_6 = adb_5.copy()
adb_6['units'] = 'kg/ha'

print(adb_6.shape)
# adb_6.tail(3)

In [None]:
# Final column order for the biomass output; columns not listed here are dropped.
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'height_block', 'method', 
                 'aboveground_dry_biomass', 'units', 'method_type']

adb_7 = reorder_columns(adb_6, new_col_order)
print(adb_7.shape)
adb_7.head(3)

#### B. Canopy Height - Sensor

In [None]:
# Load the raw sensor canopy height trait file and print its (rows, columns) shape.
ch_0 = pd.read_csv('data/raw/season_6_traits/season_6_canopy_height_sensor.csv')
print(ch_0.shape)
# ch_0.head()

In [None]:
# check_unique_values(ch_0)

In [None]:
# Print null counts per column and the duplicated-row tally for the raw canopy height data.
check_for_nulls_duplicates(ch_0)

In [None]:
# Run the subplot-designation check on the 'plot' column of the raw canopy height data.
check_for_subplots(ch_0, 'plot')

#### Drop duplicates

In [None]:
# Remove fully duplicated rows; ignore_index=True renumbers the index from 0.
ch_1 = ch_0.drop_duplicates(ignore_index=True)
print(ch_1.shape)
# check_for_nulls_duplicates(ch_1)

In [None]:
# Add 'range' and 'column' columns parsed from the 'plot' names.
ch_2 = extract_range_column_values(ch_1, 'plot')
print(ch_2.shape)
# ch_2.sample(n=3)

In [None]:
# Parse the 'date' strings into datetimes; the converted column is re-added as 'date'.
ch_3 = convert_datetime_column(ch_2, 'date')
print(ch_3.shape)
# ch_3.dtypes

In [None]:
# Rename 'mean' to the trait name found in the 'trait' column, then drop 'trait'.
ch_4 = rename_value_column(ch_3, 'mean', 'trait')
print(ch_4.shape)
# ch_4.tail(3)

In [None]:
# Add the units (cm) to the column name: 'canopy_height' becomes 'canopy_height_cm'.

ch_5 = ch_4.rename({'canopy_height': 'canopy_height_cm'}, axis=1)
# ch_5.sample(n=3)

#### Add blocking heights

In [None]:
# Preview the cleaned blocking-height table before merging it.
bh_df_1.head(3)

In [None]:
# Number of distinct plots that have a blocking-height entry.
bh_df_1['plot'].nunique()

In [None]:
# Number of distinct plots in the raw canopy height data, for comparison with the count above.
ch_0['plot'].nunique()

There is not a height block provided for every plot, so the final canopy height dataframe will contain some nulls.

In [None]:
# Print the null/duplicate report for the blocking-height table once more before merging.
check_for_nulls_duplicates(bh_df_1)

In [None]:
# Left join on 'plot': every canopy height row is kept, and plots absent from
# bh_df_1 receive nulls in the columns that come from the blocking-height table.
ch_6 = ch_5.merge(bh_df_1, how='left', left_on='plot', right_on='plot')
print(ch_6.shape)
# ch_6.tail(3)

In [None]:
# Drop the same four columns that were removed from the biomass dataset.
ch_7 = ch_6.drop(labels=['checked', 'author', 'season', 'treatment'], axis=1)
print(ch_7.shape)
# ch_7.tail(3)

In [None]:
# ch_7.isnull().sum()

In [None]:
# Final column order for the canopy height output; columns not listed here are dropped.
# Note this rebinds new_col_order, which earlier held the biomass column order.
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'method', 'canopy_height_cm', 
                 'height_block', 'method_type']

ch_8 = reorder_columns(ch_7, new_col_order)
print(ch_8.shape)
ch_8.head(3)

### IV. Write derived data to csv files

In [None]:
# Pair each cleaned dataframe with its output path; the two lists are matched
# by position, so their order must stay in sync.
list_of_dfs = [adb_7, ch_8]
list_of_file_output_names = ['data/processed/mac_season_6_aboveground_dry_biomass.csv',
                            'data/processed/mac_season_6_canopy_height_sensor.csv']

save_to_csv_without_timestamp(list_of_dfs, list_of_file_output_names)