### Citation for Input Trait Data

LeBauer, David et al. (2020), Data From: TERRA-REF, An open reference data set from high resolution genomics, phenomics, and imaging sensors, v6, Dryad, Dataset, https://doi.org/10.5061/dryad.4b8gtht99

##### Environmental weather data 
- downloaded from the MAC weather station [website](https://cals.arizona.edu/azmet/06.htm)
- cleaned using the code in the `season_4_weather_data_cleaning` notebook in this repository

Please email ejcain@arizona.edu with any questions or comments, or create an issue in this [repository](https://github.com/MagicMilly/for-data-publication).

## Table of Contents

### I. Import Python packages

### II. Functions Used

### III. Read in Datasets for cleaning
  - Aboveground Dry Biomass
  - Canopy Height - Sensor
  - Canopy Height - Manual
  - Days & GDD to Flowering
  - Days & GDD to Flag Leaf Emergence

### IV. Save Derived Datasets to csv

### MAC Season 4 Data Cleaning

#### Traits
- aboveground dry biomass
- canopy height
- days & growing degree days (GDD) to flowering
- days & GDD to flag leaf emergence

### I. Import Python packages

In [None]:
import datetime
import numpy as np
import pandas as pd
import sqlite3

### II. Functions

In [None]:
def plot_hist(df, value_column, trait_column):
    """Draw a navy histogram of ``df[value_column]`` labelled with the trait name.

    The x-axis label is the first unique value found in ``df[trait_column]``.
    Returns the result of ``set_xlabel`` on the object produced by ``hist``.
    """
    x_label = df[trait_column].unique()[0]
    axes = df[value_column].hist(color='navy')
    return axes.set_xlabel(x_label)

In [None]:
def check_for_nulls_duplicates(df):
    """Print the per-column null counts and the value counts of duplicated rows."""
    null_counts = df.isnull().sum()
    duplicate_counts = df.duplicated().value_counts()
    report = (
        'Sum of null values:\n'
        f'{null_counts}\n'
        '-----\n'
        'Value counts for duplicates:\n'
        f'{duplicate_counts}'
    )
    print(report)

In [None]:
def check_unique_values(df):
    """Print the number of unique values in each column.

    Columns with fewer than five unique values also have those values listed.
    """
    for col in df.columns:
        column = df[col]
        n_unique = column.nunique()
        if n_unique >= 5:
            print(f'{n_unique} values for {col} column')
            continue
        print(f'{n_unique} unique value(s) for {col} column: {column.unique()}')

In [None]:
def extract_range_column_values(working_df, plot_column):
    """Return a copy of ``working_df`` with integer ``range`` and ``column`` columns.

    The numbers are parsed out of the strings in ``working_df[plot_column]`` by
    matching the literal text ``Range <digits>`` and ``Column <digits>``.
    The input dataframe is not modified.

    A plot name that lacks one of the two patterns yields a missing value,
    which makes the ``astype(int)`` conversion raise.
    """
    new_df = working_df.copy()
    # Raw strings keep ``\d`` a regex token rather than an invalid string escape;
    # expand=False returns a Series, so exactly one column is assigned.
    new_df['range'] = new_df[plot_column].str.extract(r"Range (\d+)", expand=False).astype(int)
    new_df['column'] = new_df[plot_column].str.extract(r"Column (\d+)", expand=False).astype(int)

    return new_df

In [None]:
def convert_datetime_column(working_df, date_column):
    """Return a new dataframe with ``date_column`` replaced by a parsed ``date`` column.

    ``date_column`` is dropped and its values, parsed with ``pd.to_datetime``,
    are stored under the name ``date`` (appended as the last column when the
    remaining frame has no ``date`` column).
    """
    parsed_dates = pd.to_datetime(working_df[date_column])
    without_raw_dates = working_df.drop(columns=date_column)
    return without_raw_dates.assign(date=parsed_dates)

In [None]:
def rename_value_column(working_df, value_column, trait_column):
    """Rename the value column after the trait it holds, then drop the trait column.

    The new name is the first unique entry of ``working_df[trait_column]``.
    A new dataframe is returned.
    """
    trait_name = working_df[trait_column].unique()[0]
    renamed = working_df.rename(columns={value_column: trait_name})
    return renamed.drop(columns=trait_column)

Blocking height experiment description for season 4 can be found [here](https://terraref.ncsa.illinois.edu/bety/api/v1/experiments?name=~MAC+Season+4:+All+BAP+With+Late+Season+Drought)


In [None]:
def add_season_4_blocking_height(working_df, range_column):
    """Return a copy of ``working_df`` with a ``blocking_height`` column added.

    Each value in ``working_df[range_column]`` is looked up in the season 4
    range lists below and labelled 'short', 'medium', 'tall' or 'border'.
    The input dataframe is not modified.

    Raises:
        ValueError: if any range value is not in one of the four lists.
    """
    ranges_by_height = {
        'short': [11, 20, 46, 50],
        'medium': [10, 12, 18, 24, 27, 29, 31, 33, 38, 51],
        'tall': [3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26, 28, 30, 32, 34, 35, 36, 37,
                 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 52],
        'border': [1, 2, 53, 54],
    }
    # Invert to range -> label so each row needs a single dict lookup.
    height_by_range = {
        range_number: height
        for height, range_numbers in ranges_by_height.items()
        for range_number in range_numbers
    }

    range_values = working_df[range_column].tolist()

    # Fail with the offending values named, instead of producing a label list
    # that is shorter than the dataframe.
    unknown = [r for r in dict.fromkeys(range_values) if r not in height_by_range]
    if unknown:
        raise ValueError(f'Unrecognized season 4 range value(s): {unknown}')

    working_df_1 = working_df.copy()
    working_df_1['blocking_height'] = [height_by_range[r] for r in range_values]

    return working_df_1

In [None]:
def reorder_columns(working_df, new_col_order_list):
    """Build a dataframe from ``working_df`` whose columns follow ``new_col_order_list``."""
    return pd.DataFrame(working_df, columns=new_col_order_list)

In [None]:
def check_for_subplots(df, plot_col):
    """Report whether any plot name in ``df[plot_col]`` ends in ' E' or ' W'.

    Every name is inspected, not only the first one. A column with no rows
    is reported as having no subplot designations.

    Returns:
        'This dataset contains subplot designations' if at least one name
        carries such a suffix, otherwise 'No subplot designations'.
    """
    has_subplots = any(name.endswith((' E', ' W')) for name in df[plot_col].values)
    if has_subplots:
        return 'This dataset contains subplot designations'
    return 'No subplot designations'

In [None]:
def strip_subplots(working_df, plot_col, new_plot_col_name):
    """Return a new dataframe whose plot names have a trailing ' E' or ' W' removed.

    ``plot_col`` is dropped and the cleaned names are stored under
    ``new_plot_col_name``. Names without either suffix are kept unchanged.
    """
    suffixes = (' E', ' W')
    stripped_names = [
        name[:-2] if name.endswith(suffixes) else name
        for name in working_df[plot_col].values
    ]

    result = working_df.drop(columns=plot_col).copy()
    result[new_plot_col_name] = stripped_names
    return result

In [None]:
def save_to_csv_with_timestamp(df, name_of_dataset):
    """Write ``df`` to ``data/processed/<name_of_dataset>_<timestamp>.csv`` without the index.

    The timestamp is the current time to the second in ISO format; every ':'
    is removed from the final path before writing.
    """
    stamp = datetime.datetime.now().isoformat(timespec='seconds')
    target = f'data/processed/{name_of_dataset}_{stamp}.csv'.replace(':', '')
    df.to_csv(target, index=False)

In [None]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each dataframe to the filename at the same position, without the index.

    The two lists are paired with ``zip``, so pairing stops at the shorter list.
    """
    pairs = zip(list_of_dfs, list_of_output_filenames)
    for frame, destination in pairs:
        frame.to_csv(destination, index=False)

### III. Read in datasets
- Season four trait data can be downloaded from Dryad https://doi.org/10.5061/dryad.4b8gtht99
- Each trait - separated by method, if applicable - can be found in its own `.csv` file
- Changes applied to all datasets
    - Extract `range` and `column` values to add to dataframe
    - Convert string date column values to datetime objects
    - Rename values column (usually `mean`) to the trait being measured
    - Add `blocking_height` column
- Columns dropped from all datasets
    - `checked` 
    - `author`
    - `season`

#### A. Aboveground Dry Biomass

In [None]:
# Load the raw aboveground dry biomass trait file and report its (rows, columns) shape.
adb_0 = pd.read_csv('data/raw/season_4_traits/season_4_aboveground_dry_biomass_manual.csv')
print(adb_0.shape)
# adb_0.head()

In [None]:
# Histogram of the raw `mean` values, labelled with the first entry of the `trait` column.
plot_hist(adb_0, 'mean', 'trait')

In [None]:
# Print per-column null counts and the duplicated-row tally for the raw data.
check_for_nulls_duplicates(adb_0)

In [None]:
# check_unique_values(adb_0)

In [None]:
# Parse integer `range` and `column` values out of the `plot` strings.
adb_1 = extract_range_column_values(adb_0, 'plot')
print(adb_1.shape)
# adb_1.sample(n=3)

In [None]:
# Parse the `date` strings into datetimes; the helper re-adds `date` as the last column.
adb_2 = convert_datetime_column(adb_1, 'date')
print(adb_2.shape)
# adb_2.head()

In [None]:
# Rename `mean` to the trait name stored in `trait`, then drop the `trait` column.
adb_3 = rename_value_column(adb_2, 'mean', 'trait')
print(adb_3.shape)
# adb_3.tail()

In [None]:
# Columns removed from every season 4 trait dataset.
cols_to_drop = ['checked', 'author', 'season']

adb_4 = adb_3.drop(labels=cols_to_drop, axis=1)
print(adb_4.shape)
# adb_4.head(3)

In [None]:
# Label each row short/medium/tall/border according to its `range` value.
adb_5 = add_season_4_blocking_height(adb_4, 'range')
print(adb_5.shape)
# adb_5.sample(n=3)

##### Add units (kg/ha) column to aboveground dry biomass dataset

In [None]:
# New dataframe with a constant `units` column appended; adb_5 is left untouched.
adb_6 = adb_5.assign(units='kg/ha')

print(adb_6.shape)
# adb_6.tail(3)

In [None]:
# Final column order for the aboveground dry biomass output; the trait value
# column is `aboveground_dry_biomass`.
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'aboveground_dry_biomass', 'units', 'method_type']

adb_7 = reorder_columns(adb_6, new_col_order)
print(adb_7.shape)
adb_7.head(3)

#### B. Canopy Height - Sensor

In [None]:
# Load the raw sensor canopy height trait file and report its (rows, columns) shape.
ch_0 = pd.read_csv('data/raw/season_4_traits/season_4_canopy_height_sensor.csv')
print(ch_0.shape)
# ch_0.head()

In [None]:
# check_unique_values(ch_0)

In [None]:
# Print per-column null counts and the duplicated-row tally for the raw data.
check_for_nulls_duplicates(ch_0)

#### Drop duplicates

In [None]:
# Remove fully duplicated rows, renumber the index from 0, and re-run the
# null/duplicate report on the result.
ch_1 = ch_0.drop_duplicates(ignore_index=True)
print(ch_1.shape)
check_for_nulls_duplicates(ch_1)

In [None]:
# Report whether the `plot` names carry ' E' / ' W' subplot suffixes.
check_for_subplots(ch_1, 'plot')

In [None]:
# Parse integer `range` and `column` values out of the `plot` strings.
ch_2 = extract_range_column_values(ch_1, 'plot')
print(ch_2.shape)
# ch_2.sample(n=3)

In [None]:
# Parse the `date` strings into datetimes; the helper re-adds `date` as the last column.
ch_3 = convert_datetime_column(ch_2, 'date')
print(ch_3.shape)
# ch_3.dtypes

In [None]:
# Rename `mean` to the trait name stored in `trait`, then drop the `trait` column.
ch_4 = rename_value_column(ch_3, 'mean', 'trait')
print(ch_4.shape)
# ch_4.tail(3)

In [None]:
# Label each row short/medium/tall/border according to its `range` value.
ch_5 = add_season_4_blocking_height(ch_4, 'range')
# ch_5.sample(n=3)

In [None]:
# Drop the columns that are removed from every season 4 trait dataset.
ch_6 = ch_5.drop(labels=['checked', 'author', 'season'], axis=1)
print(ch_6.shape)

#### Add units column
- cm

In [None]:
# New dataframe with a constant `units` column appended; ch_6 is left untouched.
ch_7 = ch_6.assign(units='cm')
print(ch_7.shape)
# ch_7.head(3)

In [None]:
# Final column order for the sensor canopy height output; the trait value
# column is `canopy_height`.
new_col_order = ['date', 'plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'canopy_height', 'units', 'method_type']

ch_8 = reorder_columns(ch_7, new_col_order)
print(ch_8.shape)
ch_8.head(3)

#### C. Canopy Height - Manual
- using SQLite for `groupby`

In [None]:
# Load the raw manual canopy height trait file and report its (rows, columns) shape.
chm_0 = pd.read_csv('data/raw/season_4_traits/season_4_canopy_height_manual.csv')
print(chm_0.shape)
# chm_0.head()

In [None]:
# Print per-column null counts and the duplicated-row tally for the raw data.
check_for_nulls_duplicates(chm_0)

In [None]:
# check_unique_values(chm_0)

In [None]:
# Parse integer `range` and `column` values out of the `plot` strings.
chm_1 = extract_range_column_values(chm_0, 'plot')
print(chm_1.shape)
# chm_1.sample(n=3)

In [None]:
# Parse the `date` strings into datetimes; the helper re-adds `date` as the last column.
chm_2 = convert_datetime_column(chm_1, 'date')
print(chm_2.shape)
# chm_2.head()

#### Identify and Remove Subplot Designations

In [None]:
# Report whether the `plot` names carry ' E' / ' W' subplot suffixes before stripping.
check_for_subplots(chm_2, 'plot')

In [None]:
# Remove trailing ' E' / ' W' from the plot names; the cleaned names are stored
# back under `plot`, which the helper re-adds as the last column.
chm_3 = strip_subplots(chm_2, 'plot', 'plot')
print(chm_3.shape)
# chm_3.sample(n=3)

In [None]:
# Re-run the subplot check on the stripped plot names.
check_for_subplots(chm_3, 'plot')

In [None]:
# Count rows that repeat an earlier plot/date/mean/treatment combination
# (True = duplicate of a previous row).

chm_3.duplicated(subset=['plot', 'date', 'mean', 'treatment']).value_counts()

In [None]:
# Drop rows that repeat the same plot, genotype, treatment, mean, range, column
# and date, keeping the first occurrence and renumbering the index from 0.

chm_4 = chm_3.drop_duplicates(ignore_index=True, subset=['plot', 'genotype', 'treatment', 'mean', 'range', 'column',
                                                        'date'])

print(chm_4.shape)
# Whole-row duplicate check on the result (all columns, not only the subset above).
chm_4.duplicated().value_counts()

In [None]:
# Label each row short/medium/tall/border according to its `range` value.
chm_5 = add_season_4_blocking_height(chm_4, 'range')
print(chm_5.shape)
# chm_5.sample(n=3)

#### Use sqlite database to group by `plot`, `date`, and `mean` 
- rename `mean` to `canopy_height_cm`
- can also drop and reorder columns at this time

In [None]:
# Open the SQLite database file used for the canopy height query
# (sqlite3 creates the file if it does not exist yet).
# NOTE(review): `conn` is not closed in any cell visible in this notebook.
conn = sqlite3.connect('data/interim/canopy_heights_manual_season_4.sqlite')
# NOTE(review): `cursor` is not used by any later cell visible in this notebook.
cursor = conn.cursor()
print("Opened database successfully")

In [None]:
# Write the cleaned manual canopy heights into the SQLite database.
# if_exists='replace' makes this cell safe to re-run: with the default
# ('fail'), to_sql raises ValueError once the table already exists.
chm_5.to_sql('canopy_heights_manual_season_4.sqlite', conn, if_exists='replace')

In [None]:
# Read the manual canopy heights back out of SQLite, selecting the output
# columns in their final order and exposing the rounded value as
# `canopy_height_cm`, sorted by date.
# NOTE(review): `[mean]` is part of the GROUP BY, so each group holds a single
# `mean` value and AVG([mean]) returns that value rounded to 2 places --
# confirm that grouping by plot and date only was not the intent.
chm_6 = pd.read_sql_query("""
                            SELECT date, plot, range, column, scientificname, genotype, treatment, blocking_height,
                            method, ROUND(AVG([mean]), 2) AS canopy_height_cm, method_type
                            FROM 'canopy_heights_manual_season_4.sqlite'
                            GROUP BY plot, date,[mean]
                            ORDER BY date ASC;
                            """, conn)

print(chm_6.shape)
chm_6.head(3)

#### D. Days & GDD to Flowering

In [None]:
# Load the raw flowering time trait file and report its (rows, columns) shape.
fl_0 = pd.read_csv('data/raw/season_4_traits/season_4_flowering_time_manual.csv')
print(fl_0.shape)
# fl_0.head()

#### Read in updated processed weather dataset for season 4

In [None]:
# Load the processed season 4 daily weather file (source of the `date` and `gdd` columns used below).
weather_0 = pd.read_csv('data/processed/mac_season_4_daily_weather_2020-07-01T144735.csv')
print(weather_0.shape)
# weather_0.head()

In [None]:
# Histogram of the raw `mean` values, labelled with the first entry of the `trait` column.
plot_hist(fl_0, 'mean', 'trait')

In [None]:
# Print per-column null counts and the duplicated-row tally for the raw data.
check_for_nulls_duplicates(fl_0)

In [None]:
# Report whether the `plot` names carry ' E' / ' W' subplot suffixes.
check_for_subplots(fl_0, 'plot')

In [None]:
# check_unique_values(fl_0)

#### Add planting date 2017-04-20

In [None]:
# Season 4 planting date, stored on every row of a new dataframe (fl_0 is left untouched).
day_of_planting = datetime.date(2017, 4, 20)
flower_df_1 = fl_0.assign(date_of_planting=day_of_planting)

print(flower_df_1.shape)
# flower_df_1.head(3)

#### Compute flowering dates by adding days to flowering (`mean`) to the planting date

In [None]:
# One Timedelta per row, built from its days-to-flowering value in `mean`.
timedelta = pd.Series([pd.Timedelta(days=n_days) for n_days in flower_df_1['mean'].values])

# Offset the planting date by each Timedelta to get that row's flowering date.
dates_of_flowering = [day_of_planting + offset for offset in timedelta]

# Sanity check: one computed date per dataframe row.
print(flower_df_1.shape[0])
print(len(dates_of_flowering))

In [None]:
# Attach the computed flowering dates as a new column on a copy of the dataframe.
flower_df_2 = flower_df_1.copy()
flower_df_2['date_of_flowering'] = dates_of_flowering
print(flower_df_2.shape)
# flower_df_2.head(3)

#### Add GDD to flowering dataframe

In [None]:
# Keep only the `date` and `gdd` columns of the weather data
# (per the original note, `gdd` holds cumulative growing degree days).

season_4_gdd = weather_0[['date', 'gdd']]
print(season_4_gdd.shape)
# season_4_gdd.head(3)

In [None]:
# Inspect the column dtypes before converting `date` for the merge.
season_4_gdd.dtypes

In [None]:
# Convert the flowering dates to pandas datetimes on a copy, so they can be
# matched against the weather dates.
flower_df_3 = flower_df_2.copy()
flower_df_3['date_of_flowering'] = pd.to_datetime(flower_df_3['date_of_flowering'])
# flower_df_3.dtypes

In [None]:
# Convert the weather `date` column to datetimes on a copy, then show the resulting dtypes.
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
season_4_gdd_1.dtypes

In [None]:
# Left-join the weather row whose `date` equals each flowering date; rows with
# no matching weather date keep missing values in the weather columns.
flower_df_4 = flower_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flowering', right_on='date')
print(flower_df_4.shape)
# flower_df_4.head(3)

In [None]:
# Parse `range` / `column` out of the plot names, then label each row with its blocking height.
flower_df_5 = extract_range_column_values(flower_df_4, 'plot')
flower_df_6 = add_season_4_blocking_height(flower_df_5, 'range')

print(flower_df_6.shape)
# flower_df_6.tail(3)

In [None]:
# Rename `mean` to the trait name stored in `trait`, then drop the `trait` column.
flower_df_7 = rename_value_column(flower_df_6, 'mean', 'trait')
# flower_df_7.sample(n=3)

In [None]:
# Give the trait value and the merged GDD value their final output names.
flower_df_8 = flower_df_7.rename({'flowering_time': 'days_to_flowering', 'gdd': 'gdd_to_flowering'}, axis=1)
# flower_df_8.head(2)

In [None]:
# Columns not kept in the output. `date_x` / `date_y` are presumably the two
# `date` columns suffixed by the merge (trait file and weather file) -- confirm.
cols_to_drop = ['date_x', 'checked', 'author', 'season', 'date_of_planting', 'date_y']

flower_df_9 = flower_df_8.drop(labels=cols_to_drop, axis=1)
print(flower_df_9.shape)
# flower_df_9.sample(n=3)

In [None]:
# Final column order for the days/GDD to flowering output.
new_col_order = ['plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'date_of_flowering', 'days_to_flowering', 'gdd_to_flowering', 'method_type']

flower_df_10 = reorder_columns(flower_df_9, new_col_order)
print(flower_df_10.shape)
flower_df_10.head(3)

#### E. Days & GDD to Flag Leaf Emergence

In [None]:
# Load the raw flag leaf emergence time trait file and report its (rows, columns) shape.
fle_0 = pd.read_csv('data/raw/season_4_traits/season_4_flag_leaf_emergence_time_manual.csv')
print(fle_0.shape)
# fle_0.head()

#### Read in updated processed weather dataset for season 4
Code used to process weather data for season 4 can be found in the `season_4_weather_data_cleaning` notebook in this repository

In [None]:
# Load the processed season 4 daily weather file (source of the `date` and `gdd` columns used below).
weather_0 = pd.read_csv('data/processed/mac_season_4_daily_weather_2020-07-01T144735.csv')
print(weather_0.shape)
# weather_0.head()

In [None]:
# Histogram of the raw `mean` values, labelled with the first entry of the `trait` column.
plot_hist(fle_0, 'mean', 'trait')

In [None]:
# Print per-column null counts and the duplicated-row tally for the raw data.
check_for_nulls_duplicates(fle_0)

In [None]:
# Report whether the `plot` names carry ' E' / ' W' subplot suffixes.
check_for_subplots(fle_0, 'plot')

In [None]:
# check_unique_values(fle_0)

#### Add planting date 2017-04-20

In [None]:
# Season 4 planting date, stored on every row of a new dataframe (fle_0 is left untouched).
day_of_planting = datetime.date(2017, 4, 20)
fle_df_1 = fle_0.assign(date_of_planting=day_of_planting)

print(fle_df_1.shape)
# fle_df_1.head(3)

#### Compute flag leaf emergence dates by adding days to flag leaf emergence (`mean`) to the planting date

In [None]:
# One Timedelta per row, built from its days-to-flag-leaf-emergence value in `mean`.
timedelta = pd.Series([pd.Timedelta(days=n_days) for n_days in fle_df_1['mean'].values])

# Offset the planting date by each Timedelta to get that row's emergence date.
dates_of_flag_leaf_emergence = [day_of_planting + offset for offset in timedelta]

# Sanity check: one computed date per dataframe row.
print(fle_df_1.shape[0])
print(len(dates_of_flag_leaf_emergence))

In [None]:
# Attach the computed emergence dates as a new column on a copy of the dataframe.
fle_df_2 = fle_df_1.copy()
fle_df_2['date_of_flag_leaf_emergence'] = dates_of_flag_leaf_emergence
print(fle_df_2.shape)
# fle_df_2.head(3)

#### Add GDD values to flag leaf emergence dataframe

In [None]:
# Keep only the `date` and `gdd` columns of the weather data
# (per the original note, `gdd` holds cumulative growing degree days).

season_4_gdd = weather_0[['date', 'gdd']]
print(season_4_gdd.shape)
# season_4_gdd.head(3)

In [None]:
# Convert the emergence dates to pandas datetimes on a copy, so they can be
# matched against the weather dates.
fle_df_3 = fle_df_2.copy()
fle_df_3['date_of_flag_leaf_emergence'] = pd.to_datetime(fle_df_3['date_of_flag_leaf_emergence'])
# fle_df_3.dtypes

In [None]:
# Convert the weather `date` column to datetimes on a copy, then show the resulting dtypes.
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
season_4_gdd_1.dtypes

In [None]:
# Left-join the weather row whose `date` equals each emergence date; rows with
# no matching weather date keep missing values in the weather columns.
fle_df_4 = fle_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flag_leaf_emergence', right_on='date')
print(fle_df_4.shape)
# fle_df_4.head(3)

In [None]:
# Parse `range` / `column` out of the plot names, then label each row with its blocking height.
fle_df_5 = extract_range_column_values(fle_df_4, 'plot')
fle_df_6 = add_season_4_blocking_height(fle_df_5, 'range')

print(fle_df_6.shape)
# fle_df_6.tail(3)

In [None]:
# Rename `mean` to the trait name stored in `trait`, then drop the `trait` column.
fle_df_7 = rename_value_column(fle_df_6, 'mean', 'trait')
# fle_df_7.sample(n=3)

In [None]:
# Give the trait value and the merged GDD value their final output names.
fle_df_8 = fle_df_7.rename({'flag_leaf_emergence_time': 'days_to_flag_leaf_emergence', 'gdd': 'gdd_to_flag_leaf_emergence'}, axis=1)
# fle_df_8.head(2)

In [None]:
# Columns not kept in the output. `date_x` / `date_y` are presumably the two
# `date` columns suffixed by the merge (trait file and weather file) -- confirm.
cols_to_drop = ['date_x', 'checked', 'author', 'season', 'date_of_planting', 'date_y']

fle_df_9 = fle_df_8.drop(labels=cols_to_drop, axis=1)
print(fle_df_9.shape)
# fle_df_9.sample(n=3)

In [None]:
# Final column order for the days/GDD to flag leaf emergence output.
new_col_order = ['plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'date_of_flag_leaf_emergence', 'days_to_flag_leaf_emergence', 
                 'gdd_to_flag_leaf_emergence', 'method_type']

fle_df_10 = reorder_columns(fle_df_9, new_col_order)
print(fle_df_10.shape)
fle_df_10.head(3)

### IV. Save all datasets to separate csv files

In [None]:
# Final dataframes and their output paths, in matching order: the save helper
# pairs them by position.
list_of_dfs = [adb_7, ch_8, chm_6, flower_df_10, fle_df_10]
list_of_output_filenames = ['data/processed/mac_season_4_aboveground_dry_biomass.csv',
                           'data/processed/mac_season_4_canopy_height_sensor.csv',
                           'data/processed/mac_season_4_canopy_height_manual.csv',
                           'data/processed/mac_season_4_days_gdd_to_flowering.csv',
                           'data/processed/mac_season_4_days_gdd_to_flag_leaf_emergence.csv']

In [None]:
# Write every dataframe to its paired csv path (index omitted).
save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames)