Please e-mail ejcain@email.arizona.edu or dlebauer@email.arizona.edu with any questions, or create an issue in this GitHub [repository](https://github.com/genophenoenvo/terraref-datasets).

### MAC Season 4 Data Cleaning

#### Selected Traits
- aboveground dry biomass
- canopy height
- days & growing degree days (GDD) to flowering
- days & GDD to flag leaf emergence

In [14]:
import datetime
import numpy as np
import os
import pandas as pd
import requests
import sqlite3

In [9]:
def download_csv(url, folder_name, file_name):
    """Download the resource at ``url`` and save it as ``folder_name/file_name``.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    folder_name : str
        Directory the file is written into; it must already exist.
    file_name : str
        Name of the file created inside ``folder_name``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never written to disk as if it were the dataset.
    """
    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        f.write(response.content)

In [10]:
def read_in_csv(folder_name, file_name):
    """Read ``folder_name/file_name`` into a DataFrame.

    The path is built with ``os.path.join`` so it matches the location that
    ``download_csv`` writes to. ``low_memory=False`` makes pandas read the
    file in one pass instead of in chunks.

    Parameters
    ----------
    folder_name : str
        Directory containing the CSV file.
    file_name : str
        Name of the CSV file inside ``folder_name``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(folder_name, file_name), low_memory=False)

In [15]:
def plot_hist(df, value_column, trait_column):
    """Draw a navy histogram of ``df[value_column]`` labelled with the trait name.

    The x-axis label is the first unique value found in ``df[trait_column]``.
    The return value is whatever ``set_xlabel`` returns for the histogram axes.
    """
    x_label = df[trait_column].unique()[0]
    axes = df[value_column].hist(color='navy')
    return axes.set_xlabel(x_label)

In [16]:
def check_for_nulls_duplicates(df):
    """Print the per-column null counts and the duplicated-row value counts of ``df``."""
    null_counts = df.isnull().sum()
    duplicate_counts = df.duplicated().value_counts()

    report = (
        f'Sum of null values:\n{null_counts}\n-----\n'
        f'Value counts for duplicates:\n{duplicate_counts}'
    )
    print(report)

In [17]:
def check_unique_values(df):
    """Print a one-line summary of the distinct values in every column of ``df``.

    Columns with fewer than five distinct values also list those values;
    the other columns only report the count.
    """
    for name in df.columns:
        series = df[name]
        distinct = series.nunique()

        if distinct >= 5:
            print(f'{distinct} values for {name} column')
            continue

        print(f'{distinct} unique value(s) for {name} column: {series.unique()}')

In [18]:
def extract_range_column_values(working_df, plot_column):
    """Return a copy of ``working_df`` with integer ``range`` and ``column`` columns.

    The numbers are parsed from the text in ``plot_column``, which is expected
    to contain ``"Range <digits>"`` and ``"Column <digits>"``.

    Parameters
    ----------
    working_df : pandas.DataFrame
        Input frame; it is not modified.
    plot_column : str
        Name of the column holding the plot names.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``range`` and ``column`` added (or overwritten).
        A plot name that lacks either token cannot be cast to ``int`` and makes
        the call fail.
    """
    new_df = working_df.copy()
    plot_names = new_df[plot_column]

    # Raw strings: '\d' inside a normal string literal is an invalid escape
    # sequence. expand=False returns a Series for the single capture group.
    new_df['range'] = plot_names.str.extract(r"Range (\d+)", expand=False).astype(int)
    new_df['column'] = plot_names.str.extract(r"Column (\d+)", expand=False).astype(int)

    return new_df

In [19]:
def convert_datetime_column(working_df, date_column):
    """Replace ``date_column`` with a parsed ``date`` column on a new DataFrame.

    The values of ``date_column`` are converted with ``pd.to_datetime``, the
    original column is dropped, and the parsed values are added as ``date``.
    ``working_df`` itself is left untouched.
    """
    parsed = pd.to_datetime(working_df[date_column])

    result = working_df.drop(columns=date_column).copy()
    result['date'] = parsed
    return result

In [20]:
def rename_value_column(working_df, value_column, trait_column):
    """Name the value column after the trait and drop the trait column.

    The new name is the first unique value of ``working_df[trait_column]``.
    A new DataFrame is returned; ``working_df`` is not modified.
    """
    trait_name = working_df[trait_column].unique()[0]
    renamed = working_df.rename(columns={value_column: trait_name})
    return renamed.drop(columns=trait_column)

The blocking-height experiment description for season 4 can be found [here](https://terraref.ncsa.illinois.edu/bety/api/v1/experiments?name=~MAC+Season+4:+All+BAP+With+Late+Season+Drought).


In [21]:
def add_season_4_blocking_height(working_df, range_column):
    """Return a copy of ``working_df`` with a ``blocking_height`` column.

    Each value of ``range_column`` is looked up in the season 4 field layout
    and labelled ``'short'``, ``'medium'``, ``'tall'`` or ``'border'``.

    Parameters
    ----------
    working_df : pandas.DataFrame
        Input frame; it is not modified.
    range_column : str
        Name of the column holding the field range numbers.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``blocking_height`` added.

    Raises
    ------
    ValueError
        If any range value is not part of the layout. Previously such values
        were only printed and then skipped, which made the column assignment
        fail with an unrelated length-mismatch error.
    """
    ranges_by_height = {
        'short': [11, 20, 46, 50],
        'medium': [10, 12, 18, 24, 27, 29, 31, 33, 38, 51],
        'tall': [3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22, 23, 25, 26, 28, 30, 32, 34, 35, 36, 37,
                 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 52],
        'border': [1, 2, 53, 54],
    }
    # Invert to range -> height so each row needs a single dict lookup.
    height_by_range = {
        field_range: height
        for height, field_ranges in ranges_by_height.items()
        for field_range in field_ranges
    }

    range_values = working_df[range_column].tolist()

    # dict.fromkeys de-duplicates while keeping first-seen order for the message.
    unknown = [r for r in dict.fromkeys(range_values) if r not in height_by_range]
    if unknown:
        raise ValueError(f'Error with range value(s) {unknown}: not part of the season 4 layout')

    working_df_1 = working_df.copy()
    working_df_1['blocking_height'] = [height_by_range[r] for r in range_values]

    return working_df_1

In [22]:
def reorder_columns(working_df, new_col_order_list):
    """Build a new DataFrame from ``working_df`` with columns in the given order.

    Only the names in ``new_col_order_list`` are kept. A name that is not a
    column of ``working_df`` becomes a column of missing values instead of
    raising an error.
    """
    return pd.DataFrame(working_df, columns=new_col_order_list)

In [23]:
def check_for_subplots(df, plot_col):
    """Report whether any plot name in ``df[plot_col]`` ends with ``' E'`` or ``' W'``.

    Every name is inspected. The previous version returned from inside the
    first loop iteration, so only the first row was ever checked, and it
    returned ``None`` for an empty column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the plot names.
    plot_col : str
        Name of the column with the plot names (strings).

    Returns
    -------
    str
        ``'This dataset contains subplot designations'`` if at least one name
        carries a suffix, otherwise ``'No subplot designations'``.
    """
    has_subplots = any(
        name.endswith((' E', ' W')) for name in df[plot_col].values
    )

    if has_subplots:
        return 'This dataset contains subplot designations'
    return 'No subplot designations'

In [24]:
def strip_subplots(working_df, plot_col, new_plot_col_name):
    """Return a new DataFrame whose plot names have the ``' E'``/``' W'`` suffix removed.

    ``plot_col`` is dropped and the cleaned names are added as
    ``new_plot_col_name`` (the same name may be reused). Names without a
    suffix are kept as they are. ``working_df`` is not modified.
    """
    cleaned_names = [
        label[:-2] if label.endswith((' E', ' W')) else label
        for label in working_df[plot_col].values
    ]

    result = working_df.drop(columns=plot_col).copy()
    result[new_plot_col_name] = cleaned_names
    return result

In [25]:
def save_to_csv_with_timestamp(df, name_of_dataset, output_dir='data/processed'):
    """Write ``df`` to ``<output_dir>/<name_of_dataset>_<timestamp>.csv``.

    The timestamp is the current local time in ISO format, to the second.
    Colons are removed from the file name so it is valid on every platform.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save; the index is not written.
    name_of_dataset : str
        Prefix of the output file name.
    output_dir : str, optional
        Destination directory. It is created if it does not exist yet.
        Defaults to ``'data/processed'``.
    """
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    file_name = f'{name_of_dataset}_{timestamp}.csv'.replace(':', '')

    # to_csv refuses to write into a missing directory, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, file_name), index=False)

In [26]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each DataFrame to the filename at the same position.

    Frames and filenames are paired in order and written without the index.
    If the two inputs differ in length, only the leading pairs are written
    (as before), but a warning now reports the mismatch instead of dropping
    the extras silently.

    Parameters
    ----------
    list_of_dfs : iterable of pandas.DataFrame
        Frames to save.
    list_of_output_filenames : iterable of str
        Destination paths, one per frame.
    """
    import warnings

    frames = list(list_of_dfs)
    filenames = list(list_of_output_filenames)

    if len(frames) != len(filenames):
        warnings.warn(
            f'save_to_csv_without_timestamp received {len(frames)} DataFrame(s) but '
            f'{len(filenames)} filename(s); only the first '
            f'{min(len(frames), len(filenames))} pair(s) are written',
            stacklevel=2,
        )

    for frame, filename in zip(frames, filenames):
        frame.to_csv(filename, index=False)

#### MAC Season 4

In [4]:
folder_name = 'data'
# exist_ok=True creates the folder only when needed, without a separate
# existence check (and without the gap between checking and creating).
os.makedirs(folder_name, exist_ok=True)

In [3]:
# Source file on CyVerse and the local name it is saved under in `folder_name`.
season_4_url = 'https://de.cyverse.org/dl/d/D3168AC5-82BE-436E-B8B5-AB8DD78CAF28/mac_season_four_2020-04-22.csv'
# The name previously ended in a stray dot ('.csv.'), which hid the extension.
season_4_input_filename = 'raw_mac_season_4_data.csv'

In [13]:
download_csv(season_4_url, folder_name=folder_name, file_name=season_4_input_filename)

In [110]:
df = read_in_csv(folder_name=folder_name, file_name=season_4_input_filename)
# print(df.shape)

In [111]:
cols_to_drop = ['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id', 'treatment_id', 
                'scientificname', 'commonname', 'genus', 'species_id', 'cultivar_id', 'author', 
                'citation_year', 'time', 'month', 'year', 'n', 'statname', 'stat', 'notes', 'access_level', 
                'entity', 'view_url', 'edit_url', 'date', 'dateloc', 'city']

df_1 = df.drop(labels=cols_to_drop, axis=1)
# print(df_1.shape)
# df_1.head(3)

In [112]:
df_2 = extract_range_column_values(df_1, 'sitename')
# print(df_2.shape)
# df_2.sample(n=3)

In [113]:
df_3 = convert_datetime_column(df_2, 'raw_date')
# df_3.head(3)

#### A. Aboveground Dry Biomass

In [114]:
# Build the mask from df_3 itself (the frame being filtered) rather than from
# the raw df, matching how the canopy height and flowering subsets are selected.
adb_0 = df_3.loc[df_3['trait'] == 'aboveground_dry_biomass']
# print(adb_0.shape)
# adb_0.head()

In [115]:
# plot_hist(adb_0, 'mean', 'trait')

In [116]:
# check_for_nulls_duplicates(adb_0)

In [117]:
adb_1 = adb_0.drop_duplicates(ignore_index=True)
# print(adb_1.shape)
# adb_1.head(3)

In [118]:
# check_for_subplots(adb_1, 'sitename')

In [119]:
adb_2 = rename_value_column(adb_1, 'mean', 'trait')
# print(adb_2.shape)
# adb_2.tail()

In [120]:
adb_3 = add_season_4_blocking_height(adb_2, 'range')
# print(adb_3.shape)
# adb_3.sample(n=3)

In [123]:
new_col_order = ['date', 'sitename', 'range', 'column', 'lat', 'lon', 'cultivar', 'aboveground_dry_biomass', 
                 'units', 'method_name', 'treatment', 'blocking_height']

adb_4 = reorder_columns(adb_3, new_col_order)
# print(adb_4.shape)
# adb_4.head(3)

(544, 12)


Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,aboveground_dry_biomass,units,method_name,treatment,blocking_height
0,2017-09-12 00:00:00-05:00,MAC Field Scanner Season 4 Range 30 Column 5,30,5,33.07559,-111.974983,PI152727,22040.0,kg / ha,Whole above ground biomass at harvest,MAC Season 4: BAP water-deficit stress Aug 1-14,tall
1,2017-09-15 00:00:00-05:00,MAC Field Scanner Season 4 Range 30 Column 9,30,9,33.07559,-111.974917,PI329518,12940.0,kg / ha,Whole above ground biomass at harvest,MAC Season 4: BAP water-deficit stress Aug 15-30,tall
2,2017-09-12 00:00:00-05:00,MAC Field Scanner Season 4 Range 2 Column 5,2,5,33.074584,-111.974982,SP1615,37330.0,kg / ha,Whole above ground biomass at harvest,MAC Season 4: BAP water-deficit stress Aug 1-14,border


#### B. Canopy Height - Sensor

In [124]:
ch_0 = df_3.loc[(df_3.trait == 'canopy_height') & (df_3.method_name == '3D scanner to 98th quantile height')]
# print(ch_0.shape)
# ch_0.head()

In [125]:
# check_for_nulls_duplicates(ch_0)

In [126]:
ch_1 = ch_0.drop_duplicates(ignore_index=True)
# print(ch_1.shape)
# check_for_nulls_duplicates(ch_1)

In [127]:
# check_for_subplots(ch_1, 'sitename')

In [128]:
# NOTE(review): `range` and `column` were already extracted from `sitename` on
# the full frame (df_2), so this call recomputes the same two columns for the
# sensor subset.
ch_2 = extract_range_column_values(ch_1, 'sitename')
# print(ch_2.shape)
# ch_2.sample(n=3)

In [129]:
ch_3 = rename_value_column(ch_2, 'mean', 'trait')
# print(ch_3.shape)
# ch_3.tail(3)

In [130]:
ch_4 = add_season_4_blocking_height(ch_3, 'range')
# ch_4.sample(n=3)

In [131]:
new_col_order = ['date', 'sitename', 'range', 'column', 'lat', 'lon', 'cultivar', 'trait_description', 
                 'canopy_height', 'units', 'method_name', 'treatment', 'blocking_height']

ch_5 = reorder_columns(ch_4, new_col_order)
# print(ch_5.shape)
# ch_5.head(3)

(29671, 13)


Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,trait_description,canopy_height,units,method_name,treatment,blocking_height
0,2017-05-01 14:00:00-05:00,MAC Field Scanner Season 4 Range 40 Column 6,40,6,33.075949,-111.974966,PI329841,"top of the general canopy of the plant, discou...",9.0,cm,3D scanner to 98th quantile height,MAC Season 4: BAP water-deficit stress Aug 1-14,tall
1,2017-05-02 14:00:00-05:00,MAC Field Scanner Season 4 Range 40 Column 6,40,6,33.075949,-111.974966,PI329841,"top of the general canopy of the plant, discou...",9.0,cm,3D scanner to 98th quantile height,MAC Season 4: BAP water-deficit stress Aug 1-14,tall
2,2017-05-03 14:00:00-05:00,MAC Field Scanner Season 4 Range 40 Column 6,40,6,33.075949,-111.974966,PI329841,"top of the general canopy of the plant, discou...",9.0,cm,3D scanner to 98th quantile height,MAC Season 4: BAP water-deficit stress Aug 1-14,tall


#### C. Canopy Height - Manual
- using SQLite for `groupby`

In [132]:
chm_0 = df_3.loc[(df_3.trait == 'canopy_height') & (df_3.method_name == 'Manual canopy height')] 
# print(chm_0.shape)
# chm_0.head()

In [133]:
# check_for_nulls_duplicates(chm_0)

In [134]:
# check_for_subplots(chm_0, 'sitename')

In [135]:
chm_1 = strip_subplots(chm_0, 'sitename', 'sitename')
# print(chm_1.shape)
# chm_1.sample(n=3)

In [140]:
chm_2 = chm_1.drop_duplicates(ignore_index=True)
# print(chm_2.shape)
# chm_2.head(3)

In [141]:
chm_3 = add_season_4_blocking_height(chm_2, 'range')
# print(chm_3.shape)
# chm_3.sample(n=3)

#### Use sqlite database to group by `sitename`, `date`, and `mean` 
- rename `mean` to `canopy_height_cm`
- can also drop and reorder columns at this time

In [144]:
conn = sqlite3.connect('data/canopy_heights_manual_season_4.sqlite')
cursor = conn.cursor()
print("Opened database successfully")

Opened database successfully


In [145]:
# if_exists='replace' rebuilds the table on every run, so this cell can be
# re-executed without first commenting it out (the default raises ValueError
# once the table already exists in the database file).
chm_3.to_sql('canopy_heights_manual_season_4.sqlite', conn, if_exists='replace')

In [146]:
# Read the manual canopy heights back from SQLite: one row per
# (sitename, date, mean) group, with the value renamed to canopy_height_cm
# and the rows ordered by date.
# NOTE(review): `mean` is itself a GROUP BY key, so every group holds a single
# distinct value and AVG([mean]) simply returns it (rounded to 2 decimals) —
# the query de-duplicates rather than averages. Confirm this is intended;
# grouping by sitename and date only would average the rows whose E/W
# subplot suffix was stripped.
chm_4 = pd.read_sql_query("""
                            SELECT date, sitename, range, column, lat, lon, cultivar, trait_description, ROUND(AVG([mean]), 2) 
                            AS canopy_height_cm, method_name, treatment, blocking_height
                            FROM 'canopy_heights_manual_season_4.sqlite'
                            GROUP BY sitename, date, [mean]
                            ORDER BY date ASC;
                            """, conn)

# print(chm_4.shape)
# chm_4.head(3)

(3958, 12)


Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,trait_description,canopy_height_cm,method_name,treatment,blocking_height
0,2017-05-25 00:00:00-05:00,MAC Field Scanner Season 4 Range 10 Column 11,10,11,33.074871,-111.97488,PI195754,"top of the general canopy of the plant, discou...",47.0,Manual canopy height,MAC Season 4: BAP water-deficit stress Aug 15-30,medium
1,2017-05-25 00:00:00-05:00,MAC Field Scanner Season 4 Range 10 Column 12,10,12,33.074871,-111.974872,PI329501,"top of the general canopy of the plant, discou...",31.0,Manual canopy height,MAC Season 4: BAP water-deficit stress Aug 15-30,medium
2,2017-05-25 00:00:00-05:00,MAC Field Scanner Season 4 Range 10 Column 4,10,4,33.074871,-111.975003,PI297155,"top of the general canopy of the plant, discou...",30.0,Manual canopy height,MAC Season 4: BAP water-deficit stress Aug 15-30,medium


#### D. Days & GDD to Flowering

In [148]:
fl_0 = df_3.loc[(df_3.trait == 'flowering_time')] 
# print(fl_0.shape)
# fl_0.head()

(163, 13)


Unnamed: 0,sitename,lat,lon,treatment,trait,trait_description,mean,units,cultivar,method_name,range,column,date
4652,MAC Field Scanner Season 4 Range 12 Column 13,33.074943,-111.974851,MAC Season 4: BAP water-deficit stress Aug 15-30,flowering_time,Number of days from sowing to the date when 50...,78.0,days,PI641824,Visual classification of sorghum growth stages...,12,13,2017-07-21 00:00:00-05:00
4653,MAC Field Scanner Season 4 Range 33 Column 13,33.075698,-111.974852,MAC Season 4: BAP water-deficit stress Aug 15-30,flowering_time,Number of days from sowing to the date when 50...,72.0,days,PI170787,Visual classification of sorghum growth stages...,33,13,2017-07-21 00:00:00-05:00
4654,MAC Field Scanner Season 4 Range 38 Column 14,33.075878,-111.974835,MAC Season 4: BAP water-deficit stress Aug 15-30,flowering_time,Number of days from sowing to the date when 50...,56.0,days,PI218112,Visual classification of sorghum growth stages...,38,14,2017-07-21 00:00:00-05:00
14321,MAC Field Scanner Season 4 Range 5 Column 14,33.074691,-111.974835,MAC Season 4: BAP water-deficit stress Aug 15-30,flowering_time,Number of days from sowing to the date when 50...,62.0,days,PI653617,Visual classification of sorghum growth stages...,5,14,2017-07-21 00:00:00-05:00
14322,MAC Field Scanner Season 4 Range 11 Column 9,33.074907,-111.974917,MAC Season 4: BAP water-deficit stress Aug 15-30,flowering_time,Number of days from sowing to the date when 50...,68.0,days,PI52606,Visual classification of sorghum growth stages...,11,9,2017-07-21 00:00:00-05:00


#### Read in updated processed weather dataset for season 4

In [None]:
weather_0 = pd.read_csv('data/processed/mac_season_4_daily_weather_2020-07-01T144735.csv')
print(weather_0.shape)
# weather_0.head()

In [None]:
plot_hist(fl_0, 'mean', 'trait')

In [None]:
check_for_nulls_duplicates(fl_0)

In [None]:
# fl_0 is a slice of df_3, whose plot names are in `sitename` (it has no `plot` column)
check_for_subplots(fl_0, 'sitename')

In [None]:
# check_unique_values(fl_0)

#### Add planting date 2017-04-20

In [None]:
day_of_planting = datetime.date(2017,4,20)
flower_df_1 = fl_0.copy()

flower_df_1['date_of_planting'] = day_of_planting
print(flower_df_1.shape)
# flower_df_1.head(3)

#### Compute the flowering date from the planting date plus days to flowering (`mean`)

In [None]:
# One Timedelta per row, built from the days-to-flowering values in `mean`
timedelta = pd.Series([pd.Timedelta(days=i) for i in flower_df_1['mean'].values])

# Flowering date = planting date + days to flowering
dates_of_flowering = [day_of_planting + offset for offset in timedelta]

# Both counts should match: one flowering date per row
print(flower_df_1.shape[0])
print(len(dates_of_flowering))

In [None]:
flower_df_2 = flower_df_1.copy()
flower_df_2['date_of_flowering'] = dates_of_flowering
print(flower_df_2.shape)
# flower_df_2.head(3)

#### Add GDD to flowering dataframe

In [None]:
# slice weather df for date and cumulative gdd values only

season_4_gdd = weather_0[['date', 'gdd']]
print(season_4_gdd.shape)
# season_4_gdd.head(3)

In [None]:
season_4_gdd.dtypes

In [None]:
flower_df_3 = flower_df_2.copy()
flower_df_3.date_of_flowering = pd.to_datetime(flower_df_3.date_of_flowering)
# flower_df_3.dtypes

In [None]:
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
season_4_gdd_1.dtypes

In [None]:
flower_df_4 = flower_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flowering', right_on='date')
print(flower_df_4.shape)
# flower_df_4.head(3)

In [None]:
# The flowering rows come from df_3, where the plot names are in `sitename`;
# there is no `plot` column, so the previous argument raised KeyError.
flower_df_5 = extract_range_column_values(flower_df_4, 'sitename')
flower_df_6 = add_season_4_blocking_height(flower_df_5, 'range')

print(flower_df_6.shape)
# flower_df_6.tail(3)

In [None]:
flower_df_7 = rename_value_column(flower_df_6, 'mean', 'trait')
# flower_df_7.sample(n=3)

In [None]:
flower_df_8 = flower_df_7.rename({'flowering_time': 'days_to_flowering', 'gdd': 'gdd_to_flowering'}, axis=1)
# flower_df_8.head(2)

In [None]:
# Only columns that exist in the merged flowering frame are listed:
# 'date_x'/'date_y' are the two `date` columns produced by the merge and
# 'date_of_planting' was added earlier in this section. 'checked' and 'author'
# were already removed when df_1 was built and this frame has no 'season'
# column, so naming them made drop() raise KeyError.
cols_to_drop = ['date_x', 'date_of_planting', 'date_y']

flower_df_9 = flower_df_8.drop(labels=cols_to_drop, axis=1)
print(flower_df_9.shape)
# flower_df_9.sample(n=3)

In [None]:
# Use the column names this frame actually has: `sitename`, `cultivar` and
# `method_name` (not `plot`, `genotype`, `method`). Names that do not exist
# would be turned into all-NaN columns by reorder_columns, and the real
# columns would be discarded.
new_col_order = ['sitename', 'range', 'column', 'cultivar', 'treatment', 'blocking_height',
                 'method_name', 'date_of_flowering', 'days_to_flowering', 'gdd_to_flowering']

flower_df_10 = reorder_columns(flower_df_9, new_col_order)
print(flower_df_10.shape)
flower_df_10.head(3)

#### E. Days & GDD to Flag Leaf Emergence

In [None]:
fle_0 = pd.read_csv('data/raw/season_4_traits/season_4_flag_leaf_emergence_time_manual.csv')
print(fle_0.shape)
# fle_0.head()

#### Read in updated processed weather dataset for season 4
Code used to process weather data for season 4 can be found in the `season_4_weather_data_cleaning` notebook in this repository

In [None]:
weather_0 = pd.read_csv('data/processed/mac_season_4_daily_weather_2020-07-01T144735.csv')
print(weather_0.shape)
# weather_0.head()

In [None]:
plot_hist(fle_0, 'mean', 'trait')

In [None]:
check_for_nulls_duplicates(fle_0)

In [None]:
check_for_subplots(fle_0, 'plot')

In [None]:
# check_unique_values(fle_0)

#### Add planting date 2017-04-20

In [None]:
day_of_planting = datetime.date(2017,4,20)
fle_df_1 = fle_0.copy()

fle_df_1['date_of_planting'] = day_of_planting
print(fle_df_1.shape)
# fle_df_1.head(3)

#### Create timedelta using days to flag leaf emergence (`mean`)

In [None]:
# One Timedelta per row, built from the days-to-emergence values in `mean`
timedelta = pd.Series([pd.Timedelta(days=i) for i in fle_df_1['mean'].values])

# Emergence date = planting date + days to flag leaf emergence
dates_of_flag_leaf_emergence = [day_of_planting + offset for offset in timedelta]

# Both counts should match: one emergence date per row
print(fle_df_1.shape[0])
print(len(dates_of_flag_leaf_emergence))

In [None]:
fle_df_2 = fle_df_1.copy()
fle_df_2['date_of_flag_leaf_emergence'] = dates_of_flag_leaf_emergence
print(fle_df_2.shape)
# fle_df_2.head(3)

#### Add GDD values to flag leaf emergence dataframe

In [None]:
# slice weather df for date and cumulative gdd values only

season_4_gdd = weather_0[['date', 'gdd']]
print(season_4_gdd.shape)
# season_4_gdd.head(3)

In [None]:
fle_df_3 = fle_df_2.copy()
fle_df_3.date_of_flag_leaf_emergence = pd.to_datetime(fle_df_3.date_of_flag_leaf_emergence)
# fle_df_3.dtypes

In [None]:
season_4_gdd_1 = season_4_gdd.copy()
season_4_gdd_1['date'] = pd.to_datetime(season_4_gdd_1['date'])
season_4_gdd_1.dtypes

In [None]:
fle_df_4 = fle_df_3.merge(season_4_gdd_1, how='left', left_on='date_of_flag_leaf_emergence', right_on='date')
print(fle_df_4.shape)
# fle_df_4.head(3)

In [None]:
fle_df_5 = extract_range_column_values(fle_df_4, 'plot')
fle_df_6 = add_season_4_blocking_height(fle_df_5, 'range')

print(fle_df_6.shape)
# fle_df_6.tail(3)

In [None]:
fle_df_7 = rename_value_column(fle_df_6, 'mean', 'trait')
# fle_df_7.sample(n=3)

In [None]:
fle_df_8 = fle_df_7.rename({'flag_leaf_emergence_time': 'days_to_flag_leaf_emergence', 'gdd': 'gdd_to_flag_leaf_emergence'}, axis=1)
# fle_df_8.head(2)

In [None]:
cols_to_drop = ['date_x', 'checked', 'author', 'season', 'date_of_planting', 'date_y']

fle_df_9 = fle_df_8.drop(labels=cols_to_drop, axis=1)
print(fle_df_9.shape)
# fle_df_9.sample(n=3)

In [None]:
new_col_order = ['plot', 'range', 'column', 'scientificname', 'genotype', 'treatment', 'blocking_height', 
                 'method', 'date_of_flag_leaf_emergence', 'days_to_flag_leaf_emergence', 
                 'gdd_to_flag_leaf_emergence', 'method_type']

fle_df_10 = reorder_columns(fle_df_9, new_col_order)
print(fle_df_10.shape)
fle_df_10.head(3)

### Save all datasets to separate csv files

In [None]:
# One DataFrame per output filename, in the same order. The flowering and flag
# leaf frames were missing, so their two files were never written (the save
# helper pairs the lists with zip, which stops at the shorter one).
list_of_dfs = [adb_4, ch_5, chm_4, flower_df_10, fle_df_10]
list_of_output_filenames = ['data/processed/mac_season_4_aboveground_dry_biomass.csv',
                           'data/processed/mac_season_4_canopy_height_sensor.csv',
                           'data/processed/mac_season_4_canopy_height_manual.csv',
                           'data/processed/mac_season_4_days_gdd_to_flowering.csv',
                           'data/processed/mac_season_4_days_gdd_to_flag_leaf_emergence.csv']

In [None]:
save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames)