### Clemson Sorghum Experiments Data Cleaning Notebook
#### Data from [Brenton et al., 2016](https://www.genetics.org/content/204/1/21) collected from Clemson University Pee Dee Research and Education Center in 2014
- goal: to gather more cultivar and environmental data in addition to MAC Sorghum Seasons 4 & 6 and KSU Experiments
- please contact David LeBauer at dlebauer@email.arizona.edu or Emily Cain at ejcain@email.arizona.edu with any questions or feedback, or create an issue in this [repository](https://github.com/genophenoenvo/terraref-datasets)

In [1]:
import datetime
import numpy as np
import os
import pandas as pd
import requests

#### Read in data queried from betydb in `R` using this code:
```
library(traits)

options(betydb_url = "https://terraref.ncsa.illinois.edu/bety/",
        betydb_api_version = 'v1',
        betydb_key = Sys.getenv("BETYDB_KEY"))  # read the API key from the environment; never paste it into the notebook
        
clemson <- betydb_query(sitename  = "~Clemson",
                         limit     =  "none")
                      
write.csv(clemson, file = "clemson_data_2020-06-01.csv")
```

In [2]:
def download_csv(url, folder_name, file_name, timeout=60):
    """Download the file at ``url`` and save it as ``folder_name/file_name``.

    Parameters
    ----------
    url : str
        Address of the file to download.
    folder_name : str
        Directory that receives the file; it must already exist.
    file_name : str
        Name of the file to write inside ``folder_name``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error; otherwise the error page would be saved
    # under a .csv name and only surface later as a confusing parse problem.
    response.raise_for_status()
    with open(os.path.join(folder_name, file_name), 'wb') as f:
        f.write(response.content)

In [3]:
def read_in_csv(folder_name, file_name):
    """Read ``folder_name/file_name`` into a pandas DataFrame.

    Parameters
    ----------
    folder_name : str
        Directory containing the CSV file.
    file_name : str
        Name of the CSV file inside ``folder_name``.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV file.
    """
    # Build the path with os.path.join, the same way download_csv writes it.
    csv_path = os.path.join(folder_name, file_name)
    # low_memory=False parses the file in one pass instead of in chunks.
    return pd.read_csv(csv_path, low_memory=False)

In [4]:
def save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames):
    """Write each DataFrame to the output filename at the same position.

    Parameters
    ----------
    list_of_dfs : iterable of pandas.DataFrame
        DataFrames to save.
    list_of_output_filenames : iterable of str
        Destination paths, one per DataFrame, in the same order.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of items.
    """
    frames = list(list_of_dfs)
    filenames = list(list_of_output_filenames)

    # zip() stops at the shorter input, which would silently skip outputs;
    # check the pairing before anything is written.
    if len(frames) != len(filenames):
        raise ValueError(
            'Expected one output filename per dataframe, got '
            f'{len(frames)} dataframes and {len(filenames)} filenames.'
        )

    for frame, filename in zip(frames, filenames):
        # index=False keeps the row index out of the saved file.
        frame.to_csv(filename, index=False)

In [5]:
# Directory that receives the downloaded input files.
folder_name = 'data'
# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs(folder_name, exist_ok=True)

In [6]:
# Local file name used for the downloaded trait data.
clemson_input_filename = 'clemson_trait_data.csv'
# CyVerse download link for the trait CSV.
clemson_url = (
    'https://de.cyverse.org/dl/d/'
    'D946B9A0-870C-4071-847C-F6377E61D30B/clemson_data_2020-06-01.csv'
)

In [7]:
# Fetch the trait CSV into the local data folder.
download_csv(clemson_url, folder_name, clemson_input_filename)

In [8]:
# Raw trait records exactly as downloaded; later cells derive new frames from it.
df_0 = read_in_csv(folder_name, clemson_input_filename)

#### Slice for selected traits
- plant height
- days & GDD to flowering
- aboveground dry biomass

In [9]:
# List the distinct trait names present in the raw data.
df_0['trait'].unique()

array(['ndf', 'adf', 'flowering_time', 'plant_height',
       'aboveground_dry_biomass'], dtype=object)

In [10]:
# Keep only the rows for the three traits processed in the sections below.
selected_traits = ['flowering_time', 'plant_height', 'aboveground_dry_biomass']
df_1 = df_0.loc[df_0['trait'].isin(selected_traits)]

#### Drop & Rename Columns
- rename `mean` to `value`
- convert `raw_date` to new datetime object
- new datetime object will be in `date` column
- drop `raw_date` column

In [11]:
# Columns removed from the trait records. 'date' is among them; a parsed
# replacement is rebuilt from `raw_date` in the next step.
cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
    'treatment_id', 'commonname', 'genus', 'species_id', 'cultivar_id',
    'month', 'year', 'dateloc', 'n', 'statname', 'stat', 'notes',
    'access_level', 'entity', 'view_url', 'edit_url', 'date', 'time',
    'method_name', 'treatment',
]

df_2 = df_1.drop(columns=cols_to_drop)
print(df_2.shape)

(931, 13)


#### Convert `raw_date` to datetime object

In [12]:
# Parse the raw date strings, then attach them as a new `date` column on a
# copy so that df_2 stays untouched.
new_dates = pd.to_datetime(df_2['raw_date'])
df_3 = df_2.assign(date=new_dates)

In [13]:
# The `mean` column is called `value` from here on.
df_4 = df_3.rename(columns={'mean': 'value'})

In [14]:
# `raw_date` is no longer needed now that the parsed `date` column exists.
df_5 = df_4.drop(columns=['raw_date'])

### Add Weather Data
- downloaded from [Daymet](https://daymet.ornl.gov/getdata) and [Climate Engine](http://climateengine.org/data)

In [15]:
# Local file name used for the downloaded weather data.
clemson_weather_input_filename = 'clemson_weather_data.csv'
# CyVerse download link for the weather CSV.
clemson_weather_url = (
    'https://de.cyverse.org/dl/d/'
    '1EB28C81-10A1-4E1B-A406-1D0C6A20AF2D/clemson_weather.csv'
)

In [16]:
# Fetch the weather CSV into the local data folder.
download_csv(clemson_weather_url, folder_name, clemson_weather_input_filename)

In [17]:
# Weather table as downloaded; show its shape and first rows.
weather_0 = read_in_csv(folder_name, clemson_weather_input_filename)
print(weather_0.shape)
weather_0.head()

(163, 14)


Unnamed: 0,date,day_of_year,temp_min,temp_max,temp_mean,gdd,rh_min,rh_max,rh_mean,vpd_mean,precip,precip_cumulative,first_water_deficit_treatment,second_water_deficit_treatment
0,2014-05-06,126,15.0,32.5,23.75,14.0,22.64,79.11,50.87,1.47,0,0,False,False
1,2014-05-07,127,14.5,33.0,23.75,28.0,22.29,73.16,47.73,1.57,0,0,False,False
2,2014-05-08,128,15.5,33.5,24.5,42.0,24.37,78.09,51.23,1.53,0,0,False,False
3,2014-05-09,129,16.5,33.5,25.0,57.0,28.85,70.5,49.67,1.63,0,0,False,False
4,2014-05-10,130,19.0,32.5,25.75,73.0,35.96,79.65,57.81,1.43,0,0,False,False


### A. Days & GDD to Flowering
- planting date: 2014-05-06

In [18]:
# Flowering-time records only.
is_flowering = df_5['trait'].eq('flowering_time')
fl_0 = df_5.loc[is_flowering]
print(fl_0.shape)

(168, 13)


#### Add planting date

In [19]:
# Planting date, applied to every flowering record.
day_of_planting = datetime.date(year=2014, month=5, day=6)

# assign() returns a new frame, so fl_0 is left unchanged.
fl_1 = fl_0.assign(date_of_planting=day_of_planting)

#### Compute date of flowering by adding days to flowering (as a timedelta) to the planting date

In [20]:
# `value` holds the flowering-time measurement, used here as a number of days
# after planting.
timedelta_values = fl_1['value'].values

# One flowering date per record, in row order. float() is applied because
# datetime.timedelta rejects numpy integer scalars, which is what `.values`
# yields if the column is ever read as an integer dtype.
dates_of_flowering = [
    day_of_planting + datetime.timedelta(days=float(val))
    for val in timedelta_values
]

In [21]:
# Attach the computed flowering dates as a new column on a copy of fl_1.
fl_2 = fl_1.assign(date_of_flowering=dates_of_flowering)

#### Merge temperature data from weather dataframe with flowering dataframe

In [22]:
# Weather columns carried into the flowering table.
temp_cols = ['date', 'day_of_year', 'temp_min', 'temp_max', 'gdd']
temp_df = weather_0[temp_cols]

#### Convert dates to datetime objects
- date of flowering
- date in the weather df

In [23]:
# Convert the flowering dates to pandas datetimes so they can be used as the
# merge key against the weather dates.
fl_3 = fl_2.assign(date_of_flowering=pd.to_datetime(fl_2['date_of_flowering']))

In [24]:
# Same conversion for the weather dates, on a copy of the weather slice.
temp_df_1 = temp_df.assign(date=pd.to_datetime(temp_df['date']))

In [25]:
# Left join: every flowering record is kept and picks up the weather row whose
# date equals its flowering date.
fl_4 = pd.merge(
    fl_3,
    temp_df_1,
    how='left',
    left_on='date_of_flowering',
    right_on='date',
)

#### Rename & Drop Columns

In [26]:
# Columns not kept in the flowering output. `date_x` / `date_y` are the two
# `date` columns that the merge suffixed.
cols_to_drop = [
    'scientificname', 'author', 'citation_year', 'trait', 'units',
    'date_of_planting', 'date_x', 'date_y', 'temp_min', 'temp_max',
]

fl_5 = fl_4.drop(columns=cols_to_drop)

In [27]:
# Give the measurement and the merged GDD column flowering-specific names.
flowering_names = {'value': 'days_to_flowering', 'gdd': 'gdd_to_flowering'}
fl_6 = fl_5.rename(columns=flowering_names)

In [28]:
# Column selection and order for the flowering output file.
new_col_order = [
    'sitename', 'city', 'lat', 'lon',
    'trait_description', 'cultivar',
    'days_to_flowering', 'gdd_to_flowering', 'date_of_flowering',
]

# Building a frame from fl_6 with an explicit column list keeps just these
# columns, in this order.
fl_7 = pd.DataFrame(fl_6, columns=new_col_order)

### B. Plant Height

In [29]:
# Plant-height records only.
is_plant_height = df_5['trait'].eq('plant_height')
ph_0 = df_5.loc[is_plant_height]

#### Rename, Drop, & Reorder Columns

In [30]:
# Name the measurement column after the trait.
ph_1 = ph_0.rename(columns={'value': 'plant_height_cm'})

In [31]:
# Columns not kept in the plant-height output.
cols_to_drop = ['author', 'citation_year', 'trait', 'units']
ph_2 = ph_1.drop(columns=cols_to_drop)

In [32]:
# Column selection and order for the plant-height output file.
new_col_order = [
    'sitename', 'city', 'lat', 'lon',
    'scientificname', 'trait_description', 'cultivar',
    'plant_height_cm', 'date',
]

ph_3 = pd.DataFrame(ph_2, columns=new_col_order)

### C. Aboveground dry biomass

In [33]:
# Aboveground-dry-biomass records only.
is_biomass = df_5['trait'].eq('aboveground_dry_biomass')
adb_0 = df_5.loc[is_biomass]

#### Rename, Drop, & Reorder Columns

In [34]:
# Name the measurement column after the trait.
adb_1 = adb_0.rename(columns={'value': 'aboveground_dry_biomass'})

In [35]:
# Columns not kept in the biomass output; `units` stays in this table.
cols_to_drop = ['author', 'citation_year', 'trait', 'trait_description']
adb_2 = adb_1.drop(columns=cols_to_drop)

In [36]:
# Column selection and order for the biomass output file.
new_col_order = [
    'date', 'sitename', 'city', 'lat', 'lon',
    'scientificname', 'cultivar',
    'aboveground_dry_biomass', 'units',
]

adb_3 = pd.DataFrame(adb_2, columns=new_col_order)
print(adb_3.shape)
adb_3.head()

(382, 9)


Unnamed: 0,date,sitename,city,lat,lon,scientificname,cultivar,aboveground_dry_biomass,units
943,2014-09-02 19:00:00-05:00,Clemson University Pee Dee Research and Educat...,Florence,34.289,-79.737,Sorghum bicolor,PI197542,5380.0,kg / ha
944,2014-08-30 19:00:00-05:00,Clemson University Pee Dee Research and Educat...,Florence,34.289,-79.737,Sorghum bicolor,PI365512,6090.0,kg / ha
945,2014-08-23 19:00:00-05:00,Clemson University Pee Dee Research and Educat...,Florence,34.289,-79.737,Sorghum bicolor,PI533998,6090.0,kg / ha
946,2014-09-16 19:00:00-05:00,Clemson University Pee Dee Research and Educat...,Florence,34.289,-79.737,Sorghum bicolor,PI533792,6990.0,kg / ha
947,2014-09-17 19:00:00-05:00,Clemson University Pee Dee Research and Educat...,Florence,34.289,-79.737,Sorghum bicolor,PI655978,6990.0,kg / ha


In [37]:
# Final tables and their output files, paired by position.
list_of_dfs = [fl_7, ph_3, adb_3]
list_of_output_filenames = [
    'clemson_days_gdd_to_flowering.csv',
    'clemson_plant_height.csv',
    'clemson_aboveground_dry_biomass.csv',
]

save_to_csv_without_timestamp(list_of_dfs, list_of_output_filenames)