### Clemson Sorghum Experiments Data Cleaning Notebook
#### Data from [Brenton et al., 2016](https://www.genetics.org/content/204/1/21) collected from Clemson University Pee Dee Research and Education Center in 2014
- goal: to gather more cultivar and environmental data in addition to MAC Sorghum Seasons 4 & 6 and KSU Experiments
- please contact Emily Cain at ejcain@arizona.edu with any questions or feedback

In [1]:
import datetime
import numpy as np
import pandas as pd

#### Read in data queried from betydb in `R` using this code:
```
library(traits)

# Read the API key from the environment rather than embedding it in the notebook.
# Set BETYDB_KEY (for example in ~/.Renviron) before running this snippet.
options(betydb_url = "https://terraref.ncsa.illinois.edu/bety/",
        betydb_api_version = 'v1',
        betydb_key = Sys.getenv("BETYDB_KEY"))

# Query every record whose site name matches "Clemson", with no row limit.
clemson <- betydb_query(sitename  = "~Clemson",
                         limit     =  "none")

write.csv(clemson, file = "clemson_data_2020-06-01.csv")
```

In [4]:
# Move to the project root so the relative `data/...` paths used below resolve.
# NOTE(review): this absolute path is specific to one machine - adjust it before running elsewhere.
%cd '/Users/ejcain/UA-AG/for-data-publication/'


/Users/ejcain/UA-AG/for-data-publication


In [5]:
# Load the raw BETYdb export; the path is relative to the working directory set above.
df_0 = pd.read_csv(
    'data/raw/clemson/clemson_data_2020-06-01.csv'
)
print(df_0.shape)
df_0.head(n=3)

(1657, 39)


Unnamed: 0.1,Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
0,1,0,traits,6005980183,6000000027,6000025503,6000000042,Clemson University Pee Dee Research and Educat...,Florence,34.289,...,,,,,2,PI641862,,,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
1,2,0,traits,6005980184,6000000027,6000025503,6000000042,Clemson University Pee Dee Research and Educat...,Florence,34.289,...,,,,,2,PI641862,,,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
2,3,0,traits,6005980185,6000000027,6000025503,6000000042,Clemson University Pee Dee Research and Educat...,Florence,34.289,...,,,,,2,PI653616,,,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...


In [6]:
# Smallest and largest `raw_date` values in the export, one per line.
print(df_0['raw_date'].min(), df_0['raw_date'].max(), sep='\n')

2014-07-12 19:00:00 -0500
2014-10-15 19:00:00 -0500


In [10]:
# Count the traits recorded at one specific raw timestamp.
df_0.loc[df_0['raw_date'].eq('2014-10-13 19:00:00 -0500'), 'trait'].value_counts()

Series([], Name: trait, dtype: int64)

#### Slice for selected traits
- plant height
- days & GDD to flowering
- aboveground dry biomass
- may use other traits as needed for future models

In [None]:
# List every distinct trait name present in the raw data.
df_0['trait'].unique()

In [None]:
# Keep only the rows for the three selected traits.
df_1 = df_0.loc[
    df_0['trait'].isin(['flowering_time', 'plant_height', 'aboveground_dry_biomass'])
]
print(df_1.shape)
# df_1.tail()

#### Drop & Rename Columns
- rename `mean` to `value`
- convert `raw_date` to new datetime object
- new datetime object will be in `date` column
- drop `raw_date` column

In [None]:
# df_1.columns

In [None]:
# Can drop most columns with only one value

# for col in df_1.columns:
    
#     if df_1[col].nunique() < 5:
#         print(f'Unique values for {col}: {df_1[col].unique()}')

In [None]:
# Columns that are not needed downstream. `date` is removed here and rebuilt
# later from `raw_date`.
cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
    'treatment_id', 'commonname', 'genus', 'species_id', 'cultivar_id', 'month',
    'year', 'dateloc', 'n', 'statname', 'stat', 'notes', 'access_level',
    'entity', 'view_url', 'edit_url', 'date', 'time', 'method_name', 'treatment',
]

df_2 = df_1.drop(columns=cols_to_drop)
print(df_2.shape)
# df_2.head()

#### Convert `raw_date` to datetime object

In [None]:
# df_2.dtypes

In [None]:
# Parse the raw timestamp strings and store the result in a new `date` column.
new_dates = pd.to_datetime(df_2['raw_date'])
df_3 = df_2.assign(date=new_dates)

# Shapes before and after adding the column.
print(df_2.shape, df_3.shape, sep='\n')

# df_3.head(3)

In [None]:
# The `mean` column holds the measured value; give it a clearer name.
df_4 = df_3.rename(columns={'mean': 'value'})
print(df_4.shape)
# df_4.tail(3)

In [None]:
# `raw_date` has been replaced by the parsed `date` column, so remove it.
df_5 = df_4.drop(columns=['raw_date'])
print(df_5.shape)
# df_5.head()

### Add Weather Data
- downloaded from [Daymet](https://daymet.ornl.gov/getdata) and [Climate Engine](http://climateengine.org/data)
- Raw data transformed to interim data using [code](https://github.com/MagicMilly/for-data-publication/tree/main/notebooks) within the `clemson_2014_daily_weather` notebook
- will use temperature values to calculate Growing Degree Days to Flowering

In [None]:
# Load the interim daily weather table for the 2014 season.
weather_0 = pd.read_csv(
    'data/interim/updated_clemson_weather_2014_2020-06-24.csv'
)
print(weather_0.shape)
# weather_0.head()

### A. Days & GDD to Flowering
- planting date: 2014-05-06

In [None]:
# Flowering-time records only.
fl_0 = df_5.loc[df_5['trait'].eq('flowering_time')]
print(fl_0.shape)
# fl_0.head()

#### Add planting date

In [None]:
# Planting date for the trial, repeated on every flowering record.
day_of_planting = datetime.date(2014, 5, 6)
fl_1 = fl_0.assign(date_of_planting=day_of_planting)

print(fl_1.shape)
# fl_1.head(5)

#### Create timedelta using days to flowering

In [None]:
# Turn each days-to-flowering value into a calendar date by adding it to the
# planting date. `float(val)` keeps this working when the `value` column holds
# NumPy integers, which `datetime.timedelta` rejects with a TypeError.
timedelta_values = fl_1['value'].values
dates_of_flowering = [
    day_of_planting + datetime.timedelta(days=float(val))
    for val in timedelta_values
]

# Sanity check: there should be one flowering date per flowering record.
print(fl_1.shape[0])
print(len(dates_of_flowering))

In [None]:
# Attach the computed flowering dates as a new column.
fl_2 = fl_1.assign(date_of_flowering=dates_of_flowering)
print(fl_2.shape)
# fl_2.tail()

#### Merge temperature data from weather dataframe with flowering dataframe

In [None]:
# Only the date and the daily min/max temperatures are needed for GDD.
temp_df = weather_0.loc[:, ['date', 'day_of_year', 'temp_min_c', 'temp_max_c']]
print(temp_df.shape)
# temp_df.head()

#### Add GDD to weather df for seasonal dates

In [None]:
# Keep weather rows from 2014-05-06 onward (the planting date).
temp_df_1 = temp_df[temp_df['date'].ge('2014-05-06')]
print(temp_df_1.shape)
# temp_df_1.head()

In [None]:
# Daily GDD = mean of the daily min and max temperature, minus 10.
# Negative values are not clipped here.
temp_df_2 = temp_df_1.assign(
    daily_gdd=(temp_df_1['temp_min_c'] + temp_df_1['temp_max_c']) / 2 - 10
)
print(temp_df_2.shape)
# temp_df_2.head(10)

In [None]:
# Check for negative daily gdd values
# Can disregard if negative values are on days after harvesting

# temp_df_2.loc[temp_df_2.daily_gdd < 0]

In [None]:
# Uncomment this cell if needed
# Change all negative values to 0

# temp_df_3 = temp_df_2.copy()

# for k,v in temp_df_2.items():
    
#     if k == 'daily_gdd':
#         v[v < 0] = 0

In [None]:
# Should return empty df now

# temp_df_3.loc[temp_df_2.daily_gdd < 0]

In [None]:
# compare to df with negative values

# temp_df_3.loc[temp_df_2.daily_gdd == 0]

In [None]:
# Running total of daily GDD since the first retained date, rounded to whole units.
temp_df_3 = temp_df_2.assign(gdd=np.rint(np.cumsum(temp_df_2['daily_gdd'])))

print(temp_df_3.shape)
# temp_df_3.head()

In [None]:
# Only the date and cumulative GDD are needed for the merge.
temp_df_4 = temp_df_3.loc[:, ['date', 'gdd']]
print(temp_df_4.shape)
# temp_df_4.head()

#### Convert dates to datetime objects
- date of flowering
- date in the weather df

In [None]:
# Convert the flowering dates to pandas datetimes so they can be used as a merge key.
fl_3 = fl_2.assign(date_of_flowering=pd.to_datetime(fl_2['date_of_flowering']))
# fl_3.dtypes

In [None]:
# temp_df_4.dtypes

In [None]:
# Convert the weather dates to pandas datetimes to match the flowering dates.
temp_df_5 = temp_df_4.assign(date=pd.to_datetime(temp_df_4['date']))

In [None]:
# Inspect column dtypes after converting `date` with pd.to_datetime.
temp_df_5.dtypes

In [None]:
# Look up the cumulative GDD on each flowering date. A left join keeps every
# flowering record; both frames have a `date` column, so pandas adds _x/_y suffixes.
fl_4 = pd.merge(
    fl_3,
    temp_df_5,
    how='left',
    left_on='date_of_flowering',
    right_on='date',
)
print(fl_4.shape)
# fl_4.tail()

#### Rename & Drop Columns

In [None]:
# fl_4.columns

In [None]:
# Remove citation metadata and the helper date columns left over from the merge.
cols_to_drop = [
    'scientificname', 'author', 'citation_year', 'trait', 'units',
    'date_of_planting', 'date_x', 'date_y',
]

fl_5 = fl_4.drop(columns=cols_to_drop)
print(fl_5.shape)
# fl_5.head()

In [None]:
# Give the measurement and GDD columns trait-specific names.
fl_6 = fl_5.rename(
    columns={'value': 'days_to_flowering', 'gdd': 'gdd_to_flowering'}
)
print(fl_6.shape)
# fl_6.head()

In [None]:
# fl_6.columns

In [None]:
# Final column selection and order for the flowering output.
new_col_order = [
    'sitename', 'city', 'lat', 'lon', 'trait_description', 'cultivar',
    'days_to_flowering', 'gdd_to_flowering', 'date_of_flowering',
]

fl_7 = fl_6.reindex(columns=new_col_order)
print(fl_7.shape)
fl_7.head(n=3)

#### Write days to flowering df to `.csv`

In [None]:
# Timestamp the output file to the second; colons are stripped from the file name.
timestamp = datetime.datetime.now().isoformat(timespec='seconds')
output_filename = f'data/processed/clemson_days_to_flowering_{timestamp}.csv'
output_filename = output_filename.replace(':', '')

fl_7.to_csv(output_filename, index=False)

### B. Plant Height

In [None]:
# df_5.trait.unique()

In [None]:
# Plant-height records only.
ph_0 = df_5.loc[df_5['trait'].eq('plant_height')]
print(ph_0.shape)
# ph_0.head()

#### Rename, Drop, & Reorder Columns

In [None]:
# Give the measurement column a trait-specific name.
ph_1 = ph_0.rename(columns={'value': 'plant_height_cm'})
# ph_1.tail()

In [None]:
# Remove citation metadata and the now-redundant trait/units columns.
cols_to_drop = ['author', 'citation_year', 'trait', 'units']

ph_2 = ph_1.drop(columns=cols_to_drop)
print(ph_2.shape)
# ph_2.head()

In [None]:
# Final column selection and order for the plant-height output.
new_col_order = [
    'sitename', 'city', 'lat', 'lon', 'scientificname', 'trait_description',
    'cultivar', 'plant_height_cm', 'date',
]

ph_3 = ph_2.reindex(columns=new_col_order)
print(ph_3.shape)
# ph_3.head()

In [None]:
# Index the height records by measurement date.
ph_4 = ph_3.set_index('date')
# ph_4.tail()

In [None]:
# Order the records chronologically (ascending date index).

ph_5 = ph_4.sort_index(ascending=True)
print(ph_5.shape)
ph_5.head(n=3)

#### Write canopy heights to `.csv`

In [None]:
# Timestamp the output file to the second; colons are stripped from the file name.
timestamp = datetime.datetime.now().isoformat(timespec='seconds')
output_filename = f'data/processed/clemson_canopy_heights_{timestamp}.csv'
output_filename = output_filename.replace(':', '')

# The date index is written out as a `date` column.
ph_5.to_csv(output_filename, index=True, index_label='date')

### C. Aboveground dry biomass

In [None]:
# Aboveground dry biomass records only.
adb_0 = df_5.loc[df_5['trait'].eq('aboveground_dry_biomass')]
print(adb_0.shape)
# adb_0.head()

#### Rename, Drop, & Reorder Columns

In [None]:
# Give the measurement column a trait-specific name.
adb_1 = adb_0.rename(columns={'value': 'aboveground_dry_biomass'})
# adb_1.tail()

In [None]:
# Remove citation metadata and trait labels; `units` is kept for this trait.
cols_to_drop = ['author', 'citation_year', 'trait', 'trait_description']

adb_2 = adb_1.drop(columns=cols_to_drop)
print(adb_2.shape)
# adb_2.head()

In [None]:
# Final column selection and order for the biomass output.
new_col_order = [
    'date', 'sitename', 'city', 'lat', 'lon', 'scientificname', 'cultivar',
    'aboveground_dry_biomass', 'units',
]

adb_3 = adb_2.reindex(columns=new_col_order)
print(adb_3.shape)
# adb_3.head()

In [None]:
# Index the biomass records by measurement date.
adb_4 = adb_3.set_index('date')
adb_4.tail(n=3)

#### Write aboveground dry biomass df to `.csv`

In [None]:
# Timestamp the output file to the second; colons are stripped from the file name.
timestamp = datetime.datetime.now().isoformat(timespec='seconds')
output_filename = f'data/processed/clemson_aboveground_dry_biomass_{timestamp}.csv'
output_filename = output_filename.replace(':', '')

# The date index is written out along with the data columns.
adb_4.to_csv(output_filename, index=True)