### Kansas Sorghum Experiments Data Cleaning Notebook
#### Data from Danforth Plant Center
- goal: to gather more data for cultivars from MAC Sorghum Seasons 4 & 6
- please contact Emily Cain at ejcain@arizona.edu with any questions or feedback

In [38]:
import pandas as pd

#### A. Read in data queried from betydb in `R` using this code:
```
library(traits)

# Read the BETYdb API key from the environment instead of hardcoding it, so the
# credential is never stored in the notebook. Set BETYDB_KEY beforehand
# (for example in ~/.Renviron); Sys.getenv() returns "" when it is unset.
options(betydb_url = "https://terraref.ncsa.illinois.edu/bety/",
        betydb_api_version = 'v1',
        betydb_key = Sys.getenv("BETYDB_KEY"))

# Query records for experiments matching "~KSU", with no row limit.
kansas <- betydb_query(experiment  = "~KSU",
                         limit     =  "none")

# NOTE(review): the notebook below reads `data/ksu_experiments_2020-03-24.csv`;
# confirm whether this export was renamed/moved after being written.
write.csv(kansas, file = "kansas_experiments_2020-03-24.csv")
```

In [39]:
# Load the full BETYdb export. low_memory=False parses the file in a single
# pass so column dtypes are inferred consistently.
df_0 = pd.read_csv(
    filepath_or_buffer='data/ksu_experiments_2020-03-24.csv',
    low_memory=False,
)

# Report (rows, columns) of the raw export.
print(df_0.shape)
# Optional preview of the first rows:
# df_0.head(3)

(1552266, 39)


Unnamed: 0.1,Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
0,1,0,traits,6004546896,6000000000.0,6000005848,6000000000.0,MAC Field Scanner Season 4 Range 38 Column 14,Maricopa,33.075878,...,,,,,2,PI218112,,3D scanner to leaf angle distribution,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
1,2,0,traits,6004555749,6000000000.0,6000005848,6000000000.0,MAC Field Scanner Season 4 Range 38 Column 14,Maricopa,33.075878,...,,,,,2,PI218112,,3D scanner to leaf angle distribution,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
2,3,0,traits,6004555750,6000000000.0,6000005848,6000000000.0,MAC Field Scanner Season 4 Range 38 Column 14,Maricopa,33.075878,...,,,,,2,PI218112,,3D scanner to leaf angle distribution,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...


#### B. Slice dataframe to only include sitenames that start with `Danforth`

In [40]:
# Keep only rows whose site name begins with 'Danforth'.
# na=False treats a missing sitename as "no match"; without it the mask would
# contain NaN and `.loc` would raise on the non-boolean indexer.
is_danforth_site = df_0['sitename'].str.startswith('Danforth', na=False)
danforth_sites = df_0.loc[is_danforth_site]

# Compare the shape before and after filtering.
print(df_0.shape)
print(danforth_sites.shape)

(1552266, 39)
(4187, 39)


In [41]:
# Earliest and latest raw timestamps among the Danforth rows.
print(danforth_sites['raw_date'].min())
print(danforth_sites['raw_date'].max())

2014-06-03 13:11:14 -0500
2014-06-23 18:08:13 -0500


In [42]:
# Distinct site names present in the Danforth subset.
danforth_sites['sitename'].unique()

array(['Danforth Plant Science Center Bellweather Phenotyping Facility'],
      dtype=object)

#### C. Check traits and cultivars that were observed 

In [43]:
# Distinct traits recorded in the Danforth subset.
danforth_sites['trait'].unique()

array(['sv_area', 'tv_area', 'hull_area', 'solidity', 'plant_height',
       'perimeter'], dtype=object)

In [44]:
# Distinct cultivars recorded in the Danforth subset.
danforth_sites['cultivar'].unique()

array(['BTx642', 'Tx430', 'PI564163', 'TX7000'], dtype=object)

#### D. Slice dataframe for `plant_height` and cultivar `PI564163`, as this is the only trait / cultivar combination also found in MAC Sorghum Seasons 4 & 6

In [45]:
# Build one boolean mask per criterion, then keep rows satisfying both.
is_plant_height = danforth_sites['trait'].eq('plant_height')
is_target_cultivar = danforth_sites['cultivar'].eq('PI564163')

danforth_plant_height_0 = danforth_sites.loc[is_plant_height & is_target_cultivar]
print(danforth_plant_height_0.shape)
# Optional preview:
# danforth_plant_height_0.head(3)

(186, 39)


Unnamed: 0.1,Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
115023,115024,0,traits,6000075489,6000000000.0,6000000866,6000000000.0,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,...,,,,,3,PI564163,Fr001AD006725,PlantCV,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
144646,144647,1,traits,6000008501,6000000000.0,6000000866,6000000000.0,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,...,,,,,3,PI564163,Fr001AC006701,PlantCV,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
144653,144654,1,traits,6000008639,6000000000.0,6000000866,6000000000.0,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,...,,,,,3,PI564163,Fr001AB006685,PlantCV,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...


#### E. Drop columns not needed at this time & rename `mean` column

In [59]:
# List every column name so the ones to drop can be chosen in the next cell.
danforth_plant_height_0.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [62]:
# Columns to discard. `raw_date` is kept while the existing `date` column is
# dropped, because `date` is re-derived from `raw_date` in a later step.
cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id', 'treatment_id',
    'scientificname', 'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
    'citation_year', 'time', 'month', 'year', 'dateloc', 'n', 'statname', 'stat', 'notes',
    'access_level', 'entity', 'view_url', 'edit_url', 'date',
]

# Remove the unwanted columns and renumber the rows from 0.
danforth_plant_height_1 = (
    danforth_plant_height_0
    .drop(columns=cols_to_drop)
    .reset_index(drop=True)
)
print(danforth_plant_height_1.shape)
# Optional preview:
# danforth_plant_height_1.tail(3)

(186, 12)


Unnamed: 0,sitename,city,lat,lon,treatment,raw_date,trait,trait_description,mean,units,cultivar,method_name
183,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,60%: 130.2 ml water (27.3% VWC),2014-06-04 14:26:11 -0500,plant_height,Maximum vertical height from the base of the p...,57.0,cm,PI564163,PlantCV
184,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,80%: 173.6 ml water (37.5% VWC),2014-06-08 14:50:30 -0500,plant_height,Maximum vertical height from the base of the p...,444.0,cm,PI564163,PlantCV
185,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,60%: 130.2 ml water (27.3% VWC),2014-06-14 16:04:04 -0500,plant_height,Maximum vertical height from the base of the p...,671.0,cm,PI564163,PlantCV


In [78]:
# Relabel the `mean` column as `value`; every other column keeps its name.
danforth_plant_height_2 = danforth_plant_height_1.rename(columns={'mean': 'value'})
# Optional preview:
# danforth_plant_height_2.head(3)

#### F. Convert `raw_date` to datetime objects & store the result in a new `date` column

In [80]:
# Inspect column dtypes to see how `raw_date` is currently stored before converting it.
danforth_plant_height_2.dtypes

sitename              object
city                  object
lat                  float64
lon                  float64
treatment             object
raw_date              object
trait                 object
trait_description     object
value                float64
units                 object
cultivar              object
method_name           object
dtype: object

In [81]:
# Parse the raw timestamp strings and attach them as a new `date` column on a
# fresh frame, leaving danforth_plant_height_2 unchanged.
parsed_dates = pd.to_datetime(danforth_plant_height_2['raw_date'])
danforth_plant_height_3 = danforth_plant_height_2.assign(date=parsed_dates)

# Sanity check: the two counts printed below should match.
print(len(danforth_plant_height_3['date']))
print(danforth_plant_height_2.shape[0])

186
186


In [82]:
# Preview the first rows to confirm the new `date` column was parsed.
# (Restored from a commented-out line so a fresh run reproduces the table
# shown beneath this cell.)
danforth_plant_height_3.head()

Unnamed: 0,sitename,city,lat,lon,treatment,raw_date,trait,trait_description,value,units,cultivar,method_name,date
0,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,40%: 86.8 ml water (17.2% VWC),2014-06-03 15:27:29 -0500,plant_height,Maximum vertical height from the base of the p...,32.0,cm,PI564163,PlantCV,2014-06-03 15:27:29-05:00
1,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,60%: 130.2 ml water (27.3% VWC),2014-06-16 14:26:55 -0500,plant_height,Maximum vertical height from the base of the p...,569.0,cm,PI564163,PlantCV,2014-06-16 14:26:55-05:00
2,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,80%: 173.6 ml water (37.5% VWC),2014-06-09 16:44:21 -0500,plant_height,Maximum vertical height from the base of the p...,360.0,cm,PI564163,PlantCV,2014-06-09 16:44:21-05:00
3,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,100%: 217 ml water (47.6% VWC),2014-06-07 14:18:42 -0500,plant_height,Maximum vertical height from the base of the p...,140.0,cm,PI564163,PlantCV,2014-06-07 14:18:42-05:00
4,Danforth Plant Science Center Bellweather Phen...,St. Louis,38.674826,-90.396971,100%: 217 ml water (47.6% VWC),2014-06-10 16:17:08 -0500,plant_height,Maximum vertical height from the base of the p...,288.0,cm,PI564163,PlantCV,2014-06-10 16:17:08-05:00


#### G. Drop `raw_date` and reorder columns

In [84]:
# Target column layout. Columns not listed here (`raw_date`, `city`) are left out.
new_col_order = [
    'sitename', 'cultivar', 'lat', 'lon', 'date',
    'trait', 'trait_description', 'value', 'units',
    'method_name', 'treatment',
]

# Select and order the columns according to the list above.
danforth_plant_height_4 = danforth_plant_height_3.reindex(columns=new_col_order)
print(danforth_plant_height_4.shape)
# Optional preview:
# danforth_plant_height_4.tail(3)

(186, 11)


Unnamed: 0,sitename,cultivar,lat,lon,date,trait,trait_description,value,units,method_name,treatment
183,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-04 14:26:11-05:00,plant_height,Maximum vertical height from the base of the p...,57.0,cm,PlantCV,60%: 130.2 ml water (27.3% VWC)
184,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-08 14:50:30-05:00,plant_height,Maximum vertical height from the base of the p...,444.0,cm,PlantCV,80%: 173.6 ml water (37.5% VWC)
185,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-14 16:04:04-05:00,plant_height,Maximum vertical height from the base of the p...,671.0,cm,PlantCV,60%: 130.2 ml water (27.3% VWC)


#### H. Sort dataframe by `date`

In [85]:
# Order rows chronologically (ascending is the default), then renumber the index from 0.
chronological = danforth_plant_height_4.sort_values('date')
danforth_plant_height_5 = chronological.reset_index(drop=True)
# Optional preview:
# danforth_plant_height_5.head()

Unnamed: 0,sitename,cultivar,lat,lon,date,trait,trait_description,value,units,method_name,treatment
0,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-03 13:11:14-05:00,plant_height,Maximum vertical height from the base of the p...,79.0,cm,PlantCV,100%: 217 ml water (47.6% VWC)
1,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-03 14:20:49-05:00,plant_height,Maximum vertical height from the base of the p...,73.0,cm,PlantCV,60%: 130.2 ml water (27.3% VWC)
2,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-03 14:20:49-05:00,plant_height,Maximum vertical height from the base of the p...,73.0,cm,PlantCV,60%: 130.2 ml water (27.3% VWC)
3,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-03 14:23:38-05:00,plant_height,Maximum vertical height from the base of the p...,53.0,cm,PlantCV,100%: 217 ml water (47.6% VWC)
4,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-03 14:23:38-05:00,plant_height,Maximum vertical height from the base of the p...,53.0,cm,PlantCV,100%: 217 ml water (47.6% VWC)


In [86]:
# Show the last rows of the date-sorted frame.
# (Restored from a commented-out line so a fresh run reproduces the table
# shown beneath this cell.)
danforth_plant_height_5.tail()

Unnamed: 0,sitename,cultivar,lat,lon,date,trait,trait_description,value,units,method_name,treatment
181,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-20 15:43:01-05:00,plant_height,Maximum vertical height from the base of the p...,838.0,cm,PlantCV,40%: 86.8 ml water (17.2% VWC)
182,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-20 15:43:01-05:00,plant_height,Maximum vertical height from the base of the p...,838.0,cm,PlantCV,40%: 86.8 ml water (17.2% VWC)
183,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-20 16:16:11-05:00,plant_height,Maximum vertical height from the base of the p...,880.0,cm,PlantCV,100%: 217 ml water (47.6% VWC)
184,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-23 18:01:40-05:00,plant_height,Maximum vertical height from the base of the p...,1350.0,cm,PlantCV,40%: 86.8 ml water (17.2% VWC)
185,Danforth Plant Science Center Bellweather Phen...,PI564163,38.674826,-90.396971,2014-06-23 18:01:40-05:00,plant_height,Maximum vertical height from the base of the p...,1350.0,cm,PlantCV,40%: 86.8 ml water (17.2% VWC)


#### I. Write dataframe to `.csv`

In [87]:
# Persist the cleaned, date-sorted plant-height table.
# NOTE(review): the row index is written as an unnamed first column
# (to_csv default) — confirm downstream readers expect that.
danforth_plant_height_5.to_csv(
    path_or_buf='data/processed/danforth_plant_height_2020-04-01.csv'
)