### Phenotype Table for Initial Set of Training Data

In [1]:
import datetime
import pandas as pd
import numpy as np
import sqlite3

#### Import dataset downloaded from betydb in R

In [2]:
# Load the season 4 trait records that were downloaded from BETYdb (via R).
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids the mixed-type DtypeWarning (the truncated warning left in
# this cell's saved output appears to be one).
df_0 = pd.read_csv('../data/raw/mac_season_4.csv', low_memory=False)
df_0.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,...,n,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url
0,1,0,traits,6001958927,6000000000.0,6000005673,6000000000.0,MAC Field Scanner Season 4 Range 11 Column 5,Maricopa,33.074907,...,,,,,2,PI181083,,Visual assessment of leaf dessication,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
1,2,0,traits,6001958928,6000000000.0,6000005676,6000000000.0,MAC Field Scanner Season 4 Range 11 Column 6,Maricopa,33.074907,...,,,,,2,PI564163,,Visual assessment of leaf dessication,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
2,3,0,traits,6001958931,6000000000.0,6000005685,6000000000.0,MAC Field Scanner Season 4 Range 11 Column 9,Maricopa,33.074907,...,,,,,2,PI52606,,Visual assessment of leaf dessication,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
3,4,0,traits,6001958933,6000000000.0,6000005691,6000000000.0,MAC Field Scanner Season 4 Range 11 Column 11,Maricopa,33.074907,...,,,,,2,PI533792,,Visual assessment of leaf dessication,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...
4,5,0,traits,6001958936,6000000000.0,6000005700,6000000000.0,MAC Field Scanner Season 4 Range 11 Column 14,Maricopa,33.074907,...,,,,,2,PI535794,,Visual assessment of leaf dessication,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...


In [3]:
# Display the column labels of the raw table to decide which ones to keep.
df_0.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [4]:
# Columns that are not needed for the phenotype table; removed in one pass.
unused_columns = [
    'Unnamed: 0', 'checked', 'citation_id', 'city', 'scientificname', 'commonname',
    'genus', 'species_id', 'author', 'citation_year', 'trait_description', 'units',
    'n', 'statname', 'stat', 'access_level', 'view_url', 'edit_url',
]
df_1 = df_0.drop(columns=unused_columns)

#### Extract Range and Column Values

In [5]:
# Work on an independent (deep) copy so that adding columns below leaves df_1 untouched.
df_2 = df_1.copy(deep=True)

In [6]:
# Parse the numeric range and column out of plot names such as
# "MAC Field Scanner Season 4 Range 11 Column 5".
# Raw strings stop the backslash-d in the pattern from being read as an (invalid) string
# escape, and expand=False returns a Series for the single capture group.
# NOTE: astype(int) raises if any sitename lacks a "Range N" / "Column N" part.
df_2['range'] = df_2['sitename'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_2['column'] = df_2['sitename'].str.extract(r"Column (\d+)", expand=False).astype(int)

#### Convert table to wide format
* Each trait should have its own column
* Rename `mean` column to `value` for easier understanding

In [7]:
# `mean` holds the recorded trait value; call it `value` so the meaning is obvious.
df_3 = df_2.rename(columns={'mean': 'value'})

In [8]:
# Traits that each get their own column in the wide-format table.
traits_to_keep = [
    'leaf_temperature',
    'ambient_humidity',
    'proximal_air_temperature',
    'surface_temperature',
    'aboveground_dry_biomass',
    'canopy_height',
    'flag_leaf_emergence_time',
    'flowering_time',
    'canopy_cover',
]

In [9]:
# Build a frame with df_3's row index and one column per requested trait.
# None of these trait names exist as columns in df_3, so every cell starts out as NaN.
empty_df = df_3.reindex(columns=traits_to_keep)

In [10]:
# Place the (still empty) trait columns to the right of the existing columns,
# aligning them on df_3's row index first.
trait_columns = empty_df.reindex(df_3.index)
df_4 = pd.concat([df_3, trait_columns], axis='columns')

#### Drop more unnecessary (at this time) columns

In [11]:
# Remove further columns that are not needed at this stage.
df_5 = df_4.drop(columns=['result_type', 'treatment_id', 'treatment', 'dateloc'])

#### Populate empty columns with available values

In [21]:
# Copy each row's `value` into the wide-format column named by that row's `trait`.
# Rows whose trait is not in `traits_to_keep` are left untouched.
#
# This replaces a row-by-row `iterrows` loop that was very slow and whose progress counter
# was only incremented inside the `counter % 1000 == 0` branch, so it never advanced past 1.
# One masked assignment per trait writes the same cells without iterating over rows.

# Opt-in flag kept from the original cell; set to True to populate the trait columns.
run_slow_stuff = False

if run_slow_stuff:

    for trait in traits_to_keep:
        # Rows that record this trait (a missing `trait` never matches)
        is_trait = df_5['trait'] == trait
        df_5.loc[is_trait, trait] = df_5.loc[is_trait, 'value']

#### Change plots to index

In [13]:
# Use the plot name (`sitename`) as the row index.
df_6 = df_5.set_index(keys='sitename')

#### Drop some columns that are redundant or can be explained in data dictionary
* `month`
* `year`
* `notes`
* `trait` - now have trait values in wide format (one column per trait requested for this iteration of dataset)
* `method_name`

In [14]:
# Drop columns that are redundant or can be explained in the data dictionary.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run: a second execution finds the columns already gone and does nothing instead of
# raising KeyError. The duplicated 'notes' label has been removed from the list.
df_6 = df_6.drop(columns=['month', 'year', 'notes', 'trait', 'method_name'], errors='ignore')

#### Read in `df_6` 

In [15]:
# df_6 = pd.read_csv('../data/processed/pheno-table_populated_traits_2019-11-18T071126.csv')

#### Calculate average canopy heights
1. Convert df to sqlite db
2. Group by range and column values (to bypass the current E and W plots which are still in the dataset) and date 
3. Generate average canopy height values to add to dataset

#### Establish connection to sqlite db

In [41]:
# Open the SQLite database file in the working directory and get a cursor on it.
db_path = 'season_4_phenos.sqlite'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
print("Opened database successfully")

Opened database successfully


#### Convert df to sqlite db and generate average canopy height df

In [42]:
# Write df_6 (including its sitename index) to a table in the SQLite database.
# if_exists='replace' lets this cell be re-run against the on-disk database; the default
# ('fail') raises ValueError as soon as the table exists from a previous run.
df_6.to_sql('season_4_phenos.sqlite', conn, if_exists='replace')

In [47]:
# Average canopy height per (range, column, date), ignoring rows with no canopy height.
# `range` and `column` are SQL keywords and the table name contains a dot, so all three are
# written as double-quoted identifiers instead of relying on SQLite's lenient parsing of
# bare keywords and single-quoted names.
# NOTE(review): `date` holds text such as "2017 May 8", so ORDER BY date sorts
# alphabetically rather than chronologically - confirm whether the order matters downstream.
avg_canopy_heights = pd.read_sql_query("""
                                        SELECT "range", "column", date, avg(canopy_height)
                                        FROM "season_4_phenos.sqlite"
                                        WHERE canopy_height IS NOT NULL
                                        GROUP BY "range", "column", date
                                        ORDER BY date DESC;
                                        """, conn)

In [48]:
# avg_canopy_heights.head()

Unnamed: 0,range,column,date,avg(canopy_height)
0,2,2,2017 May 8,13.0
1,2,3,2017 May 8,14.0
2,2,4,2017 May 8,12.0
3,2,5,2017 May 8,13.0
4,2,6,2017 May 8,13.0


#### Join `avg_canopy_heights` df with main df

In [50]:
# avg_canopy_heights.shape

(34834, 4)

In [53]:
# Attach the per-(range, column, date) average canopy height to every matching row of df_6.
# pandas does not accept `on=` together with `left_index=True` (current versions raise
# MergeError), so the sitename index is moved into a column for the merge and restored after.
# The join is an inner join (the pandas default, spelled out here), so rows of df_6 without
# a matching average are dropped.
merge_keys = ['range', 'column', 'date']
df_7 = (
    df_6.reset_index()
    .merge(avg_canopy_heights, how='inner', on=merge_keys)
    .set_index('sitename')
)

In [54]:
# (rows, columns) after merging in the average canopy heights
df_7.shape

(186874, 23)

In [55]:
# (rows, columns) before the merge, for comparison with df_7
df_6.shape

(372363, 22)

In [56]:
# Sanity check: count the rows of df_6 that share their (range, column, date) key with at
# least one other row. keep=False flags every member of a repeated key, and rows without a
# canopy height are included here, so the True count can be higher than the number of rows
# returned by the SQL query (which yields one row per key).

key_columns = ['range', 'column', 'date']
shares_key = df_6.duplicated(subset=key_columns, keep=False)
shares_key.value_counts()

True     356686
False     15677
dtype: int64

#### Add temperatures to calculate GDD

#### Add growing degree days

In [None]:
# from tutorials

#   mutate(date = as.Date(time), 
#          air_temp_converted = air_temperature - 273.15) %>% 
#   group_by(date) %>% 
#   summarise(min_temp = min(air_temp_converted), 
#             max_temp = max(air_temp_converted), 
#             gdd = ifelse(sum(min_temp, max_temp) / 2 > 10, 
#                          (max_temp + min_temp) / 2 - 10, 0))

#### Add max canopy height

#### Re-order column names

#### Add planting date column

In [16]:
# Export the table under a file name that carries the current timestamp (to the second).
# Colons are removed from the file name.
# NOTE: this still writes df_6; the frame being exported is expected to be renamed later.

now = datetime.datetime.now().replace(microsecond=0)
timestamp = now.isoformat()
output_filename = 'pheno-table_{}.csv'.format(timestamp).replace(':', '')
df_6.to_csv('../data/processed/' + output_filename)