## Query BETYdb for the phenotypes dataset
* Season 4 & Season 6
* Wide format: one row per plot
* Days to Emergence
* Days to Flowering
* Aboveground dry biomass

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# %pwd
# %cd '/Users/ejcain/Dev/UA-AG/phenotypes/terraref-datasets/'

In [None]:
# MAC Season 4 

mac_season_4_df = pd.read_csv('../data/raw/mac_season_4.csv')
mac_season_4_df.head()

In [None]:
# Re-read the season 4 export without triggering pandas' DtypeWarning.
# The warning comes from chunked type inference on columns holding mixed types
# (columns 18, 32 and 35 according to the warning inspected further down).
# The earlier attempt passed dtype={'a': str}; no column named 'a' is used
# anywhere in this notebook, so that mapping presumably matched nothing.
# low_memory=False parses each column in one pass, giving one dtype per column.

df2 = pd.read_csv('../data/raw/mac_season_4.csv', sep=',', low_memory=False)
df2.head()

In [None]:
# confirm same shape from R

df2.shape

In [None]:
df2.columns

In [None]:
# mixed dtypes in columns 18, 32, 35

print(df2.columns[18], df2.columns[32], df2.columns[35])

In [None]:
df2['treatment'].head()

In [None]:
df2['notes'].head()

In [None]:
df2['entity'].head()

In [None]:
df2.result_type.unique()

In [None]:
df2.iloc[5]

In [None]:
df2.trait.unique()

In [None]:
df2.sitename.nunique()

In [None]:
df2.site_id.nunique()

In [None]:
df2.sitename.unique()

In [None]:
df2.sitename.head(25)

In [None]:
df2.loc[df2.trait == 'aboveground_dry_biomass'].head()

In [None]:
df2.iloc[864]

In [None]:
# sitename - column 7
# trait - column 25
# mean - column 27
# units - column 28

In [None]:
# test_pivot_df = df2.pivot(index = 'sitename', columns = 'trait', values = ['mean', 'units'])
# test_pivot_df.head()

In [None]:
# inspect duplicates

duplicates = df2.duplicated(subset = ['sitename', 'trait'])

In [None]:
duplicates.value_counts()

In [None]:
duplicates.head()

In [None]:
# rows 1:5 - sitename, trait, mean, units

test_sample = df2.loc[1:5, ['sitename', 'trait', 'mean', 'units']]
test_sample

In [None]:
# days to emergence
# days to flowering
# aboveground dry biomass

In [None]:
# aboveground dry biomass should only have been measured once per plot - at the end of harvest date
# check for duplicates 

dry_biomass_trait = df2.loc[df2['trait'] == 'aboveground_dry_biomass']
dry_biomass_trait.duplicated(subset = 'sitename').value_counts()

In [None]:
# days to flowering

flowering_time = df2.loc[df2['trait'] == 'flowering_time']
flowering_time.duplicated(subset = 'sitename').value_counts()

In [None]:
# days to emergence

flag_leaf_emergence = df2.loc[df2['trait'] == 'flag_leaf_emergence_time']
flag_leaf_emergence.duplicated(subset = 'sitename').value_counts()


In [None]:
test_check = df2.loc[df2['sitename'].str.contains('MAC Field Scanner Season 4 Range 3 Column 12')]

In [None]:
df2.iloc[371831]

In [None]:
test_check.shape

In [None]:
print(test_check.raw_date.max())
print(test_check.raw_date.min())

In [None]:
test_check.loc[test_check.raw_date == test_check.raw_date.max()]['trait']

In [None]:
# keep only the rows whose trait is aboveground dry biomass, days to flowering,
# or days to emergence; every other trait row is discarded

traits_of_interest = ['aboveground_dry_biomass', 'flowering_time', 'flag_leaf_emergence_time']
selected_traits_only = df2[df2['trait'].isin(traits_of_interest)]

In [None]:
selected_traits_only.shape

In [None]:
selected_traits_only.trait.unique()

In [None]:
selected_traits_only.columns

In [None]:
selected_traits_only.head()



In [None]:
print(selected_traits_only.city.unique())
print(selected_traits_only.id.nunique())
print(selected_traits_only.citation_id.nunique())
print(selected_traits_only.site_id.nunique())
print(selected_traits_only.lat.nunique())
print(selected_traits_only.scientificname.nunique())
print(selected_traits_only.species_id.nunique())
print(selected_traits_only.cultivar_id.nunique())
print(selected_traits_only.trait_description.unique())
print(selected_traits_only.n.unique())
print(selected_traits_only.notes.unique())
print(selected_traits_only.cultivar.nunique())
print(selected_traits_only.method_name.unique())

In [None]:
selected_traits_only.shape

In [None]:
# columns removed from the three-trait subset before reshaping
cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'citation_id', 'scientificname',
    'species_id', 'author', 'citation_year', 'treatment', 'date', 'month',
    'year', 'dateloc', 'n', 'statname', 'stat', 'notes', 'access_level',
    'entity', 'method_name', 'view_url', 'edit_url', 'treatment_id', 'city',
    'commonname', 'genus', 'time', 'raw_date',
]

selected_and_dropped = selected_traits_only.drop(columns=cols_to_drop)

In [None]:
selected_and_dropped.head()

In [None]:
selected_and_dropped.shape

In [None]:
# Check for duplicates

selected_and_dropped.duplicated(subset = ['site_id', 'sitename', 'trait_description']).value_counts()

In [None]:
selected_and_dropped.to_csv('selected_traits_with_cultivar_and_site_ids.csv', index = False)

In [None]:
print(selected_and_dropped.trait.unique())
print(selected_and_dropped.trait_description.unique())
print(selected_and_dropped.units.unique())

In [None]:
# strip the identifier / coordinate / description columns, leaving only the
# columns that remain in selected_and_dropped after this list is removed
id_cols_to_drop = [
    'id', 'site_id', 'lat', 'lon', 'cultivar_id', 'trait_description', 'units',
]
bare_bones_df = selected_and_dropped.drop(columns=id_cols_to_drop)

In [None]:
bare_bones_df.tail()

In [None]:
print(bare_bones_df.sitename.nunique())
print(bare_bones_df.cultivar.nunique())
print(bare_bones_df.trait.nunique())

In [None]:
# Reshape to wide format with a two-level index: one row per
# (sitename, cultivar) pair and one column per trait, filled from `mean`.
# pivot_table aggregates repeated (sitename, cultivar, trait) combinations
# with its default aggfunc, i.e. the arithmetic mean.

site_cultivar_indices = bare_bones_df.pivot_table(
    values='mean',
    index=['sitename', 'cultivar'],
    columns='trait',
)

In [None]:
site_cultivar_indices.head()

In [None]:
site_cultivar_indices.shape

In [None]:
site_cultivar_indices.to_csv('site_cultivar_indices.csv')

In [None]:
print(bare_bones_df.shape)
print(bare_bones_df.sitename.nunique())
print(bare_bones_df.cultivar.nunique())

In [None]:
# `columns=` already selects the column axis, so no separate `axis` argument is needed
minus_cultivar = bare_bones_df.drop(columns='cultivar')

In [None]:
minus_cultivar.head()

In [None]:
cultivar_and_sitename = bare_bones_df[['sitename', 'cultivar']]
cultivar_and_sitename.head()

In [None]:
# set_index returns a new DataFrame instead of modifying the caller, so the
# result has to be bound back to the name for the index change to be visible
# in the preview below
cultivar_and_sitename = cultivar_and_sitename.set_index('sitename')
cultivar_and_sitename.head()

In [None]:
site_cultivar_df = pd.read_csv('site_cultivar_indices.csv')
site_cultivar_df.head()

In [None]:
filled_nas = site_cultivar_df.fillna(value='')
filled_nas.head()

In [None]:
filled_nas.sample(n=50, random_state=42)

In [None]:
filled_nas_sample = filled_nas.sample(n=50, random_state=43)
filled_nas_sample.tail()

In [None]:
filled_nas_sample.to_csv('sample_subset_season_4.csv', index=False)

In [None]:
filled_nas.to_csv('transformed_data_season_4.csv', index=False)

#### Need Max Canopy Height

In [None]:
season_four = pd.read_csv('../data/raw/mac_season_4.csv')
season_four.head()

In [None]:
season_four.trait.unique()

In [None]:
canopy_height = season_four.loc[season_four.trait == 'canopy_height']
canopy_height.shape

In [None]:
# same clean-up as for the three-trait table, except that the date-related
# columns are kept for now and the id / coordinate columns are dropped here too

cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'citation_id', 'scientificname',
    'species_id', 'author', 'citation_year', 'treatment', 'n', 'statname',
    'stat', 'notes', 'access_level', 'entity', 'method_name', 'view_url',
    'edit_url', 'treatment_id', 'city', 'commonname', 'genus', 'id',
    'site_id', 'lat', 'lon', 'cultivar_id',
]

dropped_col_df = canopy_height.drop(columns=cols_to_drop)
dropped_col_df.head()

In [None]:
dropped_col_df.trait_description.unique()

In [None]:
dropped_col_df.duplicated(subset='sitename')

In [None]:
three_traits_empty_strings = pd.read_csv('site_cultivar_indices.csv')
three_traits_empty_strings.head()

In [None]:
# Count missing values per column. DataFrame.sum() always reduces column-wise,
# whereas np.sum(DataFrame) relies on an axis=None reduction whose
# column-wise behaviour newer pandas releases deprecate.
print(three_traits_empty_strings.isnull().sum())

In [None]:
canopy_df = pd.read_csv('max_canopy_heights.csv')
canopy_df.head()

In [None]:
three_trait_df = pd.read_csv('transformed_data_season_4.csv')
three_trait_df.head()

In [None]:
three_trait_df.shape

In [None]:
canopy_df.shape

In [None]:
# outer-join the canopy heights with the three-trait table, matching
# canopy_df.new_sitename to three_trait_df.sitename and requiring the same
# cultivar; rows present on only one side are kept

merged_df = canopy_df.merge(
    three_trait_df,
    how='outer',
    left_on=['new_sitename', 'cultivar'],
    right_on=['sitename', 'cultivar'],
)
merged_df.head()

In [None]:
merged_df.tail(20)

In [None]:
print(merged_df.new_sitename.nunique())
print(merged_df.sitename.nunique())

In [None]:
# find differences between new_sitename and sitename

different_sitenames = list(set(merged_df.new_sitename) - set(merged_df.sitename))
different_sitenames

In [None]:
merged_df[merged_df.new_sitename.isin(different_sitenames)]

In [None]:
# drop sitename column

one_sitename_df = merged_df.drop(labels='sitename', axis=1)
one_sitename_df.head()

In [None]:
# replace every NaN with an empty string, then expose `new_sitename` under the
# plain `sitename` column name

blanks_for_missing = one_sitename_df.fillna('')
one_sitename_filled_nas = blanks_for_missing.rename(columns={'new_sitename': 'sitename'})
one_sitename_filled_nas.head(10)

In [None]:
one_sitename_filled_nas.to_csv('four_trait_df.csv', na_rep='', index=False)

In [None]:
# check how missing values are represented when csv read 
# without using na_values argument

testing_nans = pd.read_csv('four_trait_df.csv')
testing_nans.head(10)

In [None]:
# with na_values argument

testing_nans_2 = pd.read_csv('four_trait_df.csv', na_values='')
testing_nans_2.sample(n=15, random_state=42)

### Plot corrections

In [None]:
df = pd.read_csv('../data/raw/mac_season_4.csv')
df.head()

In [None]:
# columns to remove for the plot-correction pass; the id, site, coordinate and
# date columns are not in this list
cols_to_drop = [
    'Unnamed: 0', 'checked', 'result_type', 'citation_id', 'scientificname',
    'author', 'citation_year', 'treatment', 'n', 'statname', 'stat', 'notes',
    'access_level', 'entity', 'method_name', 'view_url', 'edit_url',
    'treatment_id', 'city', 'commonname', 'genus', 'species_id', 'month',
    'year', 'dateloc',
]

In [None]:
print(df.site_id.unique())
print(df.site_id.nunique())

In [None]:
print(df.lat.nunique())
print(df.lon.nunique())

In [None]:
print(df.cultivar_id.nunique())

In [None]:
print(df.id.nunique())
print(df.shape)

In [None]:
df_dropped = df.drop(labels=cols_to_drop, axis=1)
df_dropped.head()

In [None]:
print(df.site_id.nunique())
print(df.sitename.nunique())

In [None]:
site_index = df_dropped.set_index(keys='site_id')
site_index.head()

In [None]:
# check that no duplicate sites were dropped

print(df.shape)
print(df_dropped.shape)
print(site_index.shape)

In [None]:
site_index.trait.unique()

In [None]:
site_index.loc[site_index.trait == 'emergence_count']

In [None]:
# traits retained for the expanded dataset
traits_to_keep = [
    'ambient_humidity',
    'proximal_air_temperature',
    'surface_temperature',
    'leaf_temperature_differential',
    'aboveground_dry_biomass',
    'seedling_emergence_rate',
    'canopy_height',
    'flag_leaf_emergence_time',
    'flowering_time',
    'grain_stage_time',
    'canopy_cover',
    'emergence_count',
]

In [None]:
# test for keeping a row based on `trait` values

test_values = ['ambient_humidity', 'proximal_air_temperature']

site_index.loc[(site_index.trait == 'ambient_humidity') | (site_index.trait == 'proximal_air_temperature')].shape

In [None]:
test_trait_df = site_index.loc[site_index.trait.isin(test_values)]
test_trait_df.shape

In [None]:
test_trait_df.head()

In [None]:
test_trait_df.trait.unique()

In [None]:
df.loc[(df.trait == 'ambient_humidity') | (df.trait == 'proximal_air_temperature')].shape

In [None]:
selected_traits = site_index.loc[site_index.trait.isin(traits_to_keep)]
selected_traits.shape

In [None]:
# keep only rows whose sitename ends in ' W' or ' E'
# (presumably the west / east subplots of a plot; confirm against the site list)
plot_names = selected_traits.sitename
ends_with_w = plot_names.str.endswith(' W')
ends_with_e = plot_names.str.endswith(' E')
e_w_sites = selected_traits.loc[ends_with_w | ends_with_e]

In [None]:
e_w_sites.shape

In [None]:
# is `site_id` different for E and W?
# . . . . . yes

e_w_sites.head(25)

In [None]:
# only need to find duplicates to calculate means where the sitename is the same for the beginning of the sitename string

e_w_sites.iloc[5].sitename

In [None]:
e_w_sites.iloc[5].sitename[:-2]


In [None]:
# Derive the shared plot name for each E/W row by removing the two-character
# ' E' / ' W' suffix from its sitename, so both rows of a plot get the same
# new_plot_name and the duplicate checks that follow compare real plots.
# This replaces a constant placeholder string repeated a hard-coded 5219 times,
# which marked every row as a duplicate and would raise on any other row count.
# Take an explicit copy first: e_w_sites is a .loc selection of selected_traits,
# and adding a column to such a selection can raise SettingWithCopyWarning.
e_w_sites = e_w_sites.copy()
e_w_sites['new_plot_name'] = e_w_sites.sitename.str[:-2]
print(e_w_sites.shape)
e_w_sites.head()

In [None]:
new_duplicates = e_w_sites[e_w_sites.duplicated(['new_plot_name'], keep=False)]
new_duplicates.shape

In [None]:
new_duplicates.trait.unique()

In [None]:
canopy_height_only = new_duplicates.loc[new_duplicates.trait == 'canopy_height']
canopy_height_only.shape

In [None]:
canopy_height_only.new_plot_name.nunique()

In [None]:
canopy_height_only.duplicated(['new_plot_name', 'date'], keep=False).value_counts()

In [None]:
for i in canopy_height_only.index:
    print(i)
    break

In [None]:
range(len(canopy_height_only))

In [None]:
canopy_height_only.iloc[0]

In [None]:
for i in range(len(canopy_height_only)):
    print(canopy_height_only.iloc[i])
    break

In [None]:
canopy_height_only.iloc[0]['mean']

In [None]:
canopy_height_only.to_csv('canopy_height_only_season_4.csv')

In [None]:
selected_traits.to_csv('more_traits_after_feedback.csv')

In [None]:
canopy_only_df = pd.read_csv('canopy_height_only_season_4.csv')
canopy_only_df.head()

In [None]:
canopy_only_df['mean'].values

In [None]:
print(canopy_only_df.date.unique())
print(canopy_only_df.raw_date.unique())

In [None]:
duplicate_dates_sitenames = canopy_only_df

In [None]:
canopy_only_df.duplicated(subset=['new_plot_name', 'date'], keep=False).value_counts()

#### Canopy Height Corrections / Additions
* Correctly calculate `max_canopy_height`
* Add `days_to` or `day_of`?