## Phenotype Table I
December 2019
#### Traits
* Season 4 biomass at harvest
    * `aboveground_dry_biomass`
    * `aboveground_fresh_biomass`
* Season 4 `growing_degree_days`

#### Notes
* one plot per row

In [1]:
import datetime
import pandas as pd
import numpy as np

In [2]:
df_0 = pd.read_csv('../data/raw/mac_season_4.csv', low_memory=False)
print(df_0.shape)
# df_0.head()

(372363, 39)


### I. Set index to sitename 
* Maintain shape of dataframe
* No grouping
* Strip `E` and `W` from sitenames

In [3]:
e_w_subplots = df_0.loc[(df_0.sitename.str.endswith(' E')) | (df_0.sitename.str.endswith(' W'))]
e_w_subplots.shape

(9460, 39)

In [4]:
new_sitenames = []
e_w_counter = 0

for site in df_0.sitename.values:
    
    if (site.endswith(' W')) | (site.endswith(' E')):
        new_name = site[:-2]
        new_sitenames.append(new_name)
        e_w_counter += 1
        
    else:
        new_sitenames.append(site)
        
print(f'Length of original sitename values list: {len(df_0.sitename.values)}')
print(f'Length new sitename list: {len(new_sitenames)}')
print(f'Number of E and W subplots: {e_w_counter}')

Length of original sitename values list: 372363
Length new sitename list: 372363
Number of E and W subplots: 9460


In [5]:
df_0['plot_name'] = new_sitenames
print(df_0.shape)
# df_0.head()

(372363, 40)


In [6]:
df_1 = df_0.set_index(keys='plot_name')
print(df_1.shape)
# df_1.head()

(372363, 39)


In [7]:
print(f'Number of unique plot names: {df_1.index.nunique()}')

Number of unique plot names: 847


### II. Biomass at harvest

#### A. Aboveground dry biomass

In [85]:
dry_bio_values_only = df_1.loc[df_1.trait == 'aboveground_dry_biomass']
print(f'Number of aboveground dry biomass values in dataset: {dry_bio_values_only.shape[0]}')
print(f'Number of unique plots in biomass df: {dry_bio_values_only.index.nunique()}')

Number of aboveground dry biomass values in dataset: 200
Number of unique plots in biomass df: 199


In [86]:
# Identify the duplicate plot

duplicates = bio_values_only[bio_values_only.index.duplicated(keep=False)]
# duplicates

In [87]:
print(duplicates.index.unique())
print(duplicates['mean'].unique())

Index(['MAC Field Scanner Season 4 Range 20 Column 11'], dtype='object', name='plot_name')
[10950.]


In [88]:
non_duplicate_columns = []

for col in duplicates.columns:
    
    if duplicates.iloc[0][col] != duplicates.iloc[1][col]:
        non_duplicate_columns.append(col)
        
# print(non_duplicate_columns)

for col in non_duplicate_columns:
    print(f'{col}: {duplicates.iloc[0][col]}')
    print(f'{col}: {duplicates.iloc[1][col]}')

Unnamed: 0: 72499
Unnamed: 0: 279789
n: nan
n: nan
statname: nan
statname: nan
stat: nan
stat: nan
notes: nan
notes: nan
entity: nan
entity: nan


#### Create new column `aboveground_dry_biomass` and populate with values
* Null values will be represented by an empty string (after calculations)

In [13]:
# Create new empty column

df_1['aboveground_dry_biomass'] = np.nan

In [14]:
df_2 = df_1.copy()
# df_2.head()

In [15]:
# for rows where trait == aboveground_dry_biomass, use that value to populate new aboveground_dry_biomass column

for index, row in df_2.iterrows():            
    
        if row['trait'] == 'aboveground_dry_biomass':
            df_2.loc[index, ['aboveground_dry_biomass']] = row['mean']

#### B. Aboveground fresh biomass

In [89]:
fresh_bio_values_only = df_1.loc[df_1.trait == 'aboveground_fresh_biomass']
print(f'Number of aboveground fresh biomass values in dataset: {fresh_bio_values_only.shape[0]}')
print(f'Number of unique plots in biomass df: {fresh_bio_values_only.index.nunique()}')

Number of aboveground fresh biomass values in dataset: 543
Number of unique plots in biomass df: 543


In [90]:
# no duplicates! 

#### Create new column `aboveground_fresh_biomass` and populate with values
* Null values will be represented by an empty string (after calculations)

In [92]:
# Create new empty column

df_2['aboveground_fresh_biomass'] = np.nan

In [93]:
df_3 = df_2.copy()

In [94]:
# for rows where trait == aboveground_fresh_biomass, use that value to populate new aboveground_fresh_biomass column

for index, row in df_3.iterrows():            
    
        if row['trait'] == 'aboveground_fresh_biomass':
            df_3.loc[index, ['aboveground_fresh_biomass']] = row['mean']

### III. Drop columns

In [96]:
df_3.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url', 'aboveground_dry_biomass',
       'aboveground_fresh_biomass'],
      dtype='object')

In [97]:
cols_to_drop = ['Unnamed: 0', 'checked', 'site_id', 'result_type', 'id', 'citation_id', 'treatment_id', 'sitename', 'city', 
                'lat', 'lon', 'scientificname', 'commonname', 'genus', 'species_id', 'author', 'citation_year', 
                'treatment', 'time', 'raw_date', 'month', 'year', 'dateloc', 'trait', 'trait_description', 'mean', 
                'units', 'n', 'statname', 'stat', 'notes', 'access_level', 'entity', 'method_name', 'view_url', 
                'edit_url',]

In [98]:
df_4 = df_3.drop(labels=cols_to_drop, axis=1)
# df_4.head()

Unnamed: 0_level_0,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MAC Field Scanner Season 4 Range 11 Column 5,6000000730,2017 Jun 14 (America/Phoenix),PI181083,,
MAC Field Scanner Season 4 Range 11 Column 6,6000000231,2017 Jun 14 (America/Phoenix),PI564163,,37520.0
MAC Field Scanner Season 4 Range 11 Column 9,6000000860,2017 Jun 14 (America/Phoenix),PI52606,,74180.0
MAC Field Scanner Season 4 Range 11 Column 11,6000000863,2017 Jun 14 (America/Phoenix),PI533792,,23020.0
MAC Field Scanner Season 4 Range 11 Column 14,6000000869,2017 Jun 14 (America/Phoenix),PI535794,,


#### Sort by date so that only the first duplicate `plot_name` (latest date) will be kept

In [99]:
df_4.dtypes

cultivar_id                    int64
date                          object
cultivar                      object
aboveground_dry_biomass      float64
aboveground_fresh_biomass    float64
dtype: object

In [101]:
df_5 = df_4.sort_values(by=['date'], ascending=False)
# df_5.head()

Unnamed: 0_level_0,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MAC Field Scanner Season 4 Range 18 Column 13,6000000996,2017 Sep 15 (America/Phoenix),PI641817,,75880.0
MAC Field Scanner Season 4 Range 16 Column 13,6000000465,2017 Sep 15 (America/Phoenix),PI146890,,82700.0
MAC Field Scanner Season 4 Range 12 Column 14,6000000958,2017 Sep 15 (America/Phoenix),PI570109,,47750.0
MAC Field Scanner Season 4 Range 9 Column 13,6000000756,2017 Sep 15 (America/Phoenix),PI276837,,149200.0
MAC Field Scanner Season 4 Range 13 Column 12,6000000853,2017 Sep 15 (America/Phoenix),PI514456,,63950.0


In [102]:
print(f'Number of unique plot names: {df_5.index.nunique()}')
print(' ')
print(f'Shape of original data: {df_0.shape}')
print(f'Shape of current processed data with dropped columns: {df_5.shape}')

Number of unique plot names: 847
 
Shape of original data: (372363, 40)
Shape of current processed data with dropped columns: (372363, 5)


#### Drop duplicates by
* `plot_name`
* `cultivar_id`
* `cultivar`
* `aboveground_dry_biomass`
* `aboveground_fresh_biomass`

#### Notes
* Only the latest (harvest) date should be used since df sorted by descending date
* reset index so that duplicates can be checked in axis 1 columns only, rather than having to check for both axes

In [103]:
df_6 = df_5.reset_index()
# df_6.head()

Unnamed: 0,plot_name,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
0,MAC Field Scanner Season 4 Range 18 Column 13,6000000996,2017 Sep 15 (America/Phoenix),PI641817,,75880.0
1,MAC Field Scanner Season 4 Range 16 Column 13,6000000465,2017 Sep 15 (America/Phoenix),PI146890,,82700.0
2,MAC Field Scanner Season 4 Range 12 Column 14,6000000958,2017 Sep 15 (America/Phoenix),PI570109,,47750.0
3,MAC Field Scanner Season 4 Range 9 Column 13,6000000756,2017 Sep 15 (America/Phoenix),PI276837,,149200.0
4,MAC Field Scanner Season 4 Range 13 Column 12,6000000853,2017 Sep 15 (America/Phoenix),PI514456,,63950.0


In [104]:
df_7 = df_6.drop_duplicates(subset=['plot_name', 'cultivar_id', 'cultivar', 'aboveground_dry_biomass',
                                   'aboveground_fresh_biomass'], keep='first')
df_7.head()

Unnamed: 0,plot_name,cultivar_id,date,cultivar,aboveground_dry_biomass,aboveground_fresh_biomass
0,MAC Field Scanner Season 4 Range 18 Column 13,6000000996,2017 Sep 15 (America/Phoenix),PI641817,,75880.0
1,MAC Field Scanner Season 4 Range 16 Column 13,6000000465,2017 Sep 15 (America/Phoenix),PI146890,,82700.0
2,MAC Field Scanner Season 4 Range 12 Column 14,6000000958,2017 Sep 15 (America/Phoenix),PI570109,,47750.0
3,MAC Field Scanner Season 4 Range 9 Column 13,6000000756,2017 Sep 15 (America/Phoenix),PI276837,,149200.0
4,MAC Field Scanner Season 4 Range 13 Column 12,6000000853,2017 Sep 15 (America/Phoenix),PI514456,,63950.0


In [105]:
print(f'Number of unique plot names: {df_4.index.nunique()}')
print(f'Number of rows after duplicates dropped: {df_7.shape[0]}')

Number of unique plot names: 847
Number of rows after duplicates dropped: 847


In [106]:
# check number of null values

df_7.isnull().sum()

plot_name                      0
cultivar_id                    0
date                           0
cultivar                       0
aboveground_dry_biomass      648
aboveground_fresh_biomass    304
dtype: int64

In [108]:
# check proper dates - harvest dates for non-null values should be between 2017-9-10 to 2017-9-15 (Arizona dates)
# the raw harvest UTC dates in the metadata are between 2017-09-11 to 2017-9-16

non_nulls = df_7.loc[(df_6.aboveground_dry_biomass.notna()) | (df_7.aboveground_fresh_biomass.notna())]
non_nulls.shape

(547, 6)

In [109]:
print(f'Earliest date for biomass harvest: {non_nulls.date.min()}')
print(f'Latest date for biomass harvest: {non_nulls.date.max()}')

Earliest date for biomass harvest: 2017 Sep 10 (America/Phoenix)
Latest date for biomass harvest: 2017 Sep 15 (America/Phoenix)


### IV. Add Growing Degree Days

#### Final Step
* null values converted to empty strings

In [110]:
# intentional csv creation
# change to True to create csv

do_you_need_to_create_another_csv = True

if do_you_need_to_create_another_csv:

    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = f'aboveground_biomass_harvest_{timestamp}.csv'.replace(':', '')
    df_7.to_csv(f'../data/processed/{output_filename}')

## For Future Tests

In [None]:
# print(f'Shape of original dataframe: {df_0.shape}')
# print(f'Shape of re-indexed dataframe: {df_1.shape}')

#### II. Determine number of rows for processed dataset
* Unique sitenames
* Take `E` and `W` subplots into consideration if present
    * check for subplots

#### Extract Range and Column Values

In [None]:
# saving code

# df_4['range'] = df_4['sitename'].str.extract("Range (\d+)").astype(int)
# df_4['column'] = df_4['sitename'].str.extract("Column (\d+)").astype(int)