### Phenotype Table I
December 2019
#### Traits
* Season 4 `end_of_season_biomass`
* Season 4 `growing_degree_days`

#### Notes
* one plot per row

In [1]:
import datetime
import pandas as pd
import numpy as np

In [2]:
df_0 = pd.read_csv('../data/raw/mac_season_4.csv', low_memory=False)
print(df_0.shape)
# df_0.head()

(372363, 39)


### I. Reset index to sitename 
* Maintain shape of dataframe
* No grouping
* Strip `E` and `W` from sitenames

In [3]:
e_w_subplots = df_0.loc[(df_0.sitename.str.endswith(' E')) | (df_0.sitename.str.endswith(' W'))]
e_w_subplots.shape

(9460, 39)

In [4]:
new_sitenames = []
e_w_counter = 0

for site in df_0.sitename.values:
    
    if (site.endswith(' W')) | (site.endswith(' E')):
        new_name = site[:-2]
        new_sitenames.append(new_name)
        e_w_counter += 1
        
    else:
        new_sitenames.append(site)
        
print(f'Length of original sitename values list: {len(df_0.sitename.values)}')
print(f'Length new sitename list: {len(new_sitenames)}')
print(f'Number of E and W subplots: {e_w_counter}')

Length of original sitename values list: 372363
Length new sitename list: 372363
Number of E and W subplots: 9460


In [5]:
df_0['plot_name'] = new_sitenames
print(df_0.shape)
# df_0.head()

(372363, 40)


In [6]:
df_1 = df_0.set_index(keys='plot_name')
print(df_1.shape)
# df_1.head()

(372363, 39)


In [7]:
print(f'Number of unique plot names: {df_1.index.nunique()}')

Number of unique plot names: 847


### II. Aboveground dry biomass (end of season)

#### Check for null `aboveground_dry_biomass` values

In [8]:
bio_values_only = df_1.loc[df_1.trait == 'aboveground_dry_biomass']
print(f'Number of aboveground dry biomass values in dataset: {bio_values_only.shape[0]}')
print(f'Number of unique plots in biomass df: {bio_values_only.index.nunique()}')

Number of aboveground dry biomass values in dataset: 200
Number of unique plots in biomass df: 199


In [9]:
# Identify the duplicate plot

duplicates = bio_values_only[bio_values_only.index.duplicated(keep=False)]
# duplicates

In [10]:
print(duplicates.index.unique())
print(duplicates['mean'].unique())

Index(['MAC Field Scanner Season 4 Range 20 Column 11'], dtype='object', name='plot_name')
[10950.]


In [11]:
non_duplicate_columns = []

for col in duplicates.columns:
    
    if duplicates.iloc[0][col] != duplicates.iloc[1][col]:
        non_duplicate_columns.append(col)
        
# print(non_duplicate_columns)

for col in non_duplicate_columns:
    print(f'{col}: {duplicates.iloc[0][col]}')
    print(f'{col}: {duplicates.iloc[1][col]}')

Unnamed: 0: 72499
Unnamed: 0: 279789
n: nan
n: nan
statname: nan
statname: nan
stat: nan
stat: nan
notes: nan
notes: nan
entity: nan
entity: nan


Since the `aboveground_dry_biomass` value and plot name are duplicated, one of these rows can be dropped from the dataset.

In [12]:
# df_1.loc[(df_1.index == 'MAC Field Scanner Season 4 Range 20 Column 11') & (df_1['mean'] == 10950)]

#### Create new column `aboveground_dry_biomass` and populate with values
* Null values will be represented by an empty string (after calculations)

In [13]:
# Create new empty column

df_1['aboveground_dry_biomass'] = np.nan

In [14]:
df_2 = df_1.copy()
# df_2.head()

In [15]:
# for rows where trait == aboveground_dry_biomass, use that value to populate new aboveground_dry_biomass column

for index, row in df_2.iterrows():            
    
        if row['trait'] == 'aboveground_dry_biomass':
            df_2.loc[index, ['aboveground_dry_biomass']] = row['mean']

In [None]:
# df_2.to_csv('data/interim/tall_format_with_biomass_column_2019-12-10.csv')

### III. Drop columns

In [16]:
df_2.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url', 'aboveground_dry_biomass'],
      dtype='object')

In [17]:
cols_to_drop = ['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'treatment_id', 'sitename', 'city', 
                'lat', 'lon', 'scientificname', 'commonname', 'genus', 'species_id', 'author', 'citation_year', 
                'treatment', 'time', 'raw_date', 'month', 'year', 'dateloc', 'trait', 'trait_description', 'mean', 
                'units', 'n', 'statname', 'stat', 'notes', 'access_level', 'entity', 'method_name', 'view_url', 
                'edit_url']

In [18]:
df_3 = df_2.drop(labels=cols_to_drop, axis=1)
# df_3.head()

#### Sort by biomass values so that the first duplicate `plot_name` can be dropped

In [19]:
df_3.head()

Unnamed: 0_level_0,site_id,cultivar_id,date,cultivar,aboveground_dry_biomass
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MAC Field Scanner Season 4 Range 11 Column 5,6000005673,6000000730,2017 Jun 14 (America/Phoenix),PI181083,
MAC Field Scanner Season 4 Range 11 Column 6,6000005676,6000000231,2017 Jun 14 (America/Phoenix),PI564163,
MAC Field Scanner Season 4 Range 11 Column 9,6000005685,6000000860,2017 Jun 14 (America/Phoenix),PI52606,
MAC Field Scanner Season 4 Range 11 Column 11,6000005691,6000000863,2017 Jun 14 (America/Phoenix),PI533792,
MAC Field Scanner Season 4 Range 11 Column 14,6000005700,6000000869,2017 Jun 14 (America/Phoenix),PI535794,


In [30]:
df_3.dtypes

site_id                      int64
cultivar_id                  int64
date                        object
cultivar                    object
aboveground_dry_biomass    float64
dtype: object

In [35]:
df_4 = df_3.sort_values(by='aboveground_dry_biomass', ascending=False)
df_4.head()

Unnamed: 0_level_0,site_id,cultivar_id,date,cultivar,aboveground_dry_biomass
plot_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Aug 10,PI330185,52200.0
MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 6,PI330185,52200.0
MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 May 5,PI330185,52200.0
MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 6,PI330185,52200.0
MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 31,PI330185,52200.0


In [37]:
print(f'Number of unique plot names: {df_4.index.nunique()}')
print(' ')
print(f'Shape of original data: {df_0.shape}')
print(f'Shape of current processed data with dropped columns: {df_4.shape}')

Number of unique plot names: 847
 
Shape of original data: (372363, 40)
Shape of current processed data with dropped columns: (372363, 5)


#### Drop duplicates by
* `plot_name`
* `cultivar_id`
* `cultivar`
* `aboveground_dry_biomass`

#### Notes
* `site_id` may be different / leftover from E and W subplots
* should not be different dates, but if so - later date will be used
* reset index so that duplicates can be checked in axis 1 columns only, rather than having to check for both axes

In [41]:
df_5 = df_4.reset_index()
df_5.head()

Unnamed: 0,plot_name,site_id,cultivar_id,date,cultivar,aboveground_dry_biomass
0,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Aug 10,PI330185,52200.0
1,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 6,PI330185,52200.0
2,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 May 5,PI330185,52200.0
3,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 6,PI330185,52200.0
4,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Jul 31,PI330185,52200.0


In [48]:
# confirm that order hasn't been changed - need better way for future tests

# df_5.iloc[600:625]
# df_4.iloc[600:625]

In [49]:
df_6 = df_5.drop_duplicates(subset=['plot_name', 'cultivar_id', 'cultivar', 'aboveground_dry_biomass'],
                           keep='first')
df_6.head()

Unnamed: 0,plot_name,site_id,cultivar_id,date,cultivar,aboveground_dry_biomass
0,MAC Field Scanner Season 4 Range 19 Column 6,6000005513,6000000821,2017 Aug 10,PI330185,52200.0
425,MAC Field Scanner Season 4 Range 12 Column 6,6000005732,6000000741,2017 Jun 12,PI221548,45750.0
798,MAC Field Scanner Season 4 Range 13 Column 7,6000005566,6000000811,2017 Jun 17,PI329711,43840.0
1202,MAC Field Scanner Season 4 Range 33 Column 4,6000005279,6000000881,2017 Jun 24,PI562971,43140.0
1608,MAC Field Scanner Season 4 Range 38 Column 13,6000005849,6000000747,2017 Jun 24,PI253986,42420.0


In [51]:
print(f'Number of unique plot names: {df_4.index.nunique()}')
print(f'Number of rows with dropped duplicates: {df_6.shape[0]}')

Number of unique plot names: 847
Number of rows with dropped duplicates: 847


In [52]:
# check number of null values

df_6.isnull().sum()

plot_name                    0
site_id                      0
cultivar_id                  0
date                         0
cultivar                     0
aboveground_dry_biomass    648
dtype: int64

In [None]:
# intentional csv creation

do_you_need_to_create_another_csv = True

if do_you_need_to_create_another_csv:

    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = f'aboveground_biomass_harvest_{timestamp}.csv'.replace(':', '')
    df_6.to_csv(f'../data/processed/{output_filename}')

## For Future Tests

In [None]:
# print(f'Shape of original dataframe: {df_0.shape}')
# print(f'Shape of re-indexed dataframe: {df_1.shape}')

#### II. Determine number of rows for processed dataset
* Unique sitenames
* Take `E` and `W` subplots into consideration if present
    * check for subplots

#### Extract Range and Column Values

In [None]:
# saving code

# df_4['range'] = df_4['sitename'].str.extract("Range (\d+)").astype(int)
# df_4['column'] = df_4['sitename'].str.extract("Column (\d+)").astype(int)