## Tall Format Trait Table Season 4

In [1]:
import datetime
import numpy as np
import pandas as pd

In [5]:
# Load the raw Season 4 trait export. low_memory=False is forwarded to pandas
# (per the pandas docs this parses the file in one pass instead of in chunks,
# which avoids mixed-dtype columns).
df_0 = pd.read_csv(
    filepath_or_buffer='data/raw/mac_season_4.csv',
    low_memory=False,
)
df_0.shape

(372363, 39)

In [1]:
# df_0.trait.unique()

In [2]:
# Flip to True to list, for every distinct trait, the unit strings recorded for it.
want_to_print_traits = False

if want_to_print_traits:
    for trait in df_0.trait.unique():
        rows_for_trait = df_0.loc[df_0.trait == trait]
        print(f'Unit for {trait}: {rows_for_trait.units.unique()}')

In [5]:
# df_0.columns

In [6]:
# Columns of the raw export that are removed before reshaping (one name per line
# so additions and removals show up cleanly in diffs).
cols_to_drop = [
    'Unnamed: 0',
    'checked',
    'result_type',
    'id',
    'citation_id',
    'site_id',
    'treatment_id',
    'city',
    'scientificname',
    'commonname',
    'genus',
    'species_id',
    'cultivar_id',
    'author',
    'citation_year',
    'treatment',
    'time',
    'raw_date',
    'month',
    'year',
    'dateloc',
    'trait_description',
    'units',
    'n',
    'statname',
    'stat',
    'notes',
    'access_level',
    'entity',
    'method_name',
    'view_url',
    'edit_url',
]

In [7]:
# Discard every column named in cols_to_drop. drop() returns a new frame,
# so df_0 keeps the full raw table.
df_1 = df_0.drop(columns=cols_to_drop)
# df_1.head()

In [None]:
# df_1.shape

In [None]:
# print(df_1.date.min())
# print(df_1.date.max())

### I. Change date values to iso date format
* Open question: should the date(s) be used as the index?

In [8]:
# Date strings in the raw export can carry a trailing timezone label, e.g.
# '2017 Jun 14 (America/Phoenix)'. Remove that label so the remainder can be
# handed to pd.to_datetime.
#
# The pattern is anchored to the end of the string, so only a trailing label is
# removed and strings without it pass through unchanged. The vectorized .str
# call replaces a per-row Python loop and leaves missing values as NaN instead
# of raising on a non-string entry.
TZ_LABEL_PATTERN = r' \(America/Phoenix\)$'

new_dates = df_1.date.str.replace(TZ_LABEL_PATTERN, '', regex=True).tolist()

# Sanity check: one cleaned string per input row.
print(len(new_dates))

372363


In [9]:
# Parse the cleaned date strings into pandas datetimes. No explicit format is
# given, so pandas infers it from the strings.
iso_format_dates = pd.to_datetime(arg=new_dates)

In [10]:
# Store the parsed dates in a new 'date_1' column; the original 'date' strings
# stay in place for now.
# NOTE(review): this assignment mutates df_1 in place rather than creating a new frame.
df_1['date_1'] = iso_format_dates
# df_1.head()

Unnamed: 0,sitename,lat,lon,date,trait,mean,cultivar,date_1
0,MAC Field Scanner Season 4 Range 11 Column 5,33.074907,-111.974982,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI181083,2017-06-14
1,MAC Field Scanner Season 4 Range 11 Column 6,33.074907,-111.974966,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI564163,2017-06-14
2,MAC Field Scanner Season 4 Range 11 Column 9,33.074907,-111.974917,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI52606,2017-06-14
3,MAC Field Scanner Season 4 Range 11 Column 11,33.074907,-111.974884,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI533792,2017-06-14
4,MAC Field Scanner Season 4 Range 11 Column 14,33.074907,-111.974835,2017 Jun 14 (America/Phoenix),leaf_desiccation_present,0.0,PI535794,2017-06-14


### II. Change sitenames to strip E and W subplot designations
* Check which traits are associated with subplots

In [11]:
# Rows whose site name ends in ' E' or ' W', i.e. the E/W subplot records.
is_east_subplot = df_1.sitename.str.endswith(' E')
is_west_subplot = df_1.sitename.str.endswith(' W')
e_w_plots = df_1.loc[is_east_subplot | is_west_subplot]

# Which traits were recorded on those subplot rows?
e_w_plots.trait.unique()

array(['leaf_temperature', 'planter_seed_drop', 'seedling_emergence_rate',
       'panicle_height', 'stand_count', 'canopy_height', 'leaf_length',
       'leaf_width', 'canopy_cover', 'stem_elongated_internodes_number',
       'emergence_count', 'stalk_diameter_fixed_height',
       'stalk_diameter_minor_axis', 'stalk_diameter_major_axis',
       'plant_basal_tiller_number'], dtype=object)

In [12]:
# Keep the distinct trait names found on the E/W subplot rows for later reference.
e_w_plot_traits = e_w_plots['trait'].unique()

In [13]:
sitename_values = df_1.sitename.values

# Remove a trailing ' E' or ' W' subplot designation so that both halves of a
# plot share one site name; names without the suffix are left unchanged.
#
# The regex is anchored to the end of the string, matching the endswith() test
# it replaces. The vectorized .str call avoids a per-row Python loop (and the
# bitwise `|` that loop used on plain booleans) and leaves missing values as
# NaN instead of raising on a non-string entry.
no_e_w_names = df_1.sitename.str.replace(r' [EW]$', '', regex=True).tolist()

# Sanity check: one cleaned name per input row.
print(len(no_e_w_names))

372363


In [14]:
# Store the suffix-free site names in a new 'sitename_1' column; the original
# 'sitename' column is still present at this point.
# NOTE(review): this assignment mutates df_1 in place rather than creating a new frame.
df_1['sitename_1'] = no_e_w_names
# df_1.sample(n=7)

In [15]:
# df_1.loc[df_1.sitename.str.endswith(' E')].iloc[0]

### III. Extract Range and Column Values

In [16]:
# Pull the numeric plot coordinates out of the site name, e.g.
# 'MAC Field Scanner Season 4 Range 11 Column 5' -> range 11, column 5.
#
# Raw strings keep '\d' from being read as an (invalid) string escape, which
# Python warns about. expand=False makes extract() return a Series for the
# single capture group, so a Series (not a one-column DataFrame) is assigned.
# astype(int) raises if any sitename lacks a 'Range <n>' / 'Column <n>' part.
df_1['range'] = df_1['sitename'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_1['column'] = df_1['sitename'].str.extract(r"Column (\d+)", expand=False).astype(int)

# df_1.sample(n=7)

#### Drop duplicated Columns, Reorder, Rename, & Sort by Date

In [17]:
# The cleaned copies ('date_1', 'sitename_1') take over from here, so the raw
# 'sitename' and 'date' columns are removed.
df_2 = df_1.drop(columns=['sitename', 'date'])
df_2.shape

(372363, 9)

In [18]:
# List the columns that remain after the drop above.
df_2.columns

Index(['lat', 'lon', 'trait', 'mean', 'cultivar', 'date_1', 'sitename_1',
       'range', 'column'],
      dtype='object')

In [19]:
# Target column order for the reshaped table.
col_reorder = [
    'date_1',
    'sitename_1',
    'range',
    'column',
    'lat',
    'lon',
    'cultivar',
    'trait',
    'mean',
]

In [20]:
# Rearrange the columns into the order given by col_reorder; the row index is
# carried over from df_2 unchanged.
df_3 = df_2.reindex(columns=col_reorder)
# df_3.head()

Unnamed: 0,date_1,sitename_1,range,column,lat,lon,cultivar,trait,mean
0,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 5,11,5,33.074907,-111.974982,PI181083,leaf_desiccation_present,0.0
1,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 6,11,6,33.074907,-111.974966,PI564163,leaf_desiccation_present,0.0
2,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 9,11,9,33.074907,-111.974917,PI52606,leaf_desiccation_present,0.0
3,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 11,11,11,33.074907,-111.974884,PI533792,leaf_desiccation_present,0.0
4,2017-06-14,MAC Field Scanner Season 4 Range 11 Column 14,11,14,33.074907,-111.974835,PI535794,leaf_desiccation_present,0.0


In [21]:
# Give the cleaned columns their final names: the '_1' suffixes are removed and
# 'mean' becomes 'value'.
final_column_names = {
    'date_1': 'date',
    'sitename_1': 'sitename',
    'mean': 'value',
}
df_4 = df_3.rename(columns=final_column_names)
# df_4.tail()

Unnamed: 0,date,sitename,range,column,lat,lon,cultivar,trait,value
372358,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 9,44,9,33.076093,-111.974917,PI329286,leaf_angle_beta,2.052416
372359,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 9,44,9,33.076093,-111.974917,PI329286,leaf_angle_chi,2.140986
372360,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,leaf_angle_mean,0.401801
372361,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,leaf_angle_alpha,3.217374
372362,2017-06-06,MAC Field Scanner Season 4 Range 44 Column 13,44,13,33.076093,-111.974852,PI329843,leaf_angle_beta,1.980157


In [22]:
# Order the rows by date, earliest first (ascending is the pandas default).
df_5 = df_4.sort_values('date')

# Sorting must neither add nor drop rows: the two printed shapes should match.
print(df_5.shape, df_4.shape, sep='\n')
# df_5.head()

(372363, 9)
(372363, 9)


### IV. Drop traits not needed at this time
* Dropping `canopy_height` because it is recorded on the E/W subplots; those values are handled separately.

In [None]:
# df_5.trait.unique()

In [23]:
# Only rows for these traits are kept in the output table.
traits_to_keep = [
    'flowering_time',
    'flag_leaf_emergence_time',
    'light_intensity_PAR',
    'aboveground_dry_biomass',
]

In [24]:
# Keep only the rows whose trait is listed in traits_to_keep.
is_wanted_trait = df_5['trait'].isin(traits_to_keep)
df_6 = df_5[is_wanted_trait]
df_6.shape

(2338, 9)

In [25]:
# Count missing values per column (isna is the same check as isnull).
df_6.isna().sum()

date        0
sitename    0
range       0
column      0
lat         0
lon         0
cultivar    0
trait       0
value       0
dtype: int64

In [3]:
# df_6.head()

In [27]:
# Move 'date' out of the columns and make it the row index of the final table.
df_7 = df_6.set_index(keys='date')
# df_7.head()

Unnamed: 0_level_0,sitename,range,column,lat,lon,cultivar,trait,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-07-20,MAC Field Scanner Season 4 Range 20 Column 11,20,11,33.075231,-111.974884,PI527045,flowering_time,56.0
2017-07-20,MAC Field Scanner Season 4 Range 16 Column 7,16,7,33.075087,-111.97495,PI152651,flag_leaf_emergence_time,41.0
2017-07-20,MAC Field Scanner Season 4 Range 20 Column 9,20,9,33.075231,-111.974917,PI535792,flowering_time,74.0
2017-07-20,MAC Field Scanner Season 4 Range 20 Column 9,20,9,33.075231,-111.974917,PI535792,flag_leaf_emergence_time,70.0
2017-07-20,MAC Field Scanner Season 4 Range 20 Column 3,20,3,33.075231,-111.975015,PI656026,flowering_time,65.0


#### Final Steps
* Create `.csv`

In [28]:
# Flip to True to write df_7 to a timestamped CSV under data/processed/.
need_to_create_csv = False

if need_to_create_csv:
    # Truncate to whole seconds so the ISO timestamp has no fractional part.
    now_to_the_second = datetime.datetime.now().replace(microsecond=0)
    timestamp = now_to_the_second.isoformat()

    # The colons from the ISO time are removed from the file name.
    output_filename = f'tall_format_traits_season_4_{timestamp}.csv'.replace(':', '')
    df_7.to_csv(f'data/processed/{output_filename}')