## Tall Format Trait Table Season 6

In [1]:
import datetime
import numpy as np
import pandas as pd

In [5]:
# Load the raw season 6 trait export.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type column inference.
raw_traits_path = 'data/raw/season_6_traits.csv'
df_0 = pd.read_csv(raw_traits_path, low_memory=False)
df_0.shape

(925563, 38)

In [6]:
# List the column labels of the raw table to decide which ones to keep
df_0.columns

Index(['checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [7]:
# Columns removed from the raw table; every column not listed here is kept.
cols_to_drop = [
    'checked', 'result_type', 'id', 'citation_id', 'site_id', 'treatment_id',
    'city',
    'scientificname', 'commonname', 'genus', 'species_id', 'cultivar_id',
    'author', 'citation_year',
    'treatment', 'time', 'dateloc', 'trait_description', 'units', 'n',
    'statname',
    'stat', 'notes', 'access_level', 'entity', 'method_name', 'view_url',
    'edit_url',
]

In [8]:
# Keep only the columns that are not listed in cols_to_drop
df_1 = df_0.drop(columns=cols_to_drop)
# df_1.head()

### Season Dates

In [9]:
# Which calendar years appear in the data?
df_1['year'].unique()

array([2018, 2017])

In [11]:
# Isolate the rows recorded in 2017 so they can be inspected separately
is_2017 = df_1['year'] == 2017
year_2017 = df_1.loc[is_2017]
year_2017.shape

(368, 10)

In [12]:
# year_2017.head()

# Report the number of distinct values per column; when a column has fewer
# than 5 distinct values, print the values themselves as well.
for col in year_2017.columns:
    n_unique = year_2017[col].nunique()
    print(f'Number of unique values for {col}: {n_unique}')

    if n_unique >= 5:
        continue
    print(f'Unique values for {col}: {year_2017[col].unique()}')

Number of unique values for sitename: 185
Number of unique values for lat: 185
Number of unique values for lon: 185
Number of unique values for date: 1
Unique values for date: ['2017 Jul 5']
Number of unique values for raw_date: 1
Unique values for raw_date: ['2017-07-05 14:00:00 -0500']
Number of unique values for month: 1
Unique values for month: [7]
Number of unique values for year: 1
Unique values for year: [2017]
Number of unique values for trait: 1
Unique values for trait: ['surface_temperature']
Number of unique values for mean: 185
Number of unique values for cultivar: 121


### I. Slice the dataframe to only include 2018 dates
* The 368 rows dated `2017-07-05` (all `surface_temperature`) do not belong here — **TODO:** follow up on this

In [13]:
# Keep only the rows recorded in 2018
is_2018 = df_1['year'] == 2018
df_2 = df_1.loc[is_2018]
df_2.shape

(925195, 10)

In [None]:
# print(df_2.raw_date.nunique())
# print(df_2.date.nunique())

### I. Change AZ date values to ISO date format and strip `America/Phoenix` from string dates
* Open question: should the date(s) be used as the index?

In [14]:
# Number of trailing characters cut from date strings that mention Phoenix
# (presumably a suffix such as " (America/Phoenix)" — TODO confirm against the raw data)
tz_suffix_len = 18

# Trim the suffix from Phoenix-tagged strings; pass every other string through unchanged
new_dates = [
    raw[:-tz_suffix_len] if 'Phoenix' in raw else raw
    for raw in df_2.date.values
]

print(len(new_dates))

925195


In [15]:
# Parse the cleaned date strings into pandas datetimes.
# NOTE(review): no explicit `format=` is passed, so parsing relies on pandas'
# format inference — confirm every string in `new_dates` is in a form the
# installed pandas version can infer.
iso_format_dates = pd.to_datetime(new_dates)

In [16]:
# Attach the parsed dates as a new `date_1` column on a copy, leaving df_2 untouched
df_3 = df_2.assign(date_1=iso_format_dates)
# df_3.head()

In [17]:
# Check the column dtypes, in particular that date_1 was parsed as a datetime
df_3.dtypes

sitename            object
lat                float64
lon                float64
date                object
raw_date            object
month                int64
year                 int64
trait               object
mean               float64
cultivar            object
date_1      datetime64[ns]
dtype: object

#### Drop other date columns

In [18]:
# date_1 now carries the parsed date, so the remaining date-related columns are dropped
other_date_cols = ['date', 'raw_date', 'month', 'year']
df_4 = df_3.drop(columns=other_date_cols)
df_4.shape

(925195, 7)

In [19]:
# Which traits are available in the 2018 data?
df_4['trait'].unique()

array(['leaf_width', 'surface_temperature', 'canopy_cover', 'leaf_length',
       'canopy_height', 'panicle_count', 'panicle_volume',
       'panicle_surface_area', 'aboveground_dry_biomass',
       'aboveground_fresh_biomass', 'stalk_diameter_fixed_height',
       'aboveground_biomass_moisture', 'leaf_angle_mean',
       'leaf_angle_alpha', 'leaf_angle_beta', 'leaf_angle_chi'],
      dtype=object)

### II. Subset traits
Needed now:
* `aboveground_dry_biomass`
* `canopy_height`

In [20]:
# Keep only the rows for the two traits needed now
is_dry_biomass = df_4['trait'] == 'aboveground_dry_biomass'
is_canopy_height = df_4['trait'] == 'canopy_height'
df_5 = df_4.loc[is_dry_biomass | is_canopy_height]
df_5.shape

(48663, 7)

In [21]:
# Number of distinct dates remaining after the trait subset
df_5['date_1'].nunique()

83

In [None]:
# df_5.loc[df_5.trait == 'aboveground_dry_biomass'].date_1.nunique()

In [None]:
# df_5.loc[df_5.trait == 'canopy_height'].date_1.nunique()

### III. Extract Range and Column Values

In [22]:
# Work on a copy so df_5 is left unchanged
df_6 = df_5.copy()

# Pull the integers that follow "Range " and "Column " out of each sitename.
# Raw strings keep `\d` from being read as an (invalid) string escape sequence,
# and expand=False returns a Series rather than a one-column DataFrame.
# astype(int) raises if any sitename does not match, so unmatched names
# cannot slip through unnoticed.
df_6['range'] = df_6['sitename'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_6['column'] = df_6['sitename'].str.extract(r"Column (\d+)", expand=False).astype(int)

# df_6.sample(n=7)

#### Check for E W subplots

In [23]:
# Show any rows whose sitename ends in " E" or " W"
ends_with_e = df_6['sitename'].str.endswith(' E')
ends_with_w = df_6['sitename'].str.endswith(' W')
df_6.loc[ends_with_e | ends_with_w]

Unnamed: 0,sitename,lat,lon,trait,mean,cultivar,date_1,range,column


### IV. Reorder & Rename Columns
* Set date column as index

In [24]:
# Rename the parsed date column to `date` and the measurement column to `value`
df_7 = df_6.rename(columns={'date_1': 'date', 'mean': 'value'})
# df_7.head()

In [25]:
# Move the date column into the index, then print the shape before and after
df_8 = df_7.set_index('date')
for frame in (df_7, df_8):
    print(frame.shape)

(48663, 9)
(48663, 8)


In [26]:
# Desired column order for the final table
col_reorder = ['sitename', 'range', 'column', 'lat', 'lon', 'cultivar', 'trait', 'value']

# Select the columns by label in the new order. Unlike rebuilding the frame
# through the DataFrame constructor, this raises a KeyError on a misspelled or
# missing label instead of silently creating an all-NaN column.
df_9 = df_8[col_reorder]
# df_9.head()

In [27]:
# Sort the rows by the date index (ascending is the sort_index default)
df_10 = df_9.sort_index()

In [1]:
# df_10.tail()

#### Final Steps
* Create `.csv`

In [None]:
# Flip to True to write the final table to disk; left False so that a plain
# "Run All" does not create a new file on every run.
need_to_create_csv = False

# Timestamp with second resolution; colons are removed so the name is valid on
# file systems that do not allow ':' in file names.
timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(':', '')

# An underscore separates the season number from the timestamp; without it the
# name reads as e.g. "season_62018-...".
output_filename = f'tall_format_traits_season_6_{timestamp}.csv'

if need_to_create_csv:
    df_10.to_csv(f'data/processed/{output_filename}')