### Canopy Heights
Two methods / authors for data collection

In [109]:
import datetime
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy

In [2]:
# Load the raw MAC season 4 trait export. low_memory=False turns off chunked
# parsing, which can otherwise lead to mixed-dtype inference within a column.
df_0 = pd.read_csv('../data/raw/mac_season_4.csv', low_memory=False)
# df_0.head()

### A. Create in-memory sqlite database

In [46]:
# A 'sqlite://' URL with no file path gives an in-memory SQLite database;
# echo=False keeps SQLAlchemy from logging every statement.
engine = sqlalchemy.create_engine('sqlite://', echo=False)

In [51]:
# Load the raw frame into the in-memory database as the `traits` table.
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# executing it a second time raises ValueError because the table already exists.
df_0.to_sql('traits', con=engine, if_exists='replace')
# Open a result over the full table; it is not consumed in this cell.
traits_db = engine.execute("SELECT * FROM traits")

In [42]:
# Pandas-side reference count of canopy_height rows, to compare against the SQL query below.
print(f'Rows in df with canopy height values: {df_0.loc[df_0.trait == "canopy_height"].shape[0]}')

Rows in df with canopy height values: 58056


#### Find all rows with canopy height values

In [57]:
# Same filter through SQL: every column of every canopy_height row.
canopy_height_df = pd.read_sql_query("SELECT * FROM traits WHERE trait = 'canopy_height';", engine)
# Row count should equal the pandas count printed above.
canopy_height_df.shape

(58056, 40)

In [60]:
# Inspect the available columns; the leading 'index' column is the DataFrame
# index that to_sql writes by default.
canopy_height_df.columns

Index(['index', 'Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id',
       'site_id', 'treatment_id', 'sitename', 'city', 'lat', 'lon',
       'scientificname', 'commonname', 'genus', 'species_id', 'cultivar_id',
       'author', 'citation_year', 'treatment', 'date', 'time', 'raw_date',
       'month', 'year', 'dateloc', 'trait', 'trait_description', 'mean',
       'units', 'n', 'statname', 'stat', 'notes', 'access_level', 'cultivar',
       'entity', 'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [66]:
# Keep only the columns needed downstream (identifier, plot, date, cultivar
# and the measured value) for canopy_height rows.
dropped_cols = pd.read_sql_query("""
                                SELECT id, sitename, date, cultivar, cultivar_id, mean
                                FROM traits
                                WHERE trait = 'canopy_height';
                                """, engine)

# dropped_cols.head()

Unnamed: 0,id,sitename,date,cultivar,cultivar_id,mean
0,6001943979,MAC Field Scanner Season 4 Range 30 Column 9 E,2017 Jun 19 (America/Phoenix),PI329518,6000000788,91.0
1,6001942254,MAC Field Scanner Season 4 Range 20 Column 15 W,2017 Jun 4 (America/Phoenix),PI152828,6000000687,99.0
2,6001940853,MAC Field Scanner Season 4 Range 20 Column 15 W,2017 May 29 (America/Phoenix),PI152828,6000000687,63.0
3,6004768603,MAC Field Scanner Season 4 Range 35 Column 14,2017 May 20,PI619838,6000000989,22.0
4,6004768605,MAC Field Scanner Season 4 Range 36 Column 2,2017 May 20,PI155885,6000000707,26.0


In [67]:
# Row count must be unchanged by the column selection; only the width shrinks.
dropped_cols.shape

(58056, 6)

### B. Take average value for E and W subplots when measured on the same date

Check number of unique sitenames

In [84]:
# Number of distinct sitenames among canopy_height rows (SQL side).
engine.execute("""
                SELECT COUNT(DISTINCT sitename)
                FROM traits
                WHERE 
                trait = 'canopy_height'
                """).fetchall()

[(2065,)]

In [85]:
# Same distinct-sitename count computed in pandas as a cross-check of the SQL result.
df_0.loc[df_0.trait == 'canopy_height'].sitename.nunique()

2065

Check the number of rows recorded for E / W subplots

In [87]:
# Count canopy_height rows whose sitename ends in ' E' or ' W'.
# In LIKE patterns '%' matches any run of characters, so '% E' means
# "anything followed by a space and E". COUNT(sitename) counts rows,
# not distinct subplots.
engine.execute("""
                SELECT COUNT(sitename)
                FROM traits
                WHERE
                (trait = 'canopy_height')
                AND
                ((sitename LIKE '% E')
                OR
                (sitename LIKE '% W'))
                """).fetchall()

[(4353,)]

In [94]:
# Restrict the raw frame to canopy_height observations.
is_canopy_height = df_0['trait'] == 'canopy_height'
df_1 = df_0.loc[is_canopy_height]
df_1.shape

(58056, 39)

In [99]:
# Rows measured on an east (' E') or west (' W') subplot, judged by the sitename suffix.
ends_with_e = df_1['sitename'].str.endswith(' E')
ends_with_w = df_1['sitename'].str.endswith(' W')
e_w_subplots = df_1.loc[ends_with_e | ends_with_w]
e_w_subplots.shape

(4353, 39)

In [100]:
# Distinct E/W subplot sitenames (the shape above counts rows, not subplots).
e_w_subplots.sitename.nunique()

1347

#### Strip subplot designations

In [102]:
# Baseline row count before the subplot suffixes are stripped.
df_1.shape

(58056, 39)

In [103]:
sitename_values = df_1.sitename.values

# Drop the two-character subplot suffix (' E' or ' W') where present so both
# halves of a plot share one name; every other sitename passes through as-is.
no_e_w_names = [
    site[:-2] if site.endswith((' W', ' E')) else site
    for site in sitename_values
]

In [105]:
# assign() returns a new frame, so df_1 is left untouched; the stripped plot
# names are attached as `new_sitenames` in row order.
df_2 = df_1.assign(new_sitenames=no_e_w_names)
# df_2.head()

Unnamed: 0.1,Unnamed: 0,checked,result_type,id,citation_id,site_id,treatment_id,sitename,city,lat,...,statname,stat,notes,access_level,cultivar,entity,method_name,view_url,edit_url,new_sitenames
51339,51340,0,traits,6001943979,6000000000.0,6000007888,6000000000.0,MAC Field Scanner Season 4 Range 30 Column 9 E,Maricopa,33.07559,...,,,,4,PI329518,,Manual canopy height,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...,MAC Field Scanner Season 4 Range 30 Column 9
51340,51341,0,traits,6001942254,6000000000.0,6000007579,6000000000.0,MAC Field Scanner Season 4 Range 20 Column 15 W,Maricopa,33.075231,...,,,,4,PI152828,,Manual canopy height,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...,MAC Field Scanner Season 4 Range 20 Column 15
51341,51342,0,traits,6001940853,6000000000.0,6000007579,6000000000.0,MAC Field Scanner Season 4 Range 20 Column 15 W,Maricopa,33.075231,...,,,,4,PI152828,,Manual canopy height,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...,MAC Field Scanner Season 4 Range 20 Column 15
75000,75001,0,traits,6004768603,6000000000.0,6000005801,,MAC Field Scanner Season 4 Range 35 Column 14,Maricopa,33.07577,...,,,,2,PI619838,,3D scanner to 98th quantile height,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...,MAC Field Scanner Season 4 Range 35 Column 14
75001,75002,0,traits,6004768605,6000000000.0,6000005295,,MAC Field Scanner Season 4 Range 36 Column 2,Maricopa,33.075806,...,,,,2,PI155885,,3D scanner to 98th quantile height,https://terraref.ncsa.illinois.edu/bety/traits...,https://terraref.ncsa.illinois.edu/bety/traits...,MAC Field Scanner Season 4 Range 36 Column 2


In [111]:
# Create the table the queries below depend on. The call used to be commented
# out, so on a fresh kernel the table did not exist and the next statement
# failed. if_exists='replace' makes the cell safe to re-run.
df_2.to_sql('canopy_heights_no_subplots', con=engine, if_exists='replace')
# NOTE(review): this counts the original `sitename` column, not the stripped
# `new_sitenames` column, so the result matches the pre-stripping count.
engine.execute("SELECT COUNT(DISTINCT sitename) FROM canopy_heights_no_subplots;").fetchall()

[(2065,)]

In [112]:
# Pandas cross-check of the SQL distinct count on the original sitename column.
df_2.sitename.nunique()

2065

In [119]:
# One row per stripped plot name, carrying the largest `mean` value recorded
# for that plot. This relies on SQLite-specific behaviour: when a query uses a
# single MAX() aggregate, the bare columns (date, cultivar, cultivar_id) are
# taken from the row that holds the maximum.
# NOTE(review): the section heading talks about averaging E and W values per
# date, but this query takes a per-plot maximum; confirm which is intended.
max_canopy_heights = pd.read_sql_query("""SELECT new_sitenames as sitename, date, cultivar, cultivar_id, 
                                        MAX(mean) as max_canopy_height 
                                        FROM canopy_heights_no_subplots GROUP BY new_sitenames;
                                        """, engine)

# max_canopy_heights.head()

Unnamed: 0,sitename,date,cultivar,cultivar_id,max_canopy_height
0,MAC Field Scanner Season 4 Range 10 Column 10,2017 Aug 25,PI152816,6000000686,350.0
1,MAC Field Scanner Season 4 Range 10 Column 11,2017 Jul 5,PI195754,6000000731,295.0
2,MAC Field Scanner Season 4 Range 10 Column 12,2017 Aug 26,PI329501,6000000575,373.0
3,MAC Field Scanner Season 4 Range 10 Column 13,2017 Jul 5,PI641860,6000001004,286.0
4,MAC Field Scanner Season 4 Range 10 Column 14,2017 Jul 5,PI19770,6000000737,288.0


In [120]:
# One row per plot after grouping on the stripped sitename.
max_canopy_heights.shape

(728, 5)

In [123]:
# `date` comes back from SQL as a plain string (object) and still needs parsing.
max_canopy_heights.dtypes

sitename              object
date                  object
cultivar              object
cultivar_id            int64
max_canopy_height    float64
dtype: object

In [125]:
# Work on a copy so the raw query result stays available unchanged.
max_canopy_heights_1 = max_canopy_heights.copy()

In [126]:
# Raw date strings as a NumPy array, used for the suffix cleanup below.
string_dates = max_canopy_heights_1.date.values

In [142]:
# Spot check: ' (America/Phoenix)' is 18 characters, so slicing off the last
# 18 leaves just the date part.
string_dates[14][:-18]

'2017 Jul 5'

In [149]:
# Trim the 18-character ' (America/Phoenix)' suffix from dates that carry it;
# dates without the marker are kept verbatim.
new_dates = [
    raw_date[:-18] if 'Phoenix' in raw_date else raw_date
    for raw_date in string_dates
]

In [150]:
# Attach the cleaned date strings next to the original `date` column
# (adds a column to max_canopy_heights_1 in place).
max_canopy_heights_1['new_dates'] = new_dates
# max_canopy_heights_1.head()

Unnamed: 0,sitename,date,cultivar,cultivar_id,max_canopy_height,new_dates
0,MAC Field Scanner Season 4 Range 10 Column 10,2017 Aug 25,PI152816,6000000686,350.0,2017 Aug 25
1,MAC Field Scanner Season 4 Range 10 Column 11,2017 Jul 5,PI195754,6000000731,295.0,2017 Jul 5
2,MAC Field Scanner Season 4 Range 10 Column 12,2017 Aug 26,PI329501,6000000575,373.0,2017 Aug 26
3,MAC Field Scanner Season 4 Range 10 Column 13,2017 Jul 5,PI641860,6000001004,286.0,2017 Jul 5
4,MAC Field Scanner Season 4 Range 10 Column 14,2017 Jul 5,PI19770,6000000737,288.0,2017 Jul 5


In [151]:
# Parse the cleaned strings (e.g. '2017 Jul 5') into datetime values.
max_canopy_heights_1['date_1'] = pd.to_datetime(max_canopy_heights_1.new_dates)
# max_canopy_heights_1.tail()

Unnamed: 0,sitename,date,cultivar,cultivar_id,max_canopy_height,new_dates,date_1
723,MAC Field Scanner Season 4 Range 9 Column 5,2017 Aug 30,PI569452,6000000931,387.0,2017 Aug 30,2017-08-30
724,MAC Field Scanner Season 4 Range 9 Column 6,2017 Jul 5,PI329351,6000000564,289.0,2017 Jul 5,2017-07-05
725,MAC Field Scanner Season 4 Range 9 Column 7,2017 Aug 30,PI585961,6000000982,337.0,2017 Aug 30,2017-08-30
726,MAC Field Scanner Season 4 Range 9 Column 8,2017 Jul 5,PI563350,6000000906,249.0,2017 Jul 5,2017-07-05
727,MAC Field Scanner Season 4 Range 9 Column 9,2017 Jul 11 (America/Phoenix),PI643016,6000001010,290.0,2017 Jul 11,2017-07-11


In [161]:
# Columns to keep, in output order: the parsed `date_1` replaces the string
# `date` / `new_dates` columns.
new_col_order = ['sitename', 'date_1', 'cultivar', 'cultivar_id', 'max_canopy_height']

In [162]:
# Keep only the columns listed in new_col_order, in that order; rows and the
# index are carried over unchanged.
max_canopy_heights_2 = max_canopy_heights_1.loc[:, new_col_order]
# max_canopy_heights_2.head()

Unnamed: 0,sitename,date_1,cultivar,cultivar_id,max_canopy_height
0,MAC Field Scanner Season 4 Range 10 Column 10,2017-08-25,PI152816,6000000686,350.0
1,MAC Field Scanner Season 4 Range 10 Column 11,2017-07-05,PI195754,6000000731,295.0
2,MAC Field Scanner Season 4 Range 10 Column 12,2017-08-26,PI329501,6000000575,373.0
3,MAC Field Scanner Season 4 Range 10 Column 13,2017-07-05,PI641860,6000001004,286.0
4,MAC Field Scanner Season 4 Range 10 Column 14,2017-07-05,PI19770,6000000737,288.0


In [164]:
# Use the plot name as the index; set_index returns a new frame.
max_canopy_heights_3 = max_canopy_heights_2.set_index(keys='sitename')
# max_canopy_heights_3.tail()

Unnamed: 0_level_0,date_1,cultivar,cultivar_id,max_canopy_height
sitename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MAC Field Scanner Season 4 Range 9 Column 5,2017-08-30,PI569452,6000000931,387.0
MAC Field Scanner Season 4 Range 9 Column 6,2017-07-05,PI329351,6000000564,289.0
MAC Field Scanner Season 4 Range 9 Column 7,2017-08-30,PI585961,6000000982,337.0
MAC Field Scanner Season 4 Range 9 Column 8,2017-07-05,PI563350,6000000906,249.0
MAC Field Scanner Season 4 Range 9 Column 9,2017-07-11,PI643016,6000001010,290.0


In [166]:
# Give the parsed datetime column its final name, `date`.
max_canopy_heights_4 = max_canopy_heights_3.rename(columns={'date_1': 'date'})
# max_canopy_heights_4.head()

Unnamed: 0_level_0,date,cultivar,cultivar_id,max_canopy_height
sitename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MAC Field Scanner Season 4 Range 10 Column 10,2017-08-25,PI152816,6000000686,350.0
MAC Field Scanner Season 4 Range 10 Column 11,2017-07-05,PI195754,6000000731,295.0
MAC Field Scanner Season 4 Range 10 Column 12,2017-08-26,PI329501,6000000575,373.0
MAC Field Scanner Season 4 Range 10 Column 13,2017-07-05,PI641860,6000001004,286.0
MAC Field Scanner Season 4 Range 10 Column 14,2017-07-05,PI19770,6000000737,288.0


In [167]:
# Null count per column; all zeros means the final table is complete.
max_canopy_heights_4.isnull().sum()

date                 0
cultivar             0
cultivar_id          0
max_canopy_height    0
dtype: int64

#### Last Steps

In [170]:
# Set to True to write the final table to disk; the export is skipped while False.
need_to_create_csv = False

if need_to_create_csv:

    # Second-resolution ISO timestamp; colons are stripped from the filename below.
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = f'max_canopy_heights_{timestamp}.csv'.replace(':', '')
    # Export the cleaned frame (sitename index, parsed `date`) rather than the
    # raw query result, and use the same '../data/' root the input is read from.
    max_canopy_heights_4.to_csv(f'../data/processed/{output_filename}')

#### Extract column and range values for additional columns

In [38]:
def add_range_and_column(df):
    """Add integer ``range`` and ``column`` columns parsed from ``df['sitename']``.

    The numbers are taken from the ``Range <n>`` and ``Column <n>`` parts of
    each sitename. The frame is modified in place and also returned.

    Raises:
        ValueError: from ``astype(int)`` if any sitename lacks a match, because
            the missing value cannot be converted to an integer.
    """
    # Raw strings: "\d" inside a normal string literal is an invalid escape sequence.
    # expand=False returns a Series instead of a one-column DataFrame.
    df['range'] = df['sitename'].str.extract(r"Range (\d+)", expand=False).astype(int)
    df['column'] = df['sitename'].str.extract(r"Column (\d+)", expand=False).astype(int)
    return df


add_range_and_column(manual_df_3)
add_range_and_column(scanner_df_3)

### For Future Tests

In [16]:
# When a frame is split into subsets by value, the subset row counts must add
# up to the parent's row count; otherwise some rows were dropped or counted twice.
# NOTE(review): manual_df, scanner_df and df_3 are not defined in the visible
# part of this notebook; confirm where they come from.

print(manual_df.shape)
print(scanner_df.shape)
print(df_3.shape)

manual_df.shape[0] + scanner_df.shape[0] == df_3.shape[0]

(4872, 18)
(53184, 18)
(58056, 18)


True

In [None]:
# test for column and range extraction