## Canopy Heights End of Season 4
* wide format

### Season Dates
* Planting: 2017-04-20
* Last Day of Harvest: 2017-09-16

In [1]:
import datetime
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy

In [4]:
# Show the current directory, then switch to the project root so that the
# relative paths used later ('data/raw/...', 'data/processed/...', the sqlite
# file) resolve against it.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run on another machine without editing it.
%pwd
%cd '/Users/ejcain/UA-AG/phenotypes/terraref-datasets/'

/Users/ejcain/UA-AG/phenotypes/terraref-datasets


In [5]:
# Load the raw Season 4 export. low_memory=False makes pandas read the file in
# one pass instead of chunk-wise, so column dtypes are inferred from all rows.
raw_csv_path = 'data/raw/mac_season_4.csv'
df_0 = pd.read_csv(raw_csv_path, low_memory=False)
# df_0.head()

In [8]:
# (rows, columns) of the raw frame, covering every trait in the export.
df_0.shape

(372363, 39)

### I. Connect to sqlite database

In [23]:
# Open the sqlite database file (path is relative to the current working
# directory). `conn` is used later by DataFrame.to_sql and pd.read_sql_query.
# NOTE(review): the connection is never closed in the visible cells.
conn = sqlite3.connect('end_of_season_canopy_heights.sqlite')
# NOTE(review): `cursor` is not referenced anywhere else in the visible notebook.
cursor = conn.cursor()
print("Opened database successfully")

Opened database successfully


In [7]:
# Keep only the rows whose trait is exactly 'canopy_height'.
is_canopy_height = df_0['trait'] == 'canopy_height'
df_1 = df_0.loc[is_canopy_height]
df_1.shape

(58056, 39)

In [9]:
# List every column on the canopy-height subset.
df_1.columns

Index(['Unnamed: 0', 'checked', 'result_type', 'id', 'citation_id', 'site_id',
       'treatment_id', 'sitename', 'city', 'lat', 'lon', 'scientificname',
       'commonname', 'genus', 'species_id', 'cultivar_id', 'author',
       'citation_year', 'treatment', 'date', 'time', 'raw_date', 'month',
       'year', 'dateloc', 'trait', 'trait_description', 'mean', 'units', 'n',
       'statname', 'stat', 'notes', 'access_level', 'cultivar', 'entity',
       'method_name', 'view_url', 'edit_url'],
      dtype='object')

In [10]:
# Columns to remove from the canopy-height subset. Of the 39 input columns,
# everything except sitename, lat, lon, date, mean and cultivar is listed here.
cols_to_drop = [
    'Unnamed: 0',
    'checked',
    'result_type',
    'id',
    'citation_id',
    'site_id',
    'treatment_id',
    'city',
    'scientificname',
    'commonname',
    'genus',
    'species_id',
    'cultivar_id',
    'author',
    'citation_year',
    'treatment',
    'time',
    'raw_date',
    'month',
    'year',
    'dateloc',
    'trait',
    'trait_description',
    'units',
    'n',
    'statname',
    'stat',
    'notes',
    'access_level',
    'entity',
    'method_name',
    'view_url',
    'edit_url',
]

In [11]:
# Drop the unneeded columns; returns a new frame and leaves df_1 untouched.
df_2 = df_1.drop(columns=cols_to_drop)
print(df_2.shape)
df_2.head()

(58056, 6)


Unnamed: 0,sitename,lat,lon,date,mean,cultivar
51339,MAC Field Scanner Season 4 Range 30 Column 9 E,33.07559,-111.974913,2017 Jun 19 (America/Phoenix),91.0,PI329518
51340,MAC Field Scanner Season 4 Range 20 Column 15 W,33.075231,-111.974823,2017 Jun 4 (America/Phoenix),99.0,PI152828
51341,MAC Field Scanner Season 4 Range 20 Column 15 W,33.075231,-111.974823,2017 May 29 (America/Phoenix),63.0,PI152828
75000,MAC Field Scanner Season 4 Range 35 Column 14,33.07577,-111.974835,2017 May 20,22.0,PI619838
75001,MAC Field Scanner Season 4 Range 36 Column 2,33.075806,-111.975032,2017 May 20,26.0,PI155885


### II. Strip subplot designations

In [12]:
# Remove the trailing ' E' / ' W' subplot designation from each sitename so the
# two halves of a plot share one name. Names without such a suffix are kept as-is.
sitename_values = df_2.sitename.values

# str.endswith accepts a tuple of suffixes, which replaces the earlier bitwise
# `|` between two boolean results; both suffixes are two characters long.
subplot_suffixes = (' W', ' E')

no_e_w_names = [
    name[:-2] if name.endswith(subplot_suffixes) else name
    for name in sitename_values
]

In [13]:
# Attach the suffix-free names as a new column on a fresh frame (df_2 is not
# modified). The list is assigned positionally, one value per row.
df_3 = df_2.assign(sitename_1=no_e_w_names)
# df_3.head()

### III. Extract Range and Column Values

In [14]:
# Pull the numeric Range and Column identifiers out of the cleaned sitename.
df_4 = df_3.copy()

# Raw strings keep `\d` as a regex token; in a plain string literal it is an
# invalid escape sequence that newer Python versions warn about.
# expand=False returns a Series for the single capture group instead of a
# one-column DataFrame, which is what a column assignment expects.
df_4['range'] = df_4['sitename_1'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_4['column'] = df_4['sitename_1'].str.extract(r"Column (\d+)", expand=False).astype(int)

# Random spot-check of the extracted values (no seed is set, so the rows differ per run).
df_4.sample(n=7)

Unnamed: 0,sitename,lat,lon,date,mean,cultivar,sitename_1,range,column
103576,MAC Field Scanner Season 4 Range 25 Column 8,33.07541,-111.974933,2017 Jun 30,212.0,PI329338,MAC Field Scanner Season 4 Range 25 Column 8,25,8
190516,MAC Field Scanner Season 4 Range 25 Column 9,33.07541,-111.974917,2017 Aug 26,366.0,PI455217,MAC Field Scanner Season 4 Range 25 Column 9,25,9
353829,MAC Field Scanner Season 4 Range 21 Column 9,33.075266,-111.974917,2017 Aug 16,371.0,PI569453,MAC Field Scanner Season 4 Range 21 Column 9,21,9
75730,MAC Field Scanner Season 4 Range 48 Column 12,33.076237,-111.974868,2017 Jun 2,74.0,PI563009,MAC Field Scanner Season 4 Range 48 Column 12,48,12
268935,MAC Field Scanner Season 4 Range 28 Column 9,33.075518,-111.974917,2017 Aug 30,359.0,PI329646,MAC Field Scanner Season 4 Range 28 Column 9,28,9
78773,MAC Field Scanner Season 4 Range 46 Column 12,33.076165,-111.974868,2017 Jun 16,126.0,PI535793,MAC Field Scanner Season 4 Range 46 Column 12,46,12
352229,MAC Field Scanner Season 4 Range 52 Column 14,33.076381,-111.974836,2017 Jul 20,272.0,PI329618,MAC Field Scanner Season 4 Range 52 Column 14,52,14


### IV. Convert string date values to ISO datetime format

In [15]:
# Some date strings carry a trailing timezone label, e.g.
# '2017 Jun 19 (America/Phoenix)'. Strip it so every value has the same
# 'YYYY Mon D' layout before parsing.
tz_suffix = ' (America/Phoenix)'

# The slice length comes from the suffix itself rather than a hard-coded 18,
# and only an exact trailing match is removed; any other value passes through
# unchanged.
new_dates = [
    d[:-len(tz_suffix)] if d.endswith(tz_suffix) else d
    for d in df_4.date.values
]

# Sanity check: one cleaned date per input row.
print(df_4.shape[0])
print(len(new_dates))

58056
58056


In [16]:
# Parse the cleaned date strings (e.g. '2017 Jun 19') into pandas datetimes.
# No explicit format is passed, so pandas infers it from the values.
iso_format_dates = pd.to_datetime(new_dates)

In [17]:
# Add the parsed dates as `date_1` on a new frame; the original string `date`
# column is still present at this stage.
df_5 = df_4.assign(date_1=iso_format_dates)
df_5.head()

Unnamed: 0,sitename,lat,lon,date,mean,cultivar,sitename_1,range,column,date_1
51339,MAC Field Scanner Season 4 Range 30 Column 9 E,33.07559,-111.974913,2017 Jun 19 (America/Phoenix),91.0,PI329518,MAC Field Scanner Season 4 Range 30 Column 9,30,9,2017-06-19
51340,MAC Field Scanner Season 4 Range 20 Column 15 W,33.075231,-111.974823,2017 Jun 4 (America/Phoenix),99.0,PI152828,MAC Field Scanner Season 4 Range 20 Column 15,20,15,2017-06-04
51341,MAC Field Scanner Season 4 Range 20 Column 15 W,33.075231,-111.974823,2017 May 29 (America/Phoenix),63.0,PI152828,MAC Field Scanner Season 4 Range 20 Column 15,20,15,2017-05-29
75000,MAC Field Scanner Season 4 Range 35 Column 14,33.07577,-111.974835,2017 May 20,22.0,PI619838,MAC Field Scanner Season 4 Range 35 Column 14,35,14,2017-05-20
75001,MAC Field Scanner Season 4 Range 36 Column 2,33.075806,-111.975032,2017 May 20,26.0,PI155885,MAC Field Scanner Season 4 Range 36 Column 2,36,2,2017-05-20


### V. Drop, rename, & reorder columns

In [18]:
# Select and order the columns to keep: the cleaned sitename and parsed date
# replace their raw counterparts, which are left out here.
new_col_order = ['sitename_1', 'range', 'column', 'lat', 'lon', 'date_1', 'cultivar', 'mean']
df_6 = df_5.reindex(columns=new_col_order)
df_6.head()

Unnamed: 0,sitename_1,range,column,lat,lon,date_1,cultivar,mean
51339,MAC Field Scanner Season 4 Range 30 Column 9,30,9,33.07559,-111.974913,2017-06-19,PI329518,91.0
51340,MAC Field Scanner Season 4 Range 20 Column 15,20,15,33.075231,-111.974823,2017-06-04,PI152828,99.0
51341,MAC Field Scanner Season 4 Range 20 Column 15,20,15,33.075231,-111.974823,2017-05-29,PI152828,63.0
75000,MAC Field Scanner Season 4 Range 35 Column 14,35,14,33.07577,-111.974835,2017-05-20,PI619838,22.0
75001,MAC Field Scanner Season 4 Range 36 Column 2,36,2,33.075806,-111.975032,2017-05-20,PI155885,26.0


In [20]:
# Give the cleaned columns their final names.
final_column_names = {
    'sitename_1': 'sitename',
    'date_1': 'date',
    'mean': 'canopy_height',
}
df_7 = df_6.rename(columns=final_column_names)
df_7.head()

Unnamed: 0,sitename,range,column,lat,lon,date,cultivar,canopy_height
51339,MAC Field Scanner Season 4 Range 30 Column 9,30,9,33.07559,-111.974913,2017-06-19,PI329518,91.0
51340,MAC Field Scanner Season 4 Range 20 Column 15,20,15,33.075231,-111.974823,2017-06-04,PI152828,99.0
51341,MAC Field Scanner Season 4 Range 20 Column 15,20,15,33.075231,-111.974823,2017-05-29,PI152828,63.0
75000,MAC Field Scanner Season 4 Range 35 Column 14,35,14,33.07577,-111.974835,2017-05-20,PI619838,22.0
75001,MAC Field Scanner Season 4 Range 36 Column 2,36,2,33.075806,-111.975032,2017-05-20,PI155885,26.0


### VI. Average canopy heights per sitename when measured more than once on the same date

In [24]:
# Write the cleaned frame to sqlite so it can be aggregated with SQL.
# The table name matches the database file name; it is kept as-is because the
# aggregate query reads from a table with exactly this name.
# if_exists='replace' makes this cell re-runnable: with the default ('fail'),
# pandas raises ValueError once the table exists from an earlier run.
df_7.to_sql('end_of_season_canopy_heights.sqlite', conn, if_exists='replace')

In [38]:
# One row per (sitename, date) with the mean of all canopy heights recorded for
# that site on that date, newest dates first.
#
# Identifiers are double-quoted: the table name contains a dot, and `range` /
# `column` are SQL keywords. Single quotes denote string literals and are only
# accepted as identifiers by SQLite as a compatibility fallback.
#
# NOTE(review): `canopy_height` (and the other non-grouped columns) are bare
# columns in an aggregate query, so SQLite takes their values from an arbitrary
# row of each group. Use `avg_canopy_height` as the per-date value.
df_8 = pd.read_sql_query("""
                            select sitename, "range", "column", lat, lon, date, cultivar, 
                            canopy_height, avg(canopy_height) as avg_canopy_height 
                            from "end_of_season_canopy_heights.sqlite"
                            group by sitename, date
                            order by date DESC;
                            """, conn)

print(df_8.shape)
df_8.head()

(32995, 9)


Unnamed: 0,sitename,range,column,lat,lon,date,cultivar,canopy_height,avg_canopy_height
0,MAC Field Scanner Season 4 Range 10 Column 10,10,10,33.074871,-111.9749,2017-08-30 00:00:00,PI152816,340.0,340.0
1,MAC Field Scanner Season 4 Range 10 Column 12,10,12,33.074871,-111.974868,2017-08-30 00:00:00,PI329501,369.0,369.0
2,MAC Field Scanner Season 4 Range 10 Column 15,10,15,33.074871,-111.974818,2017-08-30 00:00:00,PI144134,331.0,331.0
3,MAC Field Scanner Season 4 Range 10 Column 2,10,2,33.074871,-111.975031,2017-08-30 00:00:00,PI641807,307.0,307.0
4,MAC Field Scanner Season 4 Range 10 Column 5,10,5,33.074871,-111.974982,2017-08-30 00:00:00,PI641821,337.0,337.0


#### Last Steps

In [None]:
# Optional export of the aggregated frame to a timestamped CSV under
# data/processed/. Set the flag to True to write the file.
need_to_create_csv = False

if need_to_create_csv:

    # Second-resolution ISO timestamp; colons are stripped from the filename below.
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    output_filename = f'max_canopy_heights_{timestamp}.csv'.replace(':', '')
    # NOTE(review): this previously wrote `max_canopy_heights_4`, a name that is
    # never defined in this notebook (NameError as soon as the flag is enabled).
    # df_8, the per-site/per-date aggregate, is the last frame built here, so it is
    # exported instead. Confirm this is the intended frame; the filename prefix
    # is left unchanged.
    df_8.to_csv(f'data/processed/{output_filename}')

## For Future Tests

In [27]:
# range 20 column 13, range 30 column 9

# test_sites = df_7.loc[(df_7.sitename == 'MAC Field Scanner Season 4 Range 20 Column 13') | (df_7.sitename == 'MAC Field Scanner Season 4 Range 30 Column 9')]

In [35]:
# test_sites.sort_values(by='date')

In [36]:
# test_df.loc[(test_df.date == '2017-05-29 00:00:00') & (test_df.sitename == 'MAC Field Scanner Season 4 Range 20 Column 13')]

In [37]:
# May 4 should be 10

# test_df.loc[(test_df.date == '2017-05-04 00:00:00') & (test_df.sitename == 'MAC Field Scanner Season 4 Range 20 Column 13')]