## Canopy Heights Season 4
* end of season heights
    * one row per plot
    * sitename as index
    * keep latest date only
* time series for whole season 
    * one row per plot per date
    * date as index

### Season Dates
* Planting: 2017-04-20
* Last Day of Harvest: 2017-09-16

In [None]:
import datetime
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy

In [None]:
# Load the raw season 4 export. low_memory=False makes pandas parse the file
# in a single pass rather than in chunks, avoiding mixed-dtype inference.
df_0 = pd.read_csv('data/raw/mac_season_4.csv', low_memory=False)
# df_0.head()

In [None]:
# Row and column counts of the raw table.
df_0.shape

### I. Connect to SQLite database

In [None]:
# Open the SQLite file (sqlite3 creates it if it does not exist yet); the
# SQL steps later in this notebook run through this connection.
# NOTE(review): the connection is never closed in the visible cells.
conn = sqlite3.connect('end_of_season_canopy_heights.sqlite')
# NOTE(review): this cursor is not used by any visible cell.
cursor = conn.cursor()
print("Opened database successfully")

In [None]:
# Keep only the canopy-height observations from the raw table.
is_canopy_height = df_0['trait'] == 'canopy_height'
df_1 = df_0[is_canopy_height]
df_1.shape

In [None]:
# List the available columns before choosing which ones to drop.
df_1.columns

In [None]:
# Columns removed before any further processing (order is not significant
# for DataFrame.drop, but is kept identical to the original list).
cols_to_drop = [
    'Unnamed: 0',
    'checked',
    'result_type',
    'id',
    'citation_id',
    'site_id',
    'treatment_id',
    'city',
    'scientificname',
    'commonname',
    'genus',
    'species_id',
    'cultivar_id',
    'author',
    'citation_year',
    'treatment',
    'time',
    'raw_date',
    'month',
    'year',
    'dateloc',
    'trait',
    'trait_description',
    'units',
    'n',
    'statname',
    'stat',
    'notes',
    'access_level',
    'entity',
    'method_name',
    'view_url',
    'edit_url',
]

In [None]:
# Remove every column named in cols_to_drop; drop() returns a new frame.
df_2 = df_1.drop(columns=cols_to_drop)
print(df_2.shape)
# df_2.head()

### II. Strip subplot designations

In [None]:
# Strip the trailing ' W' / ' E' subplot designation so that both halves of a
# plot share one sitename; names without such a suffix pass through unchanged.
sitename_values = df_2.sitename.values

# str.endswith accepts a tuple, so one call covers both suffixes (the original
# combined two calls with the bitwise `|` operator instead of `or`).
subplot_suffixes = (' W', ' E')

no_e_w_names = [
    name[:-2] if name.endswith(subplot_suffixes) else name
    for name in sitename_values
]

In [None]:
# Attach the suffix-free plot names as a new column; assign() works on a copy,
# so df_2 itself is left untouched.
df_3 = df_2.assign(sitename_1=no_e_w_names)
# df_3.head()

### III. Extract Range and Column Values

In [None]:
df_4 = df_3.copy()

# Pull the numeric range / column identifiers out of the plot name.
# * Raw strings keep `\d` as a regex token; in a normal string literal it is an
#   invalid escape sequence that newer Python versions warn about.
# * expand=False returns a Series for the single capture group instead of a
#   one-column DataFrame, which is the natural type for a column assignment.
# astype(int) still raises if any sitename lacks the expected pattern (NaN),
# exactly as before.
df_4['range'] = df_4['sitename_1'].str.extract(r"Range (\d+)", expand=False).astype(int)
df_4['column'] = df_4['sitename_1'].str.extract(r"Column (\d+)", expand=False).astype(int)

# df_4.sample(n=7)

### IV. Convert string date values to ISO 8601 datetimes

In [None]:
# Date strings that mention 'Phoenix' lose their final 18 characters; all other
# strings are kept verbatim.
# NOTE(review): presumably the 18 characters are a trailing time-zone label such
# as ' (America/Phoenix)' - confirm against the raw data.
new_dates = [d[:-18] if 'Phoenix' in d else d for d in df_4.date.values]

# Sanity check: both counts must match, one cleaned date per row.
print(df_4.shape[0])
print(len(new_dates))

In [None]:
# Parse the cleaned date strings into pandas datetimes (a DatetimeIndex for
# list input).
iso_format_dates = pd.to_datetime(new_dates)

In [None]:
# Add the parsed datetimes as `date_1` on a copy of df_4 (values are placed
# positionally, one per row).
df_5 = df_4.assign(date_1=iso_format_dates)
# df_5.head()

### V. Drop, rename, & reorder columns

In [None]:
# Columns to keep, in their final order; anything not listed is discarded.
new_col_order = [
    'sitename_1',
    'range',
    'column',
    'lat',
    'lon',
    'date_1',
    'cultivar',
    'mean',
]
# reindex() selects and orders the columns while leaving the row index as is.
df_6 = df_5.reindex(columns=new_col_order)
# df_6.head()

In [None]:
# Give the working columns their final names.
final_names = {
    'sitename_1': 'sitename',
    'date_1': 'date',
    'mean': 'canopy_height',
}
df_7 = df_6.rename(columns=final_names)
# df_7.head()

### VI. Average the canopy heights per sitename when a plot was measured more than once on the same date

In [None]:
# Stage the cleaned frame as a SQLite table so the grouping below can be done
# in SQL. The table name (which happens to equal the database file name) is
# referenced verbatim by the later queries, so it must not change.
# if_exists='replace' lets the notebook be re-run: the database file persists
# on disk, and the default ('fail') raises ValueError once the table exists.
df_7.to_sql('end_of_season_canopy_heights.sqlite', conn, if_exists='replace')

In [None]:
# Collapse repeated measurements: one row per (sitename, date) carrying the
# mean canopy height of that group, newest dates first.
# NOTE(review): range, column, lat, lon, cultivar and canopy_height are neither
# grouped nor aggregated ("bare columns"); SQLite fills them from an arbitrary
# row of each group, so canopy_height here is NOT the group mean. This assumes
# the remaining columns are constant per sitename - confirm.
df_8 = pd.read_sql_query("""
                            SELECT sitename, range, column, lat, lon, date, cultivar, 
                            canopy_height, avg(canopy_height) AS avg_canopy_height 
                            FROM 'end_of_season_canopy_heights.sqlite'
                            GROUP BY sitename, date
                            ORDER BY date DESC;
                            """, conn)

print(df_8.shape)
# df_8.head()

### VII. Time Series for Canopy Heights throughout Season

In [None]:
# Use the measurement date as the index of the time-series table.
season_heights_0 = df_8.set_index('date')

# Shapes before and after: moving `date` into the index removes one column.
for frame in (df_8, season_heights_0):
    print(frame.shape)
# season_heights_0.head()

In [None]:
# Order the rows by their date index, ascending (the SQL query returned them
# newest first).
season_heights_1 = season_heights_0.sort_index()
print(season_heights_1.shape)
# season_heights_1.head()

In [None]:
# check differences between canopy_height and avg_canopy_height columns

# season_heights_1.loc[season_heights_1.canopy_height != season_heights_1.avg_canopy_height]

In [None]:
# Discard the un-averaged height column; only the per-group mean is kept.
season_heights_2 = season_heights_1.drop(columns='canopy_height')
# season_heights_2.tail()

In [None]:
# Re-create `canopy_height` from the averaged values, rounded to 2 decimals,
# on a copy of the previous frame.
rounded_heights = season_heights_2.avg_canopy_height.round(2)
season_heights_3 = season_heights_2.assign(canopy_height=rounded_heights)
# season_heights_3.sample(n=15)

In [None]:
# The rounded `canopy_height` column replaces the raw average.
season_heights_4 = season_heights_3.drop(columns='avg_canopy_height')
# season_heights_4.tail()

### VIII. End of Season Heights
* Canopy height on date closest to harvest

In [None]:
# Per-(sitename, date) mean canopy height, newest dates first. The descending
# date order matters: a later cell keeps only the first row per plot.
# NOTE(review): range, column, lat, lon and cultivar are bare (non-aggregated)
# columns; SQLite takes them from an arbitrary row of each group - this assumes
# they are constant per sitename; confirm.
end_of_season_0 = pd.read_sql_query("""
                            SELECT sitename, range, column, lat, lon, date, cultivar,
                            avg(canopy_height) AS avg_canopy_height 
                            FROM 'end_of_season_canopy_heights.sqlite'
                            GROUP BY sitename, date
                            ORDER BY date DESC
                            """, conn)

end_of_season_0.shape

In [None]:
# Number of distinct sitenames in the grouped result.
end_of_season_0.sitename.nunique()

In [None]:
# end_of_season_0.head()

In [None]:
# Rows arrive newest-first, so keeping the first occurrence of each
# (sitename, cultivar) pair retains that pair's latest measurement date.
plot_key = ['sitename', 'cultivar']
end_of_season_1 = end_of_season_0.drop_duplicates(subset=plot_key, keep='first')
end_of_season_1.shape

In [None]:
# end_of_season_1.tail()

In [None]:
# Spot-check the full time series of a single plot.
plot_name = 'MAC Field Scanner Season 4 Range 20 Column 5'
season_heights_4[season_heights_4['sitename'] == plot_name]

In [None]:
# Summarise the date range and height range of the end-of-season table; the
# empty string produces the blank separator line.
print(
    f'Latest date: {end_of_season_1.date.max()}',
    f'Earliest date: {end_of_season_1.date.min()}',
    '',
    f'Tallest height: {end_of_season_1.avg_canopy_height.max()}',
    f'Shortest height: {end_of_season_1.avg_canopy_height.min()}',
    sep='\n',
)

In [None]:
# Index the end-of-season table by sitename.
end_of_season_2 = end_of_season_1.set_index('sitename')
# end_of_season_2.head()

In [None]:
# Publish the averaged height under its final name, rounded to 2 decimals.
end_of_season_3 = end_of_season_2.rename(columns={'avg_canopy_height': 'canopy_height'})
end_of_season_3['canopy_height'] = end_of_season_3['canopy_height'].round(2)
# end_of_season_3.tail()

#### Last Steps
* Change the output filename and the DataFrame being saved as needed before generating the `.csv`

In [None]:
# Safety switch: set to True to write the end-of-season table to disk.
need_to_create_csv = False

if need_to_create_csv:
    # Second-resolution ISO timestamp; the colons are stripped from the final
    # file name.
    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
    raw_name = f'canopy_heights_end_of_season_4_{timestamp}.csv'
    output_filename = raw_name.replace(':', '')
    destination = f'data/processed/{output_filename}'
    end_of_season_3.to_csv(destination)