In [7]:
##Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from IPython.display import clear_output
##Import bq_helper to pull forestry data from Kaggle
from bq_helper import BigQueryHelper #pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
import os
#Setup Google credentials (the data is stored on Google Cloud servers).
#The local service-account key (a .json file downloaded from Google Cloud
#Platform) is only used as a fallback: if GOOGLE_APPLICATION_CREDENTIALS is
#already set in the environment, that value is kept instead of being
#overwritten with a path that only exists in the author's working directory.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "ErdosTrees-f85df8c8cb95.json")

In [8]:
##Open a helper bound to the public USFS Forest Inventory and Analysis dataset
fia_project = "bigquery-public-data"
fia_dataset = "usfs_fia"
usfs = BigQueryHelper(fia_project, fia_dataset)
#To see which tables the dataset offers, run: usfs.list_tables()

In [9]:
#Count the trees that have a recorded height for every species, then keep the
#names of the 10 most common species in the data set.
#The count is aliased explicitly so the code does not depend on the column
#name BigQuery auto-generates for unnamed expressions.
query1 = """
    SELECT
        species_common_name,
        COUNT(species_common_name) AS tree_count
    FROM
        `bigquery-public-data.usfs_fia.plot_tree`
    WHERE
        total_height > 0
    GROUP BY
        species_common_name
    ;        """
df = usfs.query_to_pandas_safe(query1, max_gb_scanned=10)
#query_to_pandas_safe returns None (after printing a notice) when the estimated
#scan is larger than max_gb_scanned; fail with a clear message instead of an
#AttributeError on the next line.
if df is None:
    raise RuntimeError("Species count query returned no data (scan limit exceeded?)")
species = list(df.sort_values('tree_count', ascending=False).head(10).species_common_name)
print(species)

['loblolly pine', 'red maple', 'sweetgum', 'Douglas-fir', 'slash pine', 'white oak', 'sugar maple', 'quaking aspen', 'lodgepole pine', 'ponderosa pine']


In [52]:
#US states in alphabetical order, each paired with its USFS state id.
#Keeping name and id side by side makes the pairing explicit; the two flat
#lists used by the rest of the notebook are derived from these pairs.
_state_codes = [
    (1, "Alabama"), (2, "Alaska"), (4, "Arizona"), (5, "Arkansas"),
    (6, "California"), (8, "Colorado"), (9, "Connecticut"), (10, "Delaware"),
    (12, "Florida"), (13, "Georgia"), (15, "Hawaii"), (16, "Idaho"),
    (17, "Illinois"), (18, "Indiana"), (19, "Iowa"), (20, "Kansas"),
    (21, "Kentucky"), (22, "Louisiana"), (23, "Maine"), (24, "Maryland"),
    (25, "Massachusetts"), (26, "Michigan"), (27, "Minnesota"),
    (28, "Mississippi"), (29, "Missouri"), (30, "Montana"), (31, "Nebraska"),
    (32, "Nevada"), (33, "New Hampshire"), (34, "New Jersey"),
    (35, "New Mexico"), (36, "New York"), (37, "North Carolina"),
    (38, "North Dakota"), (39, "Ohio"), (40, "Oklahoma"), (41, "Oregon"),
    (42, "Pennsylvania"), (44, "Rhode Island"), (45, "South Carolina"),
    (46, "South Dakota"), (47, "Tennessee"), (48, "Texas"), (49, "Utah"),
    (50, "Vermont"), (51, "Virginia"), (53, "Washington"),
    (54, "West Virginia"), (55, "Wisconsin"), (56, "Wyoming"),
]
states = [name for _, name in _state_codes]
state_id = [code for code, _ in _state_codes]
#Lookup table: state id (index) -> state name (column 'states')
states_df = pd.DataFrame({'states': states}, index=state_id)

In [5]:
#Empty per-state table for the species populations: one row per state id,
#a leading 'State' name column followed by one column per species.
c = ['State', *species]
trees = pd.DataFrame(index=state_id, columns=c).assign(State=states)

In [6]:
#Get the population of each species for each state and then save it as a csv.
#One query per species returns the tree count per USFS state code; the counts
#are written into the matching species column of `trees` (indexed by state id).
for s in species:
    query = f"""
        SELECT
            plot_state_code,
            COUNT(species_common_name) AS tree_count
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_common_name = "{s}"
            AND total_height > 0
        GROUP BY
             plot_state_code
        ;
                """
    df = usfs.query_to_pandas_safe(query, max_gb_scanned=10)
    if df is None:
        #query_to_pandas_safe returns None when the scan limit is exceeded;
        #leave this species empty (filled with 0 below) and keep going.
        print(f'query for {s} returned no data')
        continue

    counts = df.set_index('plot_state_code')['tree_count']
    #Report state codes that are not among the 50 states tracked in `trees`
    #(the query may return codes outside the state list).
    for ind in counts.index[~counts.index.isin(trees.index)]:
        print(f'state id {ind} failed')
    #Assign the whole column at once, aligned on state id. The previous
    #trees.loc[ind].loc[s] = ... was a chained assignment that may write to a
    #temporary copy of the row and silently leave `trees` unchanged.
    trees[s] = counts.reindex(trees.index, fill_value=0)
trees = trees.fillna(0)
trees.to_csv('TreePop.csv') #save to csv for plotting later

In [53]:
#Create dataframe for holding full height/diameter data for the top species and all states.
#Rows are every (species, state) combination; values start at 0.0 and are
#filled in by the query loop. The frame is sized from the index itself rather
#than a hard-coded 500, so it stays valid if either list changes length.
MI = pd.MultiIndex.from_product([species,states], names=['species', 'state'])
trees = pd.DataFrame(0.0, index=MI, columns=['Avg_Height', 'Avg_Diameter'])

In [92]:
#Get the average height and average diameter of each species for each state
#and then save them as a csv.
#One query per species returns the averages per USFS state code; each result
#row is written to the (species, state name) row of `trees`.
value_columns = ['Avg_Height', 'Avg_Diameter']
for s in species:
    query = f"""
        SELECT
            plot_state_code,
            AVG(total_height) as avg_height,
            AVG(current_diameter) as avg_diameter
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_common_name = "{s}"
            AND total_height > 0
        GROUP BY
             plot_state_code
        ;
                """
    df = usfs.query_to_pandas_safe(query, max_gb_scanned=10)
    if df is None:
        #query_to_pandas_safe returns None when the scan limit is exceeded;
        #leave this species at its initial values and keep going.
        print(f'query for {s} returned no data')
        continue

    #Pull the columns out once instead of calling df.iloc[...] twice per row.
    codes = df['plot_state_code'].tolist()
    averages = df[['avg_height', 'avg_diameter']].to_numpy(dtype=float)
    for code, values in zip(codes, averages):
        try:
            #Translate the numeric state code into the state name used in the index
            st = states_df.loc[int(code), 'states']
        except (KeyError, TypeError, ValueError):
            #Code is missing or not one of the 50 tracked states; report the
            #actual state id (the old message printed the row position).
            print(f'state id {code} failed')
            continue
        trees.loc[(s, st), value_columns] = values
trees.to_csv('Tree_Heights_Diameters.csv') #save to csv for plotting later