In [3]:
##Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from IPython.display import clear_output
##Import bq_helper to pull forestry data from Kaggle
from bq_helper import BigQueryHelper #pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
import os
# Set up Google credentials (the data is stored on Google Cloud servers).
# The .json service-account key file comes from Google Cloud Platform.
# setdefault keeps a credentials path that is already configured in the
# environment instead of silently overwriting it with this local file name.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "ErdosTrees-f85df8c8cb95.json")

In [4]:
## Connect to the USFS FIA dataset ("usfs_fia") in the "bigquery-public-data" project.
## `usfs` is the helper object every query below is sent through.
usfs = BigQueryHelper("bigquery-public-data", "usfs_fia")
# Uncomment to list what is available in the dataset:
#usfs.list_tables()

In [93]:
# Get a list of the 10 most common trees in the data set.
# Only trees with a recorded height (total_height > 0) are counted.
# The count gets an explicit alias so the code below does not depend on
# BigQuery's auto-generated name for unaliased columns.
query1 = """
    SELECT
        species_common_name,
        COUNT(species_common_name) AS tree_count
    FROM
        `bigquery-public-data.usfs_fia.plot_tree`
    WHERE
        total_height > 0
    GROUP BY
        species_common_name
    ;        """
df = usfs.query_to_pandas_safe(query1, max_gb_scanned=10)
# NOTE(review): bq_helper appears to return None (after printing a message)
# when the estimated scan exceeds max_gb_scanned - fail with a clear error.
if df is None:
    raise RuntimeError('species count query returned no data (scan limit exceeded?)')
species = (
    df.sort_values('tree_count', ascending=False)
    .head(10)
    .species_common_name
    .tolist()
)
print(species)

['loblolly pine', 'red maple', 'sweetgum', 'Douglas-fir', 'slash pine', 'white oak', 'sugar maple', 'quaking aspen', 'lodgepole pine', 'ponderosa pine']


In [105]:
# USFS state ids paired with state names, alphabetical by state name.
# Keeping each id next to its name makes the pairing visible; the two
# parallel lists used by the rest of the notebook are derived from it.
_STATE_CODES = [
    (1, "Alabama"), (2, "Alaska"), (4, "Arizona"), (5, "Arkansas"),
    (6, "California"), (8, "Colorado"), (9, "Connecticut"), (10, "Delaware"),
    (12, "Florida"), (13, "Georgia"), (15, "Hawaii"), (16, "Idaho"),
    (17, "Illinois"), (18, "Indiana"), (19, "Iowa"), (20, "Kansas"),
    (21, "Kentucky"), (22, "Louisiana"), (23, "Maine"), (24, "Maryland"),
    (25, "Massachusetts"), (26, "Michigan"), (27, "Minnesota"),
    (28, "Mississippi"), (29, "Missouri"), (30, "Montana"), (31, "Nebraska"),
    (32, "Nevada"), (33, "New Hampshire"), (34, "New Jersey"),
    (35, "New Mexico"), (36, "New York"), (37, "North Carolina"),
    (38, "North Dakota"), (39, "Ohio"), (40, "Oklahoma"), (41, "Oregon"),
    (42, "Pennsylvania"), (44, "Rhode Island"), (45, "South Carolina"),
    (46, "South Dakota"), (47, "Tennessee"), (48, "Texas"), (49, "Utah"),
    (50, "Vermont"), (51, "Virginia"), (53, "Washington"),
    (54, "West Virginia"), (55, "Wisconsin"), (56, "Wyoming"),
]
states = [name for _code, name in _STATE_CODES]
state_id = [code for code, _name in _STATE_CODES]

In [122]:
# Empty state-by-species table that will hold the full species data:
# one row per USFS state id, a 'State' name column, then one column per species.
c = ['State'] + species
trees = pd.DataFrame(columns=c, index=state_id)
trees['State'] = states

In [154]:
# Get the population of each species for each state, then save it as a CSV.
for s in species:
    # One query per species: number of sampled trees (with a recorded height)
    # grouped by state code. The count is aliased so the column name is explicit.
    query = f"""
        SELECT
            plot_state_code,
            COUNT(species_common_name) AS tree_count
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_common_name = "{s}"
            AND total_height > 0
        GROUP BY
             plot_state_code
        ;
                """
    df = usfs.query_to_pandas_safe(query, max_gb_scanned=10)
    # NOTE(review): bq_helper appears to return None (after printing a message)
    # when the estimated scan exceeds max_gb_scanned - fail with a clear error.
    if df is None:
        raise RuntimeError(f'query for {s} returned no data (scan limit exceeded?)')

    counts = df.set_index('plot_state_code')['tree_count']
    known = counts.index.isin(trees.index)

    # State codes that have no row in `trees` are reported rather than written:
    # assigning to a missing label through .loc would silently add a new row.
    for ind in counts.index[~known]:
        print(f'state id {ind} failed')

    # A single .loc[rows, column] assignment writes into `trees` itself.
    # The chained form trees.loc[ind].loc[s] = ... assigns into a temporary
    # row object, so the value may never reach the table.
    trees.loc[counts.index[known], s] = counts[known]

# States where a species was never sampled get a count of 0.
trees = trees.fillna(0)
trees.to_csv('TreePop.csv') #save to csv for plotting later

In [None]:
# Scratch code kept for possible later use; the rest of the notebook does not use it yet.
# Each iteration only builds the query text - nothing is sent to BigQuery here.
# The query averages height and diameter of one species per plot / state /
# measurement date / location group.
for s in species:
    # The species name is quoted so it is a string literal in the SQL.
    # Only grouped or aggregated columns are selected: BigQuery rejects a SELECT
    # that lists raw total_height / current_diameter alongside a GROUP BY.
    query = f"""
        SELECT
            plot_state_code,
            AVG(total_height) as avg_height,
            AVG(current_diameter) as avg_diameter,
            measurement_year,
            latitude,
            longitude
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_common_name = "{s}"
            AND total_height > 0
        GROUP BY
             plot_sequence_number,
             plot_state_code,
             measurement_year,
             measurement_month,
             species_code,
             latitude,
             longitude
        ;
                """

# Build a dataframe of the number of trees sampled for each tree type across the entire US.
# The final output is a list of the 10 trees that have the highest frequency in
# the most states (not the most populous...).
# This cell is not used right now.
# NOTE: it rebinds `trees`, replacing any table of that name built earlier.

# A species only counts towards a state when more than this many of its trees
# were sampled there.
MIN_TREES_PER_STATE = 1000

# species name -> [total trees, number of states, list of state names]
tallies = {}

# Walk the real (state id, state name) pairs. The ids are not the contiguous
# range 1..50, so using a loop counter as the state code queries the wrong
# states and labels the results with the wrong names.
for code, state_name in zip(state_id, states):
    clear_output(wait=True)
    print(f'Now working on: {state_name}')
    query1 = f"""
        SELECT
            species_common_name,
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            total_height > 0
            AND plot_state_code = {code}
        ;        """
    df = usfs.query_to_pandas_safe(query1, max_gb_scanned=2)
    # NOTE(review): bq_helper appears to return None (after printing a message)
    # when the estimated scan exceeds max_gb_scanned - fail with a clear error.
    if df is None:
        raise RuntimeError(f'query for {state_name} returned no data (scan limit exceeded?)')

    df_Trees = df['species_common_name'].value_counts()
    df_Trees = df_Trees[df_Trees > MIN_TREES_PER_STATE]

    # .items() yields (species, count) pairs directly; indexing the Series by
    # integer position (df_Trees[j]) is deprecated on a label-indexed Series.
    for t, count in df_Trees.items():
        entry = tallies.setdefault(t, [0, 0, []])
        entry[0] += int(count)
        entry[1] += 1
        entry[2].append(state_name)

# Build the frame once from the collected tallies instead of growing it row by row.
# 'States' stays a comma-separated string of state names.
trees = pd.DataFrame.from_dict(
    {t: [total, n_states, ','.join(names)]
     for t, (total, n_states, names) in tallies.items()},
    orient='index',
    columns=['num_Trees', 'num_States', 'States'],
)
clear_output(wait=True)
print('Finished!')
#Top 10 trees with a population of over 1000 that occur in the most states
numStates = list(trees.sort_values('num_States',ascending = False).index[:10])