In [2]:
##Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from IPython.display import clear_output
##Import bq_helper to pull forestry data from Kaggle
from bq_helper import BigQueryHelper #pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
import os
# The data lives on Google Cloud servers, so tell the Google client where the
# credentials file is (download the .json file from Google Cloud Platform).
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "ErdosTrees-f85df8c8cb95.json"})

In [3]:
# Helper for querying the USFS FIA dataset in BigQuery's public-data project.
fia_project = "bigquery-public-data"
fia_dataset = "usfs_fia"
usfs = BigQueryHelper(fia_project, fia_dataset)
# Uncomment to list the tables available in the dataset:
#usfs.list_tables()

In [4]:
# USFS state id -> state name, listed in alphabetical order of the state name.
# Keeping each id next to its name makes a misaligned pair easy to spot.
_state_name_by_id = {
    1: "Alabama", 2: "Alaska", 4: "Arizona", 5: "Arkansas", 6: "California",
    8: "Colorado", 9: "Connecticut", 10: "Delaware", 12: "Florida",
    13: "Georgia", 15: "Hawaii", 16: "Idaho", 17: "Illinois", 18: "Indiana",
    19: "Iowa", 20: "Kansas", 21: "Kentucky", 22: "Louisiana", 23: "Maine",
    24: "Maryland", 25: "Massachusetts", 26: "Michigan", 27: "Minnesota",
    28: "Mississippi", 29: "Missouri", 30: "Montana", 31: "Nebraska",
    32: "Nevada", 33: "New Hampshire", 34: "New Jersey", 35: "New Mexico",
    36: "New York", 37: "North Carolina", 38: "North Dakota", 39: "Ohio",
    40: "Oklahoma", 41: "Oregon", 42: "Pennsylvania", 44: "Rhode Island",
    45: "South Carolina", 46: "South Dakota", 47: "Tennessee", 48: "Texas",
    49: "Utah", 50: "Vermont", 51: "Virginia", 53: "Washington",
    54: "West Virginia", 55: "Wisconsin", 56: "Wyoming",
}
# The two parallel lists are still exposed under their original names.
state_id = list(_state_name_by_id.keys())
states = list(_state_name_by_id.values())
# One-column frame of state names indexed by state id.
states_df = pd.DataFrame({'states': states}, index=state_id)

In [5]:
# Builds the state id / state name dataframe from the FIA data itself.
# The result matches the hand-typed `states_df`, so this cell is only an
# alternative way of getting the same lookup.
# Nothing is interpolated into the query, so a plain (non-f) string is used.
# `plot_state_code < 57` keeps codes up to 56, the largest id in `state_id`.
query = """
    SELECT
        plot_state_code,
        plot_state_code_name
    FROM
        `bigquery-public-data.usfs_fia.plot_tree`
    WHERE
        plot_state_code < 57
    GROUP BY
        plot_state_code,
        plot_state_code_name
    ORDER BY
        plot_state_code
    ;
            """
df = usfs.query_to_pandas_safe(query, max_gb_scanned=10)
# bq_helper prints a notice and returns None when the estimated scan exceeds
# max_gb_scanned; stop here with a clear message instead of failing on the
# next line with an AttributeError.
if df is None:
    raise RuntimeError(
        "State lookup query was cancelled: estimated scan exceeds the 10 GB limit"
    )
# Re-bind rather than mutate in place so re-running the cell is predictable.
df = df.set_index('plot_state_code')
# WARNING: the State_Region.csv loaded further down must also contain a
# `state_region` column, which this query does not produce. Enabling the line
# below would overwrite that file with one lacking the column.
#df.to_csv('State_Region.csv')

In [6]:
# Region lookup table, indexed by the first column of the CSV.
# Later cells filter on `regions.state_region`, so the file must provide a
# `state_region` column.
regions = pd.read_csv(
    'State_Region.csv',
    index_col=0,
)

# Empty table (all NaN) holding five species slots for each of the four
# regions; the rows are filled in by the top-5 query loop.
species = pd.DataFrame(
    index=pd.Index([1, 2, 3, 4], name='region'),
    columns=[f'tree{rank}' for rank in range(1, 6)],
)

In [22]:
# Get the five most frequently recorded species for each region and save them.
for region in [1, 2, 3, 4]:
    region_states = regions.index[regions.state_region == region]
    if len(region_states) == 0:
        raise ValueError(
            f"No states are assigned to region {region} in State_Region.csv"
        )
    # Spell the codes out as "1, 5, 12". Interpolating a Python tuple directly
    # renders a single state as "(1,)", which the SQL IN operator cannot parse.
    # Assumes the index of `regions` holds integer state codes.
    state_list = ", ".join(str(int(code)) for code in region_states)

    # NOTE(review): this filter starts after 2001, whereas the per-region
    # volume query uses BETWEEN 2001 AND 2019 -- confirm the gap is intended.
    query = f"""
        SELECT
            species_common_name,
            COUNT(species_common_name) AS pop
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            plot_state_code IN ({state_list})
            AND measurement_year > 2001

        GROUP BY
            species_common_name
        ORDER BY
            pop DESC
        ;        """
    df = usfs.query_to_pandas_safe(query, max_gb_scanned=10)
    # bq_helper returns None (after printing a notice) when the estimated scan
    # exceeds max_gb_scanned; report that instead of failing on `.head()`.
    if df is None:
        raise RuntimeError(
            f"Top-species query for region {region} was cancelled: "
            "estimated scan exceeds the 10 GB limit"
        )
    # Rows arrive sorted by descending count, so the first five are the top 5.
    species.loc[region] = list(df.head().species_common_name)
species.to_csv('Top5_Species.csv')

In [18]:
# For each selected region, total the volume, aboveground carbon and tree count
# of its top species per state / county / measurement year, then save one CSV
# per region.
for region in [2]:
    region_states = regions.index[regions.state_region == region]
    if len(region_states) == 0:
        raise ValueError(
            f"No states are assigned to region {region} in State_Region.csv"
        )
    # Spell the codes out as "1, 5, 12". Interpolating a Python tuple directly
    # renders a single entry as "(1,)", which the SQL IN operator cannot parse.
    # Assumes the index of `regions` holds integer state codes.
    state_list = ", ".join(str(int(code)) for code in region_states)

    # `species` starts out all-NaN and is only filled by the top-5 cell; a NaN
    # would otherwise be rendered into the SQL as the bare word `nan`.
    region_species = species.loc[region].dropna()
    if region_species.empty:
        raise ValueError(
            f"No species recorded for region {region}; run the top-5 species cell first"
        )
    # Single-quoted SQL string literals, with backslashes and quotes escaped.
    species_list = ", ".join(
        "'" + str(name).replace("\\", "\\\\").replace("'", "\\'") + "'"
        for name in region_species
    )

    # Note: the `mass` output column is the sum of `aboveground_carbon`.
    query = f"""
        SELECT
            species_common_name,
            plot_state_code,
            plot_county_code,
            measurement_year,
            SUM(gross_cubicfoot_volume) AS volume,
            SUM(aboveground_carbon) AS mass,
            COUNT(species_common_name) AS number_trees
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            plot_state_code in ({state_list})
            AND measurement_year BETWEEN 2001 AND 2019
            AND species_common_name in ({species_list})
            AND aboveground_carbon > 0
        GROUP BY
            species_common_name,
            plot_state_code,
            plot_county_code,
            measurement_year
        ORDER BY
            species_common_name,
            plot_state_code,
            plot_county_code,
            measurement_year
        ;        """
    df = usfs.query_to_pandas_safe(query, max_gb_scanned=20)
    # bq_helper returns None (after printing a notice) when the estimated scan
    # exceeds max_gb_scanned; report that instead of failing on `.to_csv`.
    if df is None:
        raise RuntimeError(
            f"Volume query for region {region} was cancelled: "
            "estimated scan exceeds the 20 GB limit"
        )
    df.to_csv(f'Trees_Region{region}.csv',index=False)

In [19]:
df # sanity check: display whatever `df` currently holds (presumably the region query result above -- verify cell order)

Unnamed: 0,species_common_name,plot_state_code,plot_county_code,measurement_year,volume,mass,number_trees
0,loblolly pine,1,1,2001,580.837421,12112.015753,148
1,loblolly pine,1,1,2002,1033.094786,20520.755920,118
2,loblolly pine,1,1,2003,428.743737,8896.363848,127
3,loblolly pine,1,1,2004,215.031215,4509.813002,76
4,loblolly pine,1,1,2005,1175.172070,24137.645809,249
...,...,...,...,...,...,...,...
45360,yellow-poplar,51,810,2007,12.139242,211.766165,3
45361,yellow-poplar,51,810,2010,2.818890,64.241959,1
45362,yellow-poplar,51,810,2012,6.227634,120.448936,1
45363,yellow-poplar,51,810,2016,3.697974,81.416153,1
