In [1]:
import numpy as np
import pandas as pd
import sqlite3
import bq_helper
from bq_helper import BigQueryHelper
from datetime import datetime
from lib_DMH import gsodquery
from lib_DMH import mapplot
import cartopy
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpreader
import shapely.geometry as sgeom
import shapely
import matplotlib.pyplot as plt
import time

In [2]:
# Helper bound to the `usfs_fia` dataset of the `bigquery-public-data` project.
usfs = bq_helper.BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="usfs_fia",
)

In [3]:
#states = ('Florida')
# State names kept by the `plot_state_code_name in (...)` filter of the
# plot/tree query.
states = (
    'Alabama',
    'Arkansas',
    'Florida',
    'Georgia',
    'Kentucky',
    'Louisiana',
    'Mississippi',
    'North Carolina',
    'Oklahoma',
    'South Carolina',
    'Tennessee',
    'Texas',
    'Virginia',
)

In [4]:
def _sql_in_list(values):
    """Render ``values`` as a parenthesised, comma-separated SQL list.

    Interpolating a Python tuple's ``repr`` into SQL only works by accident
    for tuples of two or more items: a one-element tuple renders with a
    trailing comma and a bare string renders without quotes or parentheses.
    This helper renders the list explicitly instead.

    Parameters
    ----------
    values : str, int, or iterable of str/int
        A bare string or integer is treated as a one-element list.

    Returns
    -------
    str
        For example ``"(316, 131)"`` or ``"('Florida')"``.

    Raises
    ------
    ValueError
        If ``values`` is empty (``IN ()`` is not valid SQL).
    """
    if isinstance(values, (str, int)):
        values = (values,)
    rendered = []
    for value in values:
        if isinstance(value, str):
            # Escape backslashes first, then quotes, for a single-quoted literal.
            escaped = value.replace("\\", "\\\\").replace("'", "\\'")
            rendered.append(f"'{escaped}'")
        else:
            rendered.append(str(value))
    if not rendered:
        raise ValueError("an IN list needs at least one value")
    return "(" + ", ".join(rendered) + ")"


# Species codes kept by the `species_code in (...)` filter below.
trees = (316,131,611,202,746,318,12,491,108,802)

# Per plot / measurement date / species / location: average tree height and
# diameter, restricted to the species in `trees` and the states in `states`.
query1 = f"""
        SELECT
            plot_sequence_number,
            plot_state_code,
            measurement_year,
            measurement_month,
            species_code,
            latitude,
            longitude,
            AVG(total_height) as avg_height,
            AVG(current_diameter) as avg_diameter
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_code in {_sql_in_list(trees)} AND plot_state_code_name in {_sql_in_list(states)} 
        GROUP BY
             plot_sequence_number,
             plot_state_code,
             measurement_year,
             measurement_month,
             species_code,
             latitude,
             longitude
        ;
                """

In [5]:
# Run the plot/tree query, refusing to scan more than 10 GB.
plots = usfs.query_to_pandas_safe(query1, max_gb_scanned=10)
# bq_helper cancels an over-limit query and returns None instead of raising;
# fail here with a clear message rather than on the first use of `plots`.
if plots is None:
    raise RuntimeError(
        "plot/tree query was cancelled: estimated scan size exceeds max_gb_scanned=10"
    )

In [6]:
# Drop incomplete rows, derive a `date` column (first day of the measurement
# month) from the year/month columns, and renumber the rows from 0.
plots = (
    plots
    .dropna()
    .assign(
        date=lambda frame: pd.to_datetime(
            frame[['measurement_year', 'measurement_month']]
            .rename(columns={'measurement_year': 'year',
                             'measurement_month': 'month'})
            .assign(day=1)
            .astype(int)
        )
    )
    .reset_index(drop=True)
)

In [7]:
# Helper bound to the `noaa_gsod` dataset of the `bigquery-public-data` project.
noaa_gsod = bq_helper.BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="noaa_gsod",
)

In [8]:
# Query for the station table: the station id (`usaf`) and its coordinates,
# aliased to Station_number / Latitude / Longitude.
# Rows are kept only when country is 'US', both coordinates are non-null, the
# position is not exactly (0.0, 0.0), and usaf is not '999999'
# (presumably a missing-id sentinel -- TODO confirm).
# NOTE: this rebinds `query1`, which previously held the plot/tree query.
query1 = """
            SELECT 
                usaf AS Station_number, 
                lat AS Latitude, 
                lon AS Longitude, 
            FROM 
                `bigquery-public-data.noaa_gsod.stations` 
            WHERE 
                country = 'US' AND lat IS NOT NULL AND lon IS NOT NULL AND NOT (lat = 0.0 AND lon = 0.0) AND NOT usaf = '999999' 
        """

In [9]:
# Raw station table, refusing to scan more than 10 GB. `stations1` keeps every
# row, including repeated stations.
stations1 = noaa_gsod.query_to_pandas_safe(query1, max_gb_scanned=10)
# bq_helper cancels an over-limit query and returns None instead of raising.
if stations1 is None:
    raise RuntimeError(
        "station query was cancelled: estimated scan size exceeds max_gb_scanned=10"
    )

# One row per station id, keeping the last occurrence.
stations = stations1.drop_duplicates(subset='Station_number', keep='last')

# One row per (Latitude, Longitude) pair, keeping the last occurrence.
# The pair must be tested as a unit: AND-ing two independent `duplicated()`
# masks also drops a station whose latitude recurs at one row and whose
# longitude recurs at a different row, even though its location is unique.
stations_unique = (
    stations
    .drop_duplicates(subset=['Latitude', 'Longitude'], keep='last')
    .reset_index(drop=True)
)

In [14]:
def find_nearest3(lat,long,df):
    """Return the ``Station_number`` of the row of ``df`` closest to a point.

    Closeness is the straight-line distance between ``(lat, long)`` and each
    row's ``(Latitude, Longitude)``, computed directly on the coordinate
    values with no spherical correction. When several rows tie, the first
    one wins (``idxmin`` semantics).

    Parameters
    ----------
    lat, long : float
        Coordinates of the query point.
    df : pandas.DataFrame
        Must have ``Latitude``, ``Longitude`` and ``Station_number`` columns.
    """
    delta_lat = lat - df.Latitude
    delta_lon = long - df.Longitude
    separation = np.sqrt(delta_lat**2 + delta_lon**2)
    closest_label = separation.idxmin()
    return df.Station_number[closest_label]

def get_station3(sta, plt):
    """Return a copy of ``plt`` with a ``nearest_station`` column added.

    For every row of ``plt`` the latitude and longitude are rounded to three
    decimals and passed to ``find_nearest3`` together with ``sta``; the
    returned station id is stored as a string.

    Parameters
    ----------
    sta : pandas.DataFrame
        Station table handed to ``find_nearest3`` (needs ``Latitude``,
        ``Longitude`` and ``Station_number`` columns).
    plt : pandas.DataFrame
        Plot table with ``latitude`` and ``longitude`` columns. It is not
        modified. Note that the parameter name shadows the module-level
        matplotlib alias inside this function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``plt`` with the extra ``nearest_station`` column.
    """
    plt_copy = plt.copy()

    # Iterate over values rather than labels so the function also works when
    # the index of `plt` is not 0..n-1.
    rounded_lat = plt_copy.latitude.round(3).to_numpy()
    rounded_lon = plt_copy.longitude.round(3).to_numpy()

    plt_copy['nearest_station'] = [
        str(find_nearest3(lat, lon, sta))
        for lat, lon in zip(rounded_lat, rounded_lon)
    ]

    return plt_copy

def add_temp(df,gsod):
    """Add a ``mean_temp`` column to ``df`` by looking each row up in ``gsod``.

    A row of ``df`` matches a row of ``gsod`` when ``df.nearest_station``
    equals ``gsod.Station_number`` and ``df.date`` equals ``gsod.date``.
    If several ``gsod`` rows match, the first one is used; if none match,
    the temperature is NaN. Values that cannot be read as numbers also
    become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``nearest_station`` and ``date`` columns. Modified in place.
    gsod : pandas.DataFrame
        Needs ``Station_number``, ``date`` and ``Mean_temp`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``mean_temp`` (float).
    """
    # Build the (station, date) -> temperature lookup once instead of
    # rescanning `gsod` for every row; setdefault keeps the first match.
    first_temp = {}
    for station, date, temp in zip(gsod.Station_number, gsod.date, gsod.Mean_temp):
        first_temp.setdefault((station, date), temp)

    # Iterating over values (not index labels) means a non-default index on
    # `df` no longer turns every lookup into a silently swallowed KeyError.
    temps = [
        first_temp.get(key, np.nan)
        for key in zip(df.nearest_station, df.date)
    ]

    df['mean_temp'] = np.asarray(pd.to_numeric(temps, errors='coerce'), dtype=float)
    return df

def latlong(gsod,stations):
    """Return a copy of ``gsod`` with ``Latitude`` and ``Longitude`` columns.

    Each row's ``Station_number`` is looked up in ``stations``; when a
    station appears more than once there, its first row supplies the
    coordinates.

    Parameters
    ----------
    gsod : pandas.DataFrame
        Needs a ``Station_number`` column. It is not modified.
    stations : pandas.DataFrame
        Needs ``Station_number``, ``Latitude`` and ``Longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``gsod`` with float ``Latitude`` and ``Longitude`` columns.

    Raises
    ------
    IndexError
        If a station in ``gsod`` is absent from ``stations``.
    """
    # Build the station -> (lat, lon) lookup once instead of filtering
    # `stations` twice per row; setdefault keeps the first occurrence.
    coords = {}
    for station, lat, lon in zip(stations.Station_number, stations.Latitude, stations.Longitude):
        coords.setdefault(station, (lat, lon))

    # Iterate over values (not index labels) so a non-default index on
    # `gsod` is handled as well.
    try:
        pairs = [coords[station] for station in gsod.Station_number]
    except KeyError as exc:
        raise IndexError(f"station {exc.args[0]!r} not found in stations") from None

    gsod_copy = gsod.copy()
    gsod_copy['Latitude'] = np.array([pair[0] for pair in pairs], dtype=float)
    gsod_copy['Longitude'] = np.array([pair[1] for pair in pairs], dtype=float)
    return gsod_copy

In [15]:
# Keep plots dated 2000-01-01 or later, ordered by date and renumbered from 0.
on_or_after_2000 = plots.date >= datetime(2000, 1, 1)
plots_to_2019 = (
    plots
    .loc[on_or_after_2000]
    .sort_values('date')
    .reset_index(drop=True)
)

In [16]:
# Start from an empty frame; the per-year results are collected under this name.
plot_stations = pd.DataFrame()

# Station ids of the de-duplicated station table, as a tuple for gsodquery.
stations_tuple = tuple(stations_unique['Station_number'])

In [20]:
# Study window, inclusive. One GSOD query is issued per calendar year.
first_year, last_year = 2000, 2019

t0 = time.perf_counter()

yearly_frames = []
for year in range(first_year, last_year + 1):

    # Plots measured in this year, renumbered from 0.
    plot_year = plots_to_2019.loc[plots_to_2019.measurement_year == year].reset_index(drop=True)

    # gsodquery comes from lib_DMH; the code below relies on it returning
    # Year, Month, Station_number and Mean_temp columns -- TODO confirm.
    gsod = gsodquery(year, stations_tuple, noaa_gsod)
    gsod['date'] = pd.to_datetime(gsod[['Year','Month']].assign(day=1))
    gsod = latlong(gsod,stations_unique)

    # Attach the nearest station, then that station's temperature for the month.
    plot_station = get_station3(gsod, plot_year)
    plot_station = add_temp(plot_station,gsod)
    yearly_frames.append(plot_station)

# Concatenate once and rebind `plot_stations`, so re-running this cell
# replaces the result instead of appending a second copy of every row.
plot_stations = pd.concat(yearly_frames)

t1 = time.perf_counter()
total_t = t1-t0

In [29]:
# The concatenated frame still carries each year's own 0..k index. Keep the
# renumbered frame (reset_index returns a new object) so that later cells,
# including the CSV export, see a unique index; then display it.
plot_stations = plot_stations.reset_index(drop=True)
plot_stations

Unnamed: 0,plot_sequence_number,plot_state_code,measurement_year,measurement_month,species_code,latitude,longitude,avg_height,avg_diameter,date,nearest_station,mean_temp
0,20662348010478,47,2000.0,1.0,802,35.082916,-87.307777,48.000000,5.140000,2000-01-01,723235,42.254839
1,43091200010478,1,2000.0,1.0,802,31.086241,-85.361519,79.000000,15.166667,2000-01-01,722268,51.722581
2,43050775010478,1,2000.0,1.0,131,32.859089,-87.383179,57.666667,11.966667,2000-01-01,722286,46.090323
3,59364890010478,5,2000.0,1.0,491,34.279373,-93.779053,15.000000,1.000000,2000-01-01,723435,42.996774
4,43802867010478,51,2000.0,1.0,611,36.630676,-76.897530,15.000000,1.700000,2000-01-01,723083,40.616667
...,...,...,...,...,...,...,...,...,...,...,...,...
250193,495148254126144,1,2019.0,6.0,611,32.690655,-87.816071,48.500000,5.500000,2019-06-01,722286,80.243333
250194,495148546126144,1,2019.0,6.0,611,34.116493,-87.041344,39.000000,4.645000,2019-06-01,722031,74.153333
250195,495148546126144,1,2019.0,6.0,802,34.116493,-87.041344,77.000000,17.600000,2019-06-01,722031,74.153333
250196,495148469126144,1,2019.0,6.0,131,31.904102,-87.592651,73.650000,9.531000,2019-06-01,722276,80.950000


In [31]:
# Write the plot/station/temperature table (index included) to the working directory.
output_csv = "plots_with_temp.csv"
plot_stations.to_csv(output_csv)

In [24]:
# Elapsed time of the per-year loop, t1 - t0 from time.perf_counter() (seconds).
total_t

1852.8024994470034