In [10]:
import numpy as np
import pandas as pd
import sqlite3
import bq_helper
from bq_helper import BigQueryHelper
from datetime import datetime
from lib_DMH import gsodquery
from lib_DMH import mapplot
import cartopy
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpreader
import shapely.geometry as sgeom
import shapely
import matplotlib.pyplot as plt
import time

In [11]:
# BigQuery client bound to the public USFS "usfs_fia" dataset.
usfs = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="usfs_fia",
)

In [12]:
# State names used to filter plots (matched against plot_state_code_name).
states = (
    'Alabama',
    'Arkansas',
    'Florida',
    'Georgia',
    'Kentucky',
    'Louisiana',
    'Mississippi',
    'North Carolina',
    'Oklahoma',
    'South Carolina',
    'Tennessee',
    'Texas',
    'Virginia',
)

In [42]:
def _sql_in_list(values):
    """Render ``values`` as a parenthesised, comma-separated SQL list.

    Interpolating a Python tuple's ``repr`` into SQL only works by accident:
    a one-element tuple renders with a trailing comma and a bare string
    renders without quotes or parentheses. This helper spells the list out
    explicitly instead.

    Parameters
    ----------
    values : scalar or iterable
        A single value or an iterable of values. Strings are single-quoted
        (with backslashes and single quotes escaped); everything else is
        rendered with ``str``.

    Returns
    -------
    str
        Text such as ``(1, 2, 3)`` or ``('Alabama', 'Texas')``.

    Raises
    ------
    ValueError
        If ``values`` is empty, since ``IN ()`` is not valid SQL.
    """
    # Treat a lone string/scalar as a one-element list.
    if isinstance(values, str) or not hasattr(values, "__iter__"):
        values = (values,)
    rendered = []
    for value in values:
        if isinstance(value, str):
            escaped = value.replace("\\", "\\\\").replace("'", "\\'")
            rendered.append(f"'{escaped}'")
        else:
            rendered.append(str(value))
    if not rendered:
        raise ValueError("an SQL IN list needs at least one value")
    return "(" + ", ".join(rendered) + ")"


# Species codes to keep (compared against the species_code column).
trees = (316, 131, 611, 202, 746, 318, 12, 491, 108, 802)

# Average tree height and diameter per plot / measurement date / species /
# location, restricted to the selected species codes and states.
query1 = f"""
        SELECT
            plot_phase_2_plot_number,
            plot_state_code,
            measurement_year,
            measurement_month,
            species_code,
            latitude,
            longitude,
            AVG(total_height) as avg_height,
            AVG(current_diameter) as avg_diameter
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_code in {_sql_in_list(trees)} AND plot_state_code_name in {_sql_in_list(states)} 
        GROUP BY
             plot_phase_2_plot_number,
             plot_state_code,
             measurement_year,
             measurement_month,
             species_code,
             latitude,
             longitude
        ;
                """

In [43]:
# Run the plot/tree query, refusing to scan more than 10 GB.
plots = usfs.query_to_pandas_safe(query1, max_gb_scanned=10)
# NOTE(review): bq_helper's query_to_pandas_safe is understood to print a
# notice and return None when the size estimate exceeds the cap -- confirm
# against the installed version. Fail here rather than in a later cell.
if plots is None:
    raise RuntimeError(
        "Plot query was not executed: estimated scan size exceeds max_gb_scanned=10"
    )

In [46]:
# Discard rows with any missing value, then attach a `date` column set to the
# first day of each measurement month and renumber the rows from zero.
plots = plots.dropna()
plots = plots.assign(
    date=pd.to_datetime(
        pd.DataFrame(
            {
                'year': plots['measurement_year'],
                'month': plots['measurement_month'],
                'day': 1,
            }
        ).astype(int)
    )
).reset_index(drop=True)
plots

Unnamed: 0,plot_phase_2_plot_number,plot_state_code,measurement_year,measurement_month,species_code,latitude,longitude,avg_height,avg_diameter,date
0,61,5,2019.0,1.0,131,33.274956,-93.069420,57.933333,10.704000,2019-01-01
1,73,28,2019.0,2.0,131,32.105919,-88.546005,66.645833,9.058542,2019-02-01
2,37,45,2019.0,1.0,131,32.337814,-81.032860,51.250000,7.624750,2019-01-01
3,44,28,2019.0,1.0,131,32.147362,-89.242004,28.888889,4.692593,2019-01-01
4,90001,12,1969.0,3.0,131,30.360001,-83.779999,85.666667,18.236364,1969-03-01
...,...,...,...,...,...,...,...,...,...,...
434760,13,1,2018.0,2.0,131,32.507706,-86.682121,70.000000,12.030000,2018-02-01
434761,11,37,2018.0,3.0,611,34.905674,-78.598930,24.000000,2.900000,2018-03-01
434762,29,48,2018.0,4.0,802,31.943209,-94.116623,31.000000,5.900000,2018-04-01
434763,59,1,2018.0,8.0,316,34.548733,-87.605629,37.000000,6.200000,2018-08-01


In [47]:
# BigQuery client bound to the public NOAA "noaa_gsod" dataset.
noaa_gsod = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="noaa_gsod",
)

In [48]:
# Query for the station table: station number (usaf), latitude and longitude
# of every US station that has coordinates, excluding rows located at
# (0.0, 0.0) and rows whose usaf is '999999'.
# NOTE: this reuses the name `query1`, overwriting the plot/tree query that
# was built earlier in the notebook.
query1 = """
            SELECT 
                usaf AS Station_number, 
                lat AS Latitude, 
                lon AS Longitude, 
            FROM 
                `bigquery-public-data.noaa_gsod.stations` 
            WHERE 
                country = 'US' AND lat IS NOT NULL AND lon IS NOT NULL AND NOT (lat = 0.0 AND lon = 0.0) AND NOT usaf = '999999' 
        """

In [49]:
# Download the raw station table (scan capped at 10 GB); `stations1` keeps
# every row returned, including repeated stations.
stations1 = noaa_gsod.query_to_pandas_safe(query1, max_gb_scanned=10)
# NOTE(review): query_to_pandas_safe is understood to return None when the
# size estimate exceeds the cap -- confirm against the installed bq_helper.
if stations1 is None:
    raise RuntimeError(
        "Station query was not executed: estimated scan size exceeds max_gb_scanned=10"
    )

# One row per station number, keeping the last listing of each.
stations = stations1.drop_duplicates(subset='Station_number', keep='last')

# One row per (Latitude, Longitude) pair, keeping the last listing of each.
# The pair is compared as a unit: AND-ing two independent duplicated() masks
# would also drop a station whose latitude repeats in one row and whose
# longitude repeats in a different row, even though its location is unique.
stations_unique = (
    stations
    .drop_duplicates(subset=['Latitude', 'Longitude'], keep='last')
    .reset_index(drop=True)
)

In [50]:
def find_nearest3(lat, long, df):
    """Return the station number of the row in ``df`` closest to a point.

    Closeness is the straight-line distance computed directly on the
    latitude/longitude values (no spherical correction).

    Parameters
    ----------
    lat, long : float
        Coordinates of the query point.
    df : pandas.DataFrame
        Must provide ``Latitude``, ``Longitude`` and ``Station_number``.

    Returns
    -------
    The ``Station_number`` entry stored under the index label of the
    closest row.
    """
    d_lat = lat - df.Latitude
    d_long = long - df.Longitude
    separation = np.sqrt(d_lat ** 2 + d_long ** 2)
    closest_label = separation.idxmin()
    return df.Station_number[closest_label]

def get_station3(sta, plt):
    """Tag every plot row with the number of its nearest station.

    Parameters
    ----------
    sta : pandas.DataFrame
        Candidate stations; passed unchanged to ``find_nearest3``, which
        reads ``Latitude``, ``Longitude`` and ``Station_number``.
    plt : pandas.DataFrame
        Plot rows with ``latitude`` and ``longitude`` columns. Any index is
        accepted. (Inside this function the parameter name hides any
        module-level ``plt``; it is kept so existing callers are unaffected.)

    Returns
    -------
    pandas.DataFrame
        A copy of ``plt`` with an extra ``nearest_station`` column holding
        the station number as a string. ``plt`` itself is not modified.
    """
    plt_copy = plt.copy()

    # Read coordinates by position so the lookup does not depend on the
    # frame carrying a 0..n-1 index.
    lats = plt_copy.latitude.to_numpy()
    longs = plt_copy.longitude.to_numpy()

    # Coordinates are rounded to 3 decimals before the search; results are
    # stored as strings.
    nearest = [
        str(find_nearest3(round(lat, 3), round(long, 3), sta))
        for lat, long in zip(lats, longs)
    ]

    plt_copy['nearest_station'] = np.array(nearest, dtype=object)

    return plt_copy

def add_temp(df, gsod):
    """Attach the matching station temperature to every row of ``df``.

    For each row, the first ``gsod`` record whose ``Station_number`` equals
    the row's ``nearest_station`` and whose ``date`` equals the row's
    ``date`` supplies ``Mean_temp``. Rows without a matching record get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``nearest_station`` and ``date`` columns; any index is
        accepted. Modified in place: a ``mean_temp`` column is added.
    gsod : pandas.DataFrame
        Needs ``Station_number``, ``date`` and ``Mean_temp`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``mean_temp``.
    """
    # Start from NaN so "no match" needs no special handling.
    temps = np.full(len(df), np.nan)

    for pos in range(len(df)):
        # Positional access: label-based [pos] raises KeyError on a
        # non-default index, which a blanket except would hide as NaN.
        station = df.nearest_station.iloc[pos]
        when = df.date.iloc[pos]
        matches = gsod.loc[
            (gsod.Station_number == station) & (gsod.date == when),
            'Mean_temp',
        ]
        if matches.empty:
            continue
        try:
            temps[pos] = matches.iloc[0]
        except (TypeError, ValueError):
            # A reading that cannot be stored as a float is treated as
            # missing; any other error is allowed to surface.
            pass

    df['mean_temp'] = temps
    return df

def latlong(gsod, stations):
    """Return a copy of ``gsod`` with each row's station coordinates added.

    Parameters
    ----------
    gsod : pandas.DataFrame
        Needs a ``Station_number`` column; any index is accepted. Not
        modified.
    stations : pandas.DataFrame
        Needs ``Station_number``, ``Latitude`` and ``Longitude``. When a
        station number is listed more than once, its first listing is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``gsod`` with float ``Latitude`` and ``Longitude`` columns.

    Raises
    ------
    IndexError
        If ``gsod`` references a station number absent from ``stations``.
    """
    gsod_copy = gsod.copy()

    # Build the station -> coordinates table once instead of scanning
    # `stations` twice for every gsod row.
    site_lookup = (
        stations
        .drop_duplicates(subset='Station_number', keep='first')
        .set_index('Station_number')
    )

    unknown = ~gsod.Station_number.isin(site_lookup.index)
    if unknown.any():
        missing = sorted(set(gsod.Station_number[unknown].astype(str)))
        raise IndexError(f"station number(s) not found in stations table: {missing}")

    # map() looks values up by station number, so the row labels of `gsod`
    # do not matter; to_numpy() then assigns by position.
    gsod_copy['Latitude'] = gsod.Station_number.map(site_lookup.Latitude).to_numpy(dtype=float)
    gsod_copy['Longitude'] = gsod.Station_number.map(site_lookup.Longitude).to_numpy(dtype=float)
    return gsod_copy

In [51]:
# Keep measurements dated 2000-01-01 or later, ordered by date and renumbered.
plots_to_2019 = (
    plots[plots['date'] >= datetime(2000, 1, 1)]
    .sort_values(by='date')
    .reset_index(drop=True)
)

In [52]:
# Station numbers of the de-duplicated stations, as a tuple for gsodquery.
stations_tuple = tuple(stations_unique['Station_number'])
# Empty frame that the yearly loop fills in.
plot_stations = pd.DataFrame()

In [53]:
# For every measurement year 2000-2019: fetch that year's GSOD records, give
# each plot its nearest station, and attach the station's temperature for the
# plot's measurement month. The whole pass is timed into `total_t`.
t0 = time.perf_counter()

# Collect the per-year frames and concatenate once at the end. Growing a
# DataFrame with concat inside the loop recopies all earlier rows on each
# pass, and re-running the cell would append the same rows a second time.
yearly_frames = []
for year in range(2000, 2019 + 1):

    plot_year = plots_to_2019.loc[plots_to_2019.measurement_year == year].reset_index(drop=True)
    gsod = gsodquery(year, stations_tuple, noaa_gsod)
    # Records are keyed by the first day of their month, matching plots.date.
    gsod['date'] = pd.to_datetime(gsod[['Year', 'Month']].assign(day=1))
    gsod = latlong(gsod, stations_unique)
    # The nearest-station search only needs one row per station: latlong
    # gives every row of a station the same coordinates, so repeated rows
    # cannot change which station is closest (first listing still wins ties).
    station_sites = gsod.drop_duplicates(subset='Station_number')
    plot_station = get_station3(station_sites, plot_year)
    plot_station = add_temp(plot_station, gsod)
    yearly_frames.append(plot_station)

plot_stations = pd.concat(yearly_frames)

t1 = time.perf_counter()
total_t = t1 - t0

In [55]:
# Each yearly frame is numbered from 0, so the concatenated frame carries
# repeated row labels. Store the renumbered frame (a bare reset_index call
# would only display it) so the CSV written later gets a clean 0..n-1 index.
plot_stations = plot_stations.reset_index(drop=True)
plot_stations

Unnamed: 0,plot_phase_2_plot_number,plot_state_code,measurement_year,measurement_month,species_code,latitude,longitude,avg_height,avg_diameter,date,nearest_station,mean_temp
0,55,45,2000.0,1.0,316,33.830917,-79.296043,61.454545,8.943636,2000-01-01,747910,44.629032
1,20,45,2000.0,1.0,491,34.857124,-82.795647,30.500000,4.020000,2000-01-01,723119,41.374194
2,48,1,2000.0,1.0,491,32.783588,-86.329277,30.333333,4.333333,2000-01-01,722265,50.827586
3,51,51,2000.0,1.0,611,37.019611,-78.791679,45.333333,5.933333,2000-01-01,724017,37.577419
4,114,1,2000.0,1.0,611,32.632221,-87.441994,77.428571,9.900000,2000-01-01,722286,46.090323
...,...,...,...,...,...,...,...,...,...,...,...,...
237870,99,1,2019.0,6.0,611,31.430559,-87.490250,67.000000,11.200000,2019-06-01,722276,80.950000
237871,64,1,2019.0,6.0,611,32.690655,-87.816071,48.500000,5.500000,2019-06-01,722286,80.243333
237872,68,1,2019.0,6.0,316,34.116493,-87.041344,46.125000,7.500000,2019-06-01,722031,74.153333
237873,40,1,2019.0,6.0,611,33.877670,-86.350563,16.000000,1.100000,2019-06-01,722285,75.033333


In [56]:
# Save the plots with their nearest station and temperature (index included).
plot_stations.to_csv(path_or_buf="plots_with_temp2.csv")

In [57]:
# Elapsed time (time.perf_counter seconds) of the yearly station/temperature loop.
total_t

1804.9889938749984