In [2]:
import numpy as np
import pandas as pd
import sqlite3
import bq_helper
from bq_helper import BigQueryHelper
from datetime import datetime
from lib_DMH import gsodquery
from lib_DMH import mapplot
import cartopy
import cartopy.crs as ccrs
import cartopy.io.shapereader as shpreader
import shapely.geometry as sgeom
import shapely
import matplotlib.pyplot as plt
import time

In [21]:
# Helper bound to the `usfs_fia` dataset of the `bigquery-public-data` project.
usfs = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="usfs_fia",
)

In [22]:
# State names used to filter the tree table (matched against
# `plot_state_code_name` in the plot query).
states = (
    'Alabama',
    'Arkansas',
    'Florida',
    'Georgia',
    'Kentucky',
    'Louisiana',
    'Mississippi',
    'North Carolina',
    'Oklahoma',
    'South Carolina',
    'Tennessee',
    'Texas',
    'Virginia',
)

In [23]:
# Species codes to keep (values of the `species_code` column).
trees = (316,131,611,202,746,318,12,491,108,802)

# Render the SQL IN-lists explicitly instead of interpolating the Python tuples.
# The repr of a one-element tuple is "(x,)", which is not a well-formed SQL list,
# and a bare string such as ('Florida') would be pasted in without any
# parentheses or quotes. For multi-element tuples the rendered text is unchanged.
state_names = (states,) if isinstance(states, str) else tuple(states)
species_in_list = ", ".join(str(code) for code in trees)
states_in_list = ", ".join(repr(name) for name in state_names)

# Average tree height and diameter per plot / measurement date / species /
# location, restricted to the selected species and states.
# NOTE(review): the name `query1` is reassigned further down for the station
# query, so this cell must be re-run before re-running the plot download.
query1 = f"""
        SELECT
            plot_phase_2_plot_number,
            plot_state_code,
            plot_county_code,
            measurement_year,
            measurement_month,
            species_code,
            latitude,
            longitude,
            AVG(total_height) as avg_height,
            AVG(current_diameter) as avg_diameter
        FROM
            `bigquery-public-data.usfs_fia.plot_tree`
        WHERE
            species_code in ({species_in_list}) AND plot_state_code_name in ({states_in_list}) 
        GROUP BY
             plot_phase_2_plot_number,
             plot_state_code,
             plot_county_code,
             measurement_year,
             measurement_month,
             species_code,
             latitude,
             longitude
        ;
                """

In [24]:
# Run the plot query through the USFS helper, passing a scan budget of 10 via
# `max_gb_scanned`.
# NOTE(review): presumably the helper declines to run the query (rather than
# raising) when the estimated scan exceeds the budget, in which case `plots`
# would not be a DataFrame — confirm against bq_helper.
plots = usfs.query_to_pandas_safe(query1, max_gb_scanned=10)

In [25]:
# Keep only fully populated rows.
plots = plots.dropna()

# Build a first-of-month timestamp from the measurement year and month.
# pd.to_datetime assembles dates from columns named year/month/day, hence the
# rename and the constant `day` column; the cast turns 2019.0-style floats into ints.
date_parts = plots[['measurement_year', 'measurement_month']].rename(
    columns={'measurement_year': 'year', 'measurement_month': 'month'}
)
date_parts = date_parts.assign(day=1).astype(int)
plots['date'] = pd.to_datetime(date_parts)

# Renumber rows 0..n-1 after the dropna above.
plots = plots.reset_index(drop=True)
plots

Unnamed: 0,plot_phase_2_plot_number,plot_state_code,plot_county_code,measurement_year,measurement_month,species_code,latitude,longitude,avg_height,avg_diameter,date
0,46,5,11,2019.0,1.0,131,33.515865,-92.108780,56.600000,9.171200,2019-01-01
1,39,45,75,2019.0,4.0,131,33.432636,-81.015923,55.140351,8.511053,2019-04-01
2,90099,12,33,1968.0,8.0,802,30.750000,-87.309998,75.500000,17.750000,1968-08-01
3,90071,12,39,1968.0,11.0,131,30.610001,-84.529999,63.000000,11.229412,1968-11-01
4,90131,12,73,1969.0,1.0,131,30.580000,-84.000000,90.909091,16.954545,1969-01-01
...,...,...,...,...,...,...,...,...,...,...,...
434760,54,28,19,2018.0,5.0,131,33.275051,-89.121559,101.000000,21.900000,2018-05-01
434761,50,5,55,2018.0,7.0,802,36.020546,-90.775978,58.000000,8.000000,2018-07-01
434762,77,5,113,2018.0,3.0,491,34.379868,-93.939743,24.000000,3.000000,2018-03-01
434763,81,1,127,2018.0,11.0,802,33.933933,-87.522453,57.000000,7.500000,2018-11-01


In [26]:
# Helper bound to the `noaa_gsod` dataset of the `bigquery-public-data` project.
noaa_gsod = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="noaa_gsod",
)

In [27]:
#defining the query to grab the station table
# Query for the station table: station id and coordinates of US stations,
# skipping rows with missing coordinates, rows located at exactly (0, 0), and
# rows whose `usaf` id is '999999'.
# NOTE(review): this reuses the name `query1`, overwriting the plot query
# defined earlier in the notebook.
query1 = """
            SELECT 
                usaf AS Station_number, 
                lat AS Latitude, 
                lon AS Longitude, 
            FROM 
                `bigquery-public-data.noaa_gsod.stations` 
            WHERE 
                country = 'US' AND lat IS NOT NULL AND lon IS NOT NULL AND NOT (lat = 0.0 AND lon = 0.0) AND NOT usaf = '999999' 
        """

In [28]:
# Download the station table; `stations1` keeps every row, including repeats.
stations1 = noaa_gsod.query_to_pandas_safe(query1, max_gb_scanned=10)

# First pass: one row per station id, keeping the last occurrence.
stations = stations1.drop_duplicates(subset='Station_number', keep='last')

# Second pass: one row per location, keeping the last occurrence.
# The (Latitude, Longitude) pair is tested as a unit. Testing the two columns
# independently and AND-ing the masks would also drop a station whose latitude
# repeats in one row and whose longitude repeats in a different row, even
# though its own coordinate pair is unique.
stations_unique = (
    stations
    .drop_duplicates(subset=['Latitude', 'Longitude'], keep='last')
    .reset_index(drop=True)
)

In [75]:
def find_nearest3(lat,long,df):
    """Return the ``Station_number`` of the row in ``df`` closest to a point.

    Distance is the plain Euclidean distance between (lat, long) and each
    row's (``Latitude``, ``Longitude``), computed directly on the coordinate
    values. On ties the first minimal row wins.

    Parameters
    ----------
    lat, long : float
        Coordinates of the query point.
    df : pandas.DataFrame
        Must provide ``Latitude``, ``Longitude`` and ``Station_number`` columns.
    """
    lat_offset = df.Latitude - lat
    lon_offset = df.Longitude - long
    planar_distance = np.sqrt(lat_offset**2 + lon_offset**2)

    # idxmin yields an index *label*, so the lookup below is label-based too.
    closest_label = planar_distance.idxmin()
    return df.Station_number.loc[closest_label]

def get_station3(sta, plt):
    """Return a copy of ``plt`` with a ``nearest_station`` column added.

    For every row, the row's ``latitude`` and ``longitude`` are rounded to
    three decimals and passed to ``find_nearest3`` together with ``sta``; the
    returned station number is stored in ``nearest_station``.

    Parameters
    ----------
    sta : pandas.DataFrame
        Station table handed to ``find_nearest3`` (needs ``Latitude``,
        ``Longitude`` and ``Station_number``).
    plt : pandas.DataFrame
        Plot table with ``latitude`` and ``longitude`` columns. Note that this
        parameter name shadows the module-level ``matplotlib.pyplot`` alias
        inside the function.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``plt`` itself is not modified.
    """
    plt_copy = plt.copy()

    # Walk the coordinate *values* rather than looking rows up by label, so the
    # function also works when the frame's index is not 0..n-1.
    coordinates = zip(plt_copy['latitude'].to_numpy(), plt_copy['longitude'].to_numpy())

    # Collect the results in a plain list: station numbers keep their own type
    # and length instead of being forced through a fixed-width string buffer.
    plt_copy['nearest_station'] = [
        find_nearest3(round(lat, 3), round(lon, 3), sta)
        for lat, lon in coordinates
    ]

    return plt_copy

def add_temp(df,gsod):
    """Attach the nearest station's weather features to every row of ``df``.

    For each row of ``df``, the first row of ``gsod`` whose ``Station_number``
    equals the row's ``nearest_station`` and whose ``date`` equals the row's
    date key supplies ``Mean_temp``, ``Mean_dwp`` and ``Mean_prcp``. They are
    stored in new ``mean_temp``, ``mean_dwp`` and ``mean_prcp`` columns. Rows
    with no matching station/date get NaN in all three columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``nearest_station`` column and a date key column. The key is
        ``date``; if a ``Year`` column exists it is used instead, because that
        is the column this function historically read.
    gsod : pandas.DataFrame
        Needs ``Station_number``, ``date``, ``Mean_temp``, ``Mean_dwp`` and
        ``Mean_prcp`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with the three feature columns.
    """
    feature_cols = ['Mean_temp', 'Mean_dwp', 'Mean_prcp']
    key_cols = ['Station_number', 'date']
    # The plot frame built in this notebook has no `Year` column, so fall back
    # to `date`, which is what gsod's `date` column is comparable with.
    date_key = 'Year' if 'Year' in df.columns else 'date'

    # One lookup table instead of a full scan of `gsod` per row. Rows with a
    # missing key can never match; of duplicate keys only the first is kept.
    candidates = gsod.dropna(subset=key_cols).drop_duplicates(subset=key_cols, keep='first')
    lookup = dict(zip(
        zip(candidates['Station_number'], candidates['date']),
        candidates[feature_cols].to_numpy(dtype=float),
    ))

    no_match = np.full(len(feature_cols), np.nan)
    rows = [
        lookup.get(key, no_match)
        for key in zip(df['nearest_station'], df[date_key])
    ]
    # The reshape keeps the (n, 3) shape even when `df` is empty.
    feats = np.array(rows, dtype=float).reshape(len(df), len(feature_cols))

    return df.assign(mean_temp=feats[:, 0], mean_dwp=feats[:, 1], mean_prcp=feats[:, 2])

def latlong(gsod,stations):
    """Return a copy of ``gsod`` with each station's coordinates attached.

    Every row's ``Station_number`` is looked up in ``stations`` and the
    matching ``Latitude`` / ``Longitude`` are written to columns of the same
    names. When a station number occurs more than once in ``stations``, the
    first occurrence is used.

    Parameters
    ----------
    gsod : pandas.DataFrame
        Needs a ``Station_number`` column.
    stations : pandas.DataFrame
        Needs ``Station_number``, ``Latitude`` and ``Longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``gsod`` itself is not modified.

    Raises
    ------
    IndexError
        If a station number in ``gsod`` has no row in ``stations``. The type
        is kept for compatibility: the earlier per-row lookup failed with an
        IndexError in the same situation.
    """
    # One row per station, indexed by station number, so the lookup is a single
    # vectorised map rather than two scans of `stations` per gsod row.
    coords = (
        stations
        .dropna(subset=['Station_number'])
        .drop_duplicates(subset='Station_number', keep='first')
        .set_index('Station_number')
    )

    station_ids = gsod['Station_number']
    unknown = ~station_ids.isin(coords.index)
    if unknown.any():
        missing = sorted(set(map(str, station_ids[unknown])))
        raise IndexError(f"stations not found in station table: {missing}")

    gsod_copy = gsod.copy()
    # Assign plain arrays: values are placed by position, independent of
    # whatever index `gsod` carries.
    gsod_copy['Latitude'] = station_ids.map(coords['Latitude']).to_numpy(dtype=float)
    gsod_copy['Longitude'] = station_ids.map(coords['Longitude']).to_numpy(dtype=float)
    return gsod_copy

In [76]:
# Plots measured on or after 2000-01-01, in chronological order, renumbered 0..n-1.
measured_since_2000 = plots['date'] >= datetime(2000, 1, 1)
plots_to_2019 = (
    plots.loc[measured_since_2000]
    .sort_values('date')
    .reset_index(drop=True)
)

In [77]:
# Station ids of the de-duplicated station table, as a tuple for the GSOD query.
stations_tuple = tuple(stations_unique['Station_number'])

# Empty frame that the per-year results are collected into.
plot_stations = pd.DataFrame()

In [79]:
# Build the plot/weather table one measurement year at a time and time the run.
# `range(1)` means only the year 2000 is processed at present.
# NOTE(review): `gsodqueryyear` is not defined or imported anywhere in the
# visible notebook (only `gsodquery` is imported from lib_DMH) — confirm where
# it comes from before relying on Restart & Run All.
t0 = time.perf_counter()

yearly_frames = []
for i in range(1):

    year = 2000+i
    plot_year = plots_to_2019.loc[plots_to_2019.measurement_year == year].reset_index(drop=True)
    gsod = gsodqueryyear(year, stations_tuple, noaa_gsod)
    gsod = latlong(gsod,stations_unique)
    plot_station = get_station3(gsod, plot_year)
    plot_station = add_temp(plot_station,gsod)
    yearly_frames.append(plot_station)

# Concatenate once, from scratch. Appending to a global accumulator inside the
# loop made a second run of this cell stack a duplicate copy of every year.
plot_stations = pd.concat(yearly_frames)

t1 = time.perf_counter()
total_t = t1-t0

In [11]:
# Rows for county code 25. The mask is computed on the re-indexed frame itself
# (via the callable) so that mask and frame share the same index; building it
# from the un-reset `plot_stations` would misalign the two.
plot_stations.reset_index(drop=True).loc[lambda frame: frame.plot_county_code == 25]

NameError: name 'plot_stations' is not defined

In [None]:
# Compose a per-plot key "<plot number>_<state code>_<county code>_" (the
# trailing underscore is part of the key) and store it next to the data.
key_columns = ['plot_phase_2_plot_number', 'plot_state_code', 'plot_county_code']
key_parts = plot_stations[key_columns].astype(str)
plot_code = pd.Series(
    ['_'.join(parts) + '_' for parts in key_parts.to_numpy()],
    index=key_parts.index,
)
plot_stations['unique_code'] = plot_code

# Persist the enriched table.
plot_stations.to_csv("plots_with_temp2_with_county.csv")

In [None]:
# Display the table after the `unique_code` column has been added.
plot_stations

In [None]:
# Number the distinct plot codes 0, 1, 2, ... in order of first appearance.
# pd.factorize does this in one pass. The earlier per-code loop assigned through
# `plot_stations['unique_code_2'].loc[...]`, which fails because the column
# does not exist yet, and chained assignment is not guaranteed to write back
# to the frame anyway.
code_numbers, distinct_codes = pd.factorize(plot_stations['unique_code'])
uniqueplot = distinct_codes.to_numpy()
plot_stations['unique_code_2'] = code_numbers
    
    

In [None]:
# Display the table again to inspect the result of the previous cell.
plot_stations

In [9]:
# Scratch frame for experimenting with idxmin / assign behaviour.
test = pd.DataFrame(
    [[2, 4, 7],
     [2, 5, 8],
     [1, 6, 9]],
    columns=['a', 'b', 'c'],
)
test

Unnamed: 0,a,b,c
0,2,4,7
1,2,5,8
2,1,6,9


In [10]:
# Scratch check: idxmin returns the index label of the smallest value in `a`.
test.a.idxmin()


2

In [82]:
# Spot-check: all GSOD rows for station '724243'.
gsod.loc[gsod.Station_number == '724243']

Unnamed: 0,Station_number,Year,Month,Mean_temp,Mean_dwp,Mean_prcp,date,Latitude,Longitude
626,724243,2000,1,34.758065,25.006452,3.33129,2000-01-01,37.087,-84.077
1954,724243,2000,2,42.665517,32.272414,0.121034,2000-02-01,37.087,-84.077
3409,724243,2000,3,50.922581,36.796774,0.08,2000-03-01,37.087,-84.077
4404,724243,2000,4,53.763333,43.363333,0.161333,2000-04-01,37.087,-84.077
6115,724243,2000,5,67.141935,56.958065,0.100323,2000-05-01,37.087,-84.077
7592,724243,2000,6,72.466667,63.333333,0.099333,2000-06-01,37.087,-84.077
9143,724243,2000,7,73.448387,65.264516,0.145161,2000-07-01,37.087,-84.077
10385,724243,2000,8,72.890323,65.454839,0.137742,2000-08-01,37.087,-84.077
11641,724243,2000,9,65.706667,58.46,0.069,2000-09-01,37.087,-84.077
13280,724243,2000,10,57.229032,46.316129,0.037419,2000-10-01,37.087,-84.077


In [67]:
# Scratch check of DataFrame.assign with columns taken from a 2-D array.
# NOTE(review): `t` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel. The ~1e-323 values in the saved output look like
# uninitialised memory (e.g. from np.empty) — confirm.
test.assign(d = t[:,0], e = t[:,1])

Unnamed: 0,a,b,c,d,e
0,1,4,7,1.0,4.0
1,2,5,8,2e-323,2.5e-323
2,3,6,9,3.5e-323,4e-323


In [72]:
# Second column of the scratch array.
# NOTE(review): `t` is not defined in the visible notebook — confirm its origin.
t[:,1]

array([4.0e+000, 2.5e-323, 4.0e-323])