In [3]:
import sqlite3
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas_gbq
from scipy.spatial import distance
import numpy as np

In [16]:
# Get a dataframe of all the cities at hand, along with their latitude and longitude
# NOTE(review): `conn` is left open here in case a later cell still uses it;
# call conn.close() once nothing else needs the database.
conn = sqlite3.connect('../cityDB.sqlite')
cities = pd.read_sql_query("SELECT cid, city, state, lat, lng  FROM cities;", conn)

# Pair each city's latitude and longitude into a (lat, lng) tuple.
# zip() walks the two columns directly instead of building a Series per row,
# and yields an empty list (rather than an empty DataFrame) when the query
# returns no rows, so the assignment below still works in that case.
cities['coordinates'] = list(zip(cities['lat'], cities['lng']))

In [36]:
# Get a list of all the stations in the US from NOAA data publicly available on BigQuery
# The key file and project id are named up front so they are easy to find and change.
BIGQUERY_KEY_PATH = 'bigquery_key.json'
BIGQUERY_PROJECT_ID = "dva-destination-recommender"

credentials = service_account.Credentials.from_service_account_file(BIGQUERY_KEY_PATH)
stations_query = ("""SELECT *
FROM `bigquery-public-data.noaa_gsod.stations`
WHERE country = 'US'""")
stations_raw = pandas_gbq.read_gbq(stations_query,
                                   project_id=BIGQUERY_PROJECT_ID,
                                   credentials=credentials, dialect='standard')

# Remove stations with null values (a null in any column drops the row), then
# add a 'coordinates' column holding a NumPy [lat, lon] pair for each station.
# Building the pairs with zip() avoids a row-wise apply and also copes with an
# empty result; deriving `stations` from `stations_raw` instead of mutating in
# place keeps this cell safe to re-run.
stations = stations_raw.dropna(axis=0).assign(
    coordinates=lambda frame: [
        np.array(pair) for pair in zip(frame['lat'], frame['lon'])
    ]
)

In [59]:
def find_closest_station(city_coord, stations):
    """
    Find the weather station geographically closest to a city.

    Stations are ranked by great-circle (haversine) distance rather than by
    straight-line distance between raw (lat, lng) pairs: a degree of longitude
    covers less ground the further a point is from the equator, so treating
    degrees as planar coordinates can pick the wrong station.

    INPUTS:
    city_coord (tuple): the (latitude, longitude) of the city, in degrees
    stations (pandas DataFrame): one row per station, with a 'coordinates'
        column holding (latitude, longitude) pairs in degrees. The USAF and
        WBAN identifiers are read from the first and second columns.

    OUTPUT:
    ids (str): the identifiers of the closest station, formatted as 'usaf|wban'

    RAISES:
    ValueError: if stations has no rows
    """
    stations_coord = np.asarray(stations['coordinates'].tolist(), dtype=float)
    if stations_coord.size == 0:
        raise ValueError("stations must contain at least one station")

    city_lat, city_lng = np.radians(np.asarray(city_coord, dtype=float))
    station_lat = np.radians(stations_coord[:, 0])
    station_lng = np.radians(stations_coord[:, 1])

    # Haversine term `a`. The distance itself is 2 * R * arcsin(sqrt(a)), which
    # grows monotonically with `a`, so the smallest `a` marks the nearest
    # station and the final conversion to kilometres can be skipped.
    haversine_term = (
        np.sin((station_lat - city_lat) / 2.0) ** 2
        + np.cos(city_lat) * np.cos(station_lat)
        * np.sin((station_lng - city_lng) / 2.0) ** 2
    )
    closest_index = int(np.argmin(haversine_term))

    # Positional lookup: column 0 is the USAF id and column 1 the WBAN id.
    usaf = stations.iloc[closest_index, 0]
    wban = stations.iloc[closest_index, 1]

    ids = str(usaf) + '|' + str(wban)

    return ids

In [60]:
# Look up the nearest station for every city, keeping the cities' row order
closest_stations = [
    find_closest_station(coord, stations)
    for coord in cities['coordinates']
]

In [61]:
# Attach each city's closest-station identifier string as a new column.
# Relies on closest_stations being in the same row order as cities.
cities['closest_station'] = closest_stations

In [63]:
# Break the combined 'usaf|wban' identifier into one column per part
station_id_parts = cities['closest_station'].str.split('|', expand=True)
cities[['closest_station_usaf', 'closest_station_wban']] = station_id_parts

In [64]:
# Preview the first rows to sanity-check the station columns added above
cities.head()

Unnamed: 0,cid,city,state,lat,lng,coordinates,closest_station,closest_station_usaf,closest_station_wban
0,g60763,New York,NY,40.712775,-74.005973,"(40.7127753, -74.0059728)",720553|99999,720553,99999
1,g32655,Los Angeles,CA,34.052234,-118.243685,"(34.0522342, -118.2436849)",999999|93134,999999,93134
2,g35805,Chicago,IL,41.639202,-87.454763,"(41.63920239999999, -87.4547635)",725337|04807,725337,4807
3,g56003,Houston,TX,29.760427,-95.369803,"(29.7604267, -95.3698028)",720594|99999,720594,99999
4,g31310,Phoenix,AZ,33.448377,-112.074037,"(33.4483771, -112.0740373)",999999|23183,999999,23183
