# Testing Location Linking File

All the code here also appears in the data_linking.py file; this notebook is only used for debugging.

In [1]:
# Imports
from __future__ import absolute_import, print_function, unicode_literals

import os

import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from sqlalchemy import create_engine
import vincenty

In [3]:
# Set up connection to postgres tables.
# A connection URL supplied through the SOLARENERGY_DB_URL environment
# variable takes precedence, so credentials do not have to live in the notebook.
db_loc = os.environ.get('SOLARENERGY_DB_URL')
if not db_loc:
    # SECURITY(review): legacy fallback kept so existing runs keep working.
    # It embeds a username/password in source -- rotate the credential and
    # delete this branch once the environment variable is set everywhere.
    db_loc = 'postgresql+psycopg2://teamsunshinedemo:oscarisawesome123'
    db_loc += '@teamsunshinedemo.coga7nzsvf0h.us-east-1.rds.amazonaws.com:'
    db_loc += '5432/solarenergy'
engine = create_engine(db_loc)

In [4]:
# One row per distinct (loc_id, latitude, longitude) in the generation table,
# with the coordinates cast to decimal on the database side.
query = """
        SELECT DISTINCT loc_id,
        CAST(latitude as decimal) AS latitude,
        CAST(longitude as decimal) AS longitude
        FROM generation
        """
solar_locations_df = pd.read_sql(sql=query, con=engine, index_col='loc_id')

In [7]:
# preview the first three solar locations
solar_locations_df.head(n=3)

Unnamed: 0_level_0,latitude,longitude
loc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ELEC.PLANT.GEN.56889-SUN-ALL.M,40.727386,-74.3783
ELEC.PLANT.GEN.58536-SUN-ALL.M,40.352222,-74.187778
ELEC.PLANT.GEN.59361-SUN-ALL.M,42.200833,-72.322222


In [9]:
# One row per distinct (wban_id, latitude, longitude) in the weather_stations
# table, with the coordinates cast to decimal on the database side.
query = """
        SELECT DISTINCT wban_id,
        CAST(latitude as decimal) AS latitude,
        CAST(longitude as decimal) AS longitude
        FROM weather_stations
        """
weather_locations_df = pd.read_sql(sql=query, con=engine, index_col='wban_id')

In [10]:
# preview the first three weather stations
weather_locations_df.head(n=3)

Unnamed: 0_level_0,latitude,longitude
wban_id,Unnamed: 1_level_1,Unnamed: 2_level_1
94088,44.51,-104.43
4237,47.51,-123.81
23906,28.3,-96.82


### Location Linking

In [76]:
# Walk through the linking logic by hand for a single solar location.
solar_df, uscrn_df = solar_locations_df, weather_locations_df
loc_id = 'ELEC.PLANT.GEN.56889-SUN-ALL.M'

In [77]:
# (latitude, longitude) pair of the chosen solar location
solar_loc = (solar_df.loc[loc_id, 'latitude'],
             solar_df.loc[loc_id, 'longitude'])
# take a look
print(solar_loc)

(40.727386000000003, -74.378299999999996)


In [78]:
# start from an empty frame that has the same columns as the station table
neighbors = uscrn_df.iloc[:0]
# take a look: row count first, then the (empty) frame itself
print(len(neighbors))
neighbors

0


Unnamed: 0_level_0,latitude,longitude
wban_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [85]:
# half-width of the search box, in degrees
radius = 2

# latitude bounds (xmin/xmax) and longitude bounds (ymin/ymax) of the box
xmin = solar_loc[0] - radius
xmax = solar_loc[0] + radius
ymin = solar_loc[1] - radius
ymax = solar_loc[1] + radius

print(xmin, xmax, ymin, ymax)

38.727386 42.727386 -76.3783 -72.3783


In [86]:
# boolean masks (bounds inclusive) for stations inside the search box
lat_r = uscrn_df.latitude.between(xmin, xmax)
long_r = uscrn_df.longitude.between(ymin, ymax)
neighbors = uscrn_df.loc[lat_r & long_r]

In [87]:
# display the stations selected by the latitude/longitude masks
neighbors

Unnamed: 0_level_0,latitude,longitude
wban_id,Unnamed: 1_level_1,Unnamed: 2_level_1
64756,41.78,-73.74
64758,42.44,-76.24
3761,39.85,-75.78


In [82]:
# Linear scan over the candidate stations, keeping the smallest distance seen.
closest_station, dist_to_closest_station = None, np.inf
for wban_id, (lat, lng) in neighbors.iterrows():
    dist = vincenty.vincenty((lat, lng), solar_loc)
    if not dist < dist_to_closest_station:
        # not an improvement on the best candidate so far
        continue
    closest_station, dist_to_closest_station = wban_id, dist

print(closest_station, dist_to_closest_station)

64756 128.560289


In [96]:
def find_closest_station(loc_id, solar_df, uscrn_df):
    """
    Find the weather station closest to a solar location, given its id.

    Parameters
    ----------
    loc_id : label
        Index label of the solar location in `solar_df`.
    solar_df : pandas.DataFrame
        Solar locations indexed by loc_id, with `latitude` and `longitude`
        columns.
    uscrn_df : pandas.DataFrame
        Weather stations indexed by wban_id, with `latitude` and `longitude`
        columns.

    Returns
    -------
    tuple
        (closest_station, dist_to_closest_station): the index label of the
        nearest station and the distance reported by vincenty.vincenty
        (presumably kilometres, the package default -- confirm).
        (None, np.inf) when `uscrn_df` has no measurable station.
    """
    # coordinates of the solar location as a (latitude, longitude) pair
    origin = (solar_df.latitude[loc_id], solar_df.longitude[loc_id])

    closest_station = None
    dist_to_closest_station = np.inf

    # Measure every station. The previous expanding lat/long box could miss
    # the true nearest station (one just outside the box can be closer than
    # one in a box corner) and looped forever when `uscrn_df` was empty.
    stations = zip(uscrn_df.index, uscrn_df.latitude, uscrn_df.longitude)
    for wban_id, lat, lng in stations:
        dist = vincenty.vincenty((lat, lng), origin)
        if dist is None:
            # NOTE(review): vincenty appears to return None when its
            # iteration fails to converge -- skip such stations; confirm.
            continue
        if dist < dist_to_closest_station:
            dist_to_closest_station = dist
            closest_station = wban_id

    return closest_station, dist_to_closest_station

In [97]:
# nearest station for the single location picked above
find_closest_station(loc_id=loc_id, solar_df=solar_df, uscrn_df=uscrn_df)

(u'64756', 128.560289)

In [99]:
# small sample of solar locations (first ten rows)
mini = solar_df.head(n=10)
mini

Unnamed: 0_level_0,latitude,longitude
loc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ELEC.PLANT.GEN.56889-SUN-ALL.M,40.727386,-74.3783
ELEC.PLANT.GEN.58536-SUN-ALL.M,40.352222,-74.187778
ELEC.PLANT.GEN.59361-SUN-ALL.M,42.200833,-72.322222
ELEC.PLANT.GEN.57485-SUN-ALL.M,40.483888,-74.856666
ELEC.PLANT.GEN.59956-SUN-ALL.M,35.5262,-78.2898
ELEC.PLANT.GEN.58540-SUN-ALL.M,32.713889,-108.248056
ELEC.PLANT.GEN.58373-SUN-ALL.M,35.871389,-119.4625
ELEC.PLANT.GEN.58561-SUN-ALL.M,42.833056,-70.991944
ELEC.PLANT.GEN.58315-SUN-ALL.M,35.779167,-81.175556
ELEC.PLANT.GEN.58604-SUN-ALL.M,38.402778,-112.988889


In [102]:
def find_closest_station(solar_loc, uscrn_df):
    """
    Find the weather station closest to a solar location, given its
    coordinates.

    Parameters
    ----------
    solar_loc : sequence
        Latitude and longitude of the solar location, in that order. A plain
        tuple and a DataFrame row (Series) whose first two values are
        latitude and longitude are both accepted.
    uscrn_df : pandas.DataFrame
        Weather stations indexed by wban_id, with `latitude` and `longitude`
        columns.

    Returns
    -------
    tuple
        (closest_station, dist_to_closest_station): the index label of the
        nearest station and the distance reported by vincenty.vincenty
        (presumably kilometres, the package default -- confirm).
        (None, np.inf) when `uscrn_df` has no measurable station.
    """
    # Take the first two values by position without using [0]/[1] lookups,
    # which are label lookups on a Series in recent pandas versions.
    origin = tuple(solar_loc)[:2]

    closest_station = None
    dist_to_closest_station = np.inf

    # Measure every station. The previous expanding lat/long box could miss
    # the true nearest station (one just outside the box can be closer than
    # one in a box corner) and looped forever when `uscrn_df` was empty.
    stations = zip(uscrn_df.index, uscrn_df.latitude, uscrn_df.longitude)
    for wban_id, lat, lng in stations:
        dist = vincenty.vincenty((lat, lng), origin)
        if dist is None:
            # NOTE(review): vincenty appears to return None when its
            # iteration fails to converge -- skip such stations; confirm.
            continue
        if dist < dist_to_closest_station:
            dist_to_closest_station = dist
            closest_station = wban_id

    return closest_station, dist_to_closest_station

In [118]:
# Collect one [loc_id, wban_id, distance] row per solar location.
# Note: the loop variables (loc_id, solar_loc, wban_id, distance) are
# module-level names, so this overwrites the `loc_id` set in an earlier cell.
temp_lst=[]
for loc_id, solar_loc in solar_df.iterrows():
    # solar_loc is the row for this location (latitude, longitude)
    wban_id, distance = find_closest_station(solar_loc, uscrn_df)
    temp_lst.append([loc_id,wban_id, distance])

In [120]:
# Assemble the collected rows into a frame.
# Fixed: the rows are accumulated in `temp_lst`; `lst` is never defined in
# this notebook, so the original line raised NameError on a fresh kernel.
closest_stations_df = pd.DataFrame(temp_lst, columns=['loc_id', 'wban_id', 'distance'])

In [123]:
# index the result by solar location id
closest_stations = closest_stations_df.set_index(keys='loc_id')

# Final pass through functions

In [3]:
def get_locations(verbose=False, db_url=None):
    """
    Pull the coordinates of solar locations and weather stations from the
    postgres DB.

    Parameters
    ----------
    verbose : bool
        When True, print how many rows were loaded for each table.
    db_url : str, optional
        SQLAlchemy connection URL. When omitted, the SOLARENERGY_DB_URL
        environment variable is used; if that is unset too, the legacy
        built-in URL is used.

    Returns
    -------
    tuple
        (solar_df, uscrn_df): solar locations indexed by loc_id and weather
        stations indexed by wban_id, each with `latitude` and `longitude`
        columns.
    """
    # Resolve the connection URL: explicit argument, then environment.
    if db_url is None:
        db_url = os.environ.get('SOLARENERGY_DB_URL')
    if not db_url:
        # SECURITY(review): legacy fallback kept so existing callers keep
        # working. It embeds a username/password in source -- rotate the
        # credential and delete this branch once callers pass `db_url` or
        # set the environment variable.
        db_url = 'postgresql+psycopg2://teamsunshinedemo:oscarisawesome123'
        db_url += '@teamsunshinedemo.coga7nzsvf0h.us-east-1.rds.amazonaws.com:'
        db_url += '5432/solarenergy'
    engine = create_engine(db_url)

    try:
        # get solar locations
        query = """
            SELECT DISTINCT loc_id,
            CAST(latitude as decimal) AS latitude,
            CAST(longitude as decimal) AS longitude
            FROM generation
            """
        solar_df = pd.read_sql(query, engine, index_col='loc_id')
        if verbose:
            print('... loaded locations for %s solar locations' % (len(solar_df)))

        # get weather locations
        query = """
            SELECT DISTINCT wban_id,
            CAST(latitude as decimal) AS latitude,
            CAST(longitude as decimal) AS longitude
            FROM weather_stations
            """
        uscrn_df = pd.read_sql(query, engine, index_col='wban_id')
        if verbose:
            print('... loaded locations for %s uscrn stations' % (len(uscrn_df)))
    finally:
        # the engine is private to this call; release its pooled connections
        engine.dispose()

    return solar_df, uscrn_df

In [4]:
# load both location tables, printing the row counts
s_df, udf = get_locations(True)

... loaded locations for 1298 solar locations
... loaded locations for 233 uscrn stations


In [8]:
def find_closest_station(solar_loc, uscrn_df):
    """
    Find the weather station closest to a solar location.

    Parameters
    ----------
    solar_loc : sequence
        Latitude and longitude of the solar location, in that order. A plain
        tuple and a DataFrame row (Series) whose first two values are
        latitude and longitude are both accepted.
    uscrn_df : pandas.DataFrame
        Weather stations indexed by wban_id, with `latitude` and `longitude`
        columns.

    Returns
    -------
    tuple
        (closest_station, dist_to_closest_station): the index label of the
        nearest station and the distance reported by vincenty.vincenty
        (presumably kilometres, the package default -- confirm).
        (None, np.inf) when `uscrn_df` has no measurable station.
    """
    # Take the first two values by position without using [0]/[1] lookups,
    # which are label lookups on a Series in recent pandas versions.
    origin = tuple(solar_loc)[:2]

    closest_station = None
    dist_to_closest_station = np.inf

    # Measure every station. The previous expanding lat/long box could miss
    # the true nearest station (one just outside the box can be closer than
    # one in a box corner) and looped forever when `uscrn_df` was empty.
    stations = zip(uscrn_df.index, uscrn_df.latitude, uscrn_df.longitude)
    for wban_id, lat, lng in stations:
        dist = vincenty.vincenty((lat, lng), origin)
        if dist is None:
            # NOTE(review): vincenty appears to return None when its
            # iteration fails to converge -- skip such stations; confirm.
            continue
        if dist < dist_to_closest_station:
            dist_to_closest_station = dist
            closest_station = wban_id

    return closest_station, dist_to_closest_station

In [11]:
def create_closest_station_df(solar_df, uscrn_df):
    """
    Build a frame that maps every solar location to its closest station.

    The result is indexed by loc_id and has two columns: wban_id (the
    closest station) and distance (the distance to that station), both as
    returned by find_closest_station.
    """
    def _row(loc_id, solar_loc):
        # one output record: location id, nearest station id, its distance
        wban_id, distance = find_closest_station(solar_loc, uscrn_df)
        return [loc_id, wban_id, distance]

    records = [_row(loc_id, solar_loc)
               for loc_id, solar_loc in solar_df.iterrows()]

    # put results in a data frame keyed by location id
    frame = pd.DataFrame(records, columns=['loc_id', 'wban_id', 'distance'])
    return frame.set_index('loc_id')

In [12]:
# link every solar location to its closest weather station
new_df = create_closest_station_df(solar_df=s_df, uscrn_df=udf)

In [13]:
# preview the first five links
new_df.head(n=5)

Unnamed: 0_level_0,wban_id,distance
loc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ELEC.PLANT.GEN.56889-SUN-ALL.M,64756,128.560289
ELEC.PLANT.GEN.58536-SUN-ALL.M,3761,146.768766
ELEC.PLANT.GEN.59361-SUN-ALL.M,54796,102.243005
ELEC.PLANT.GEN.57485-SUN-ALL.M,3761,105.548258
ELEC.PLANT.GEN.59956-SUN-ALL.M,3758,87.539528


# Confirming script ran

In [14]:
# Re-create the engine for the read-back check.
# A connection URL supplied through the SOLARENERGY_DB_URL environment
# variable takes precedence, so credentials do not have to live in the notebook.
db_loc = os.environ.get('SOLARENERGY_DB_URL')
if not db_loc:
    # SECURITY(review): legacy fallback kept so existing runs keep working.
    # It embeds a username/password in source -- rotate the credential and
    # delete this branch once the environment variable is set everywhere.
    db_loc = 'postgresql+psycopg2://teamsunshinedemo:oscarisawesome123'
    db_loc += '@teamsunshinedemo.coga7nzsvf0h.us-east-1.rds.amazonaws.com:'
    db_loc += '5432/solarenergy'
engine = create_engine(db_loc)

In [15]:
# read back the closest_stations table (presumably written by data_linking.py)
query = "SELECT * from closest_stations"
test = pd.read_sql(sql=query, con=engine, index_col='loc_id')

In [16]:
# show a sample instead of dumping the whole table into the notebook output
test.head(10)

Unnamed: 0_level_0,wban_id,distance
loc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ELEC.PLANT.GEN.56889-SUN-ALL.M,64756,128.560289
ELEC.PLANT.GEN.58536-SUN-ALL.M,03761,146.768766
ELEC.PLANT.GEN.59361-SUN-ALL.M,54796,102.243005
ELEC.PLANT.GEN.57485-SUN-ALL.M,03761,105.548258
ELEC.PLANT.GEN.59956-SUN-ALL.M,03758,87.539528
ELEC.PLANT.GEN.58540-SUN-ALL.M,53025,92.582347
ELEC.PLANT.GEN.58373-SUN-ALL.M,53152,166.329221
ELEC.PLANT.GEN.58561-SUN-ALL.M,54795,29.956636
ELEC.PLANT.GEN.58315-SUN-ALL.M,53878,131.116470
ELEC.PLANT.GEN.58604-SUN-ALL.M,53185,56.080586
