In [1]:
import urllib.request as urllib2
import json
import pandas as pd
import numpy as np
import time as t
import datetime
import itertools
import os
from scipy.sparse import lil_matrix

First code blocks are focused on just gathering data for a mesh of atlanta. Essentially 4 closest points are gathered
4|V| = |E|. Then, the adj matrix (A) of edges to be collected are collected for array of coords are collected and dumped into csv. $A_{ij}$ implies info going from i to j. What edges to use is dependent only on A.

In [2]:
"""
INPUTS:
    url: a request url
OUTPUTS: 
    the data returned by calling that url
"""
def request_data_from_url(url):
    req = urllib2.Request(url)
    success = False
    while success is False:
        try: 
            #open the url
            response = urllib2.urlopen(req)
            
            #200 is the success code for http
            if response.getcode() == 200:
                success = True
        except Exception as e:
            #if we didn't get a success, then print the error and wait 5 seconds before trying again
            print(e)
            t.sleep(5)

            print("Error for URL %s: %s" % (url, datetime.datetime.now()))
            print("Retrying...")
    return response.read()

In [3]:
"""
INPUTS:
    api_key: authentication to GMaps that we're allowed to request this data
    origin: lat,long of origin
    destination: lat,long of destination
OUTPUTS
    (time_traffic,time,distance,origin_addr,destination_addr)
    real time
    usual time
    distance
"""
def scrape_gmaps_data(api_key, origin, destination):
    
    #we want to scrape the google maps website
    site = 'https://maps.googleapis.com/maps/api/'
    
    #we want to use the distance matrix service
    service = 'distancematrix/json?'
    
    #input origin and destination from the user 
    locations = f"origins={origin[0]},{origin[1]}&destinations={destination[0]},{destination[1]}&departure_time=now&"

    #input api key from user
    key = 'key=%s' % (api_key)
    
    #construct request url
    request_url = site + service + locations + key
    
    # get data from api
    data = json.loads(request_data_from_url(request_url))
    
    #extract travel real time from response
    realtime = data['rows'][0]['elements'][0]['duration_in_traffic']['value']
    
    #extract travel avg time from response
    time = data['rows'][0]['elements'][0]['duration']['value']
    
    #extract distance from response
    dist = data['rows'][0]['elements'][0]['duration_in_traffic']['value'] 
    
    # origin address
    oa = data['origin_addresses'][0]
    
    # destination address
    da = data['destination_addresses'][0]
    
    return (realtime, time, dist, oa, da)

Proof that this works! And gives us what we want

In [4]:
with open("supersecretapikey.txt","r") as fh:
    api_key = fh.readlines()[0]
origin = (33.776360,-84.397824) # Campus Center
destination = (33.772428,-84.392709) #Bobby Dodd

scrape_gmaps_data(api_key, origin, destination)

(248,
 243,
 248,
 '788 Research Dr NW, Atlanta, GA 30313, USA',
 '177 N Ave NW, Atlanta, GA 30313, USA')

In [5]:
"""
INPUTS:
    adjacency matrix: sparse boolean matrix representing which edges to query for
    location_matrix: np.array floating points of size 2x|N| (Longitude, Latitude)
    period: how often to query in minutes
    number_of_queries: how many times to query
    api_key: api key for your gcp account 
OUTPUTS
    Pandas df
"""
def graph_query(location_matrix, adjacency_matrix, period, number_of_queries, api_key):
    # basic checks 
    if location_matrix.shape[1] != adjacency_matrix.shape[0]:
        raise ValueError("length of locations is not the same as size of adjacency matrix")
    if adjacency_matrix.shape[0] != adjacency_matrix.shape[1]:
        raise ValueError("invalid adjacency matrix shape")
    
    # see how many elements there are in the matrix
    mag_vset = 0
    cx = adjacency_matrix.tocoo()
    for i,j,v in zip(cx.row, cx.col, cx.data): mag_vset += 1
    mag_vset *= number_of_queries
    
    # make the pandas dataframe
    df = pd.DataFrame(np.zeros(shape=(mag_vset,11)))
    df.columns = ["Time","Actual Query Time","Origin Latitude","Origin Longitude","Origin Address",
                  "Destination Latitude","Destination Longitude","Destination Address",
                  "Distance","Traffic Time","Normal Time"]
    
    # go through all elements with the query
    df_index = 0
    for qi in range(number_of_queries):
        time = datetime.datetime.now()
        for i,j,v in zip(cx.row, cx.col, cx.data):
            # get origin and destination
            origin = (location_matrix[:,i][0],location_matrix[:,i][1])
            destination = (location_matrix[:,j][0],location_matrix[:,j][1])
            # Actually Query GCP (Distance Matrix)
            (realtime_travel, time_travel, dist, oa, da) = scrape_gmaps_data(api_key, origin, destination)
            # Add all data to dataframe
            df.iloc[df_index,0], df.iloc[df_index,1] = time, datetime.datetime.now()
            df.iloc[df_index,2],df.iloc[df_index,3],df.iloc[df_index,4] = origin[0], origin[1], oa
            df.iloc[df_index,5],df.iloc[df_index,6],df.iloc[df_index,7] = destination[0], destination[1], da
            df.iloc[df_index,8], df.iloc[df_index,9], df.iloc[df_index,10] = realtime_travel, time_travel, dist
            df_index += 1
        # wait to get to whole period before new queries
        if qi != number_of_queries - 1:
            t.sleep(period * 60 - (datetime.datetime.now() - time).total_seconds())
    return df

In [6]:
location_matrix = np.array([[33.776360,33.772428],[-84.397824,-84.392709]])
adjacency_matrix = lil_matrix((2,2),dtype = '?')
adjacency_matrix[0,1] = True
adjacency_matrix[1,0] = True

df = graph_query(location_matrix,adjacency_matrix,3/60,3,api_key)

In [7]:
# center of the mesh
center_mesh = [33.776360,-84.397824]
# how many rows to make mesh
num_rows = 5
# how many columns
num_columns = 5
# distance between them
delta = .005
lat = []
lon = []
# make all vertices
for i,j in itertools.product(range(num_columns), range(num_rows)):
    lat.append(center_mesh[0] + (i-num_rows/2)*delta)
    lon.append(center_mesh[1] + (j-num_columns/2)*delta)
# magnitude of vector set
mag_vset = len(lat)
# numpy this boy
location_matrix = np.array([lat,lon])
# add only the "mesh" coordinates (one hop neighbors)
adjacency_matrix = lil_matrix((mag_vset,mag_vset),dtype = '?')

(2, 25)


In [None]:
#### Take home project: Add all the "mesh" edges to the adjacency list in O(N) time####






In [97]:
#### Take home project: Implement csv_graph method, which goes from a csv to a graph representation of the query.
#### Assume that the dataset was created as above. Please make the adj. mat. sparse (look above for example). The values
#### on the edges should not be booleans but instead time magnitude. The default will be traffic time BUT you should support
#### using normal time and distance (use that optional param). This should be efficient!!!####

def csv_graph(csv, location_matrix, adjacency_matrix, value = "Traffic Time"):
    return adjacency_matrix
    



In [90]:
df = graph_query(location_matrix,adjacency_matrix,3,1)

In [11]:
path = os.path.join(os.getcwd(), "..")
path = os.path.join(path, "dataset")
path = os.path.join(path, "dataset.csv")
df.to_csv(path)