# Plotting Geo-Data on GoogleMaps using Bokeh Visualization Library

### Example: plotting the location of NYC tech companies and startups for Metis Data Science Project Benson

In [None]:
from __future__ import print_function, division

In [None]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
import datetime
import re

In [None]:
# Install Google APIs Client Library for Python
# https://developers.google.com/api-client-library/python/start/installation

import googlemaps
from datetime import datetime

In [None]:
# Supply your personal Google Maps API key.
# Apply for one here:
# https://developers.google.com/maps/documentation/javascript/get-api-key
#
# The key is taken from the GOOGLE_MAPS_API_KEY environment variable when it is
# set, so a real key does not have to be saved inside the notebook.
# If the variable is not set, replace the placeholder string below with your key.
import os

gmaps = googlemaps.Client(
    key=os.environ.get("GOOGLE_MAPS_API_KEY", "ENTER YOUR API KEY HERE")
)

In [None]:
from bokeh.io import output_file, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, DataRange1d, PanTool, WheelZoomTool, BoxSelectTool, LabelSet, Label
)

In [None]:
# Load the NYC tech-company listing (Source: Mapped in NYC) into a dataframe,
# reading the CSV export directly from data.cityofnewyork.us
NYC_TECH_COMPANIES_CSV = 'https://data.cityofnewyork.us/api/views/f4yq-wry5/rows.csv'
df = pd.read_csv(NYC_TECH_COMPANIES_CSV)

In [None]:
# Preview the first 10 rows of the dataframe
df.head(10)

In [None]:
# List the column labels (the cleaning steps below rely on "Address" and "City")
df.columns

In [None]:
# Count the number of rows in the raw data, before any cleaning
len(df)

In [None]:
# Turn the placeholder entry ' - ' (matched together with its surrounding
# quote characters) into NaN so it is treated as missing data
df = df.replace(to_replace="' - '", value=np.nan)

In [None]:
# Keep only the rows that have an Address
df = df.dropna(subset=["Address"])

In [None]:
# Check how many rows remain after dropping the rows without an Address
len(df)

In [None]:
# Remove the quotation marks wrapped around each Address value.
# Only leading/trailing quote characters are stripped, so an apostrophe that is
# part of the address itself is kept.
df['Address'] = df['Address'].str.strip("'")

In [None]:
# Check value counts of City column
#df.City.value_counts()

In [None]:
# Correct for different spellings of "New York": 'New York City' -> 'New York'.
# Only the City column is rewritten; a replace on the whole dataframe would also
# change any other column that happens to hold the same string.
# NOTE: Probably easier to do this using regex!
df['City'] = df['City'].replace("'New York City'", "'New York'")

In [None]:
# Correct for different spellings of "New York": 'New York ' (trailing space) -> 'New York'.
# Only the City column is rewritten; a replace on the whole dataframe would also
# change any other column that happens to hold the same string.
# NOTE: Probably easier to do this using regex!
df['City'] = df['City'].replace("'New York '", "'New York'")

In [None]:
# Correct for different spellings of "New York": 'NY' -> 'New York'.
# Only the City column is rewritten; a replace on the whole dataframe would also
# change any other column that happens to hold the same string.
# NOTE: Probably easier to do this using regex!
df['City'] = df['City'].replace("'NY'", "'New York'")

In [None]:
# Correct for different spellings of "New York": 'NEW YORK' -> 'New York'.
# Only the City column is rewritten; a replace on the whole dataframe would also
# change any other column that happens to hold the same string.
# NOTE: Probably easier to do this using regex!
df['City'] = df['City'].replace("'NEW YORK'", "'New York'")

In [None]:
# Re-count the distinct City values now that the spellings have been unified
df['City'].value_counts()

In [None]:
# Keep only the companies whose City is 'New York' (value still carries its quotes)
is_new_york = df['City'] == "'New York'"
df = df[is_new_york]

In [None]:
# Check the number of rows remaining after filtering on City
len(df)

In [None]:
# Remove the quotation marks from the strings in column "City"
def _drop_quotes(text):
    """Return text with every single-quote character removed."""
    return re.sub("'", '', text)


df['City'] = df['City'].map(_drop_quotes)

In [None]:
# Build the full address in one column by concatenating "Address" and "City",
# separated by a comma and a space
df["Full_Address"] = df['Address'].str.cat(df['City'], sep=', ')

In [None]:
# Define a function which finds the latitude of a location when given an address
# For more info on the gmaps.geocode function:
# https://github.com/googlemaps/google-maps-services-python

def get_lat(full_address):
    """Return the latitude of the first geocoding match for full_address.

    The address is looked up with gmaps.geocode; None is returned when the
    lookup yields no result.
    """
    matches = gmaps.geocode(full_address)
    if not matches:
        return None
    best_match = matches[0]
    return best_match["geometry"]["location"]["lat"]

In [None]:
# Try the get_lat function on a single address
get_lat('711 Third Avenue, New York')

In [None]:
# NOTE: Running these functions to get the latitude and longitude takes some time and often times out.
# Workaround: Run the function over the dataframe in small chunks (e.g. first 100 rows, then 500, etc.)

In [None]:
# Apply get_lat function to first 100 rows.
# Results are written only into the rows of this chunk that do not have a
# latitude yet, so values stored by other chunks are neither reset to NaN nor
# geocoded a second time. (Rows whose lookup returned nothing are retried.)
chunk = df.Full_Address.head(100)
if 'Latitude' not in df.columns:
    df['Latitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Latitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Latitude'] = pending.apply(get_lat)

In [None]:
# Apply get_lat function to first 500 rows.
# Results are written only into the rows of this chunk that do not have a
# latitude yet, so rows geocoded by an earlier chunk are not looked up again
# and values outside the chunk are not reset to NaN.
# (Rows whose lookup returned nothing are retried.)
chunk = df.Full_Address.head(500)
if 'Latitude' not in df.columns:
    df['Latitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Latitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Latitude'] = pending.apply(get_lat)

In [None]:
# Define a function which finds the longitude of a location when given an address
# For more info on the gmaps.geocode function:
# https://github.com/googlemaps/google-maps-services-python

def get_lon(full_address):
    """Return the longitude of the first geocoding match for full_address.

    The address is looked up with gmaps.geocode; None is returned when the
    lookup yields no result.
    """
    matches = gmaps.geocode(full_address)
    if not matches:
        return None
    best_match = matches[0]
    return best_match["geometry"]["location"]["lng"]

In [None]:
# Apply get_lon function to first 100 rows.
# Results are written only into the rows of this chunk that do not have a
# longitude yet, so values stored by other chunks are neither reset to NaN nor
# geocoded a second time. (Rows whose lookup returned nothing are retried.)
chunk = df.Full_Address.head(100)
if 'Longitude' not in df.columns:
    df['Longitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Longitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Longitude'] = pending.apply(get_lon)

In [None]:
# Apply get_lon function to first 500 rows.
# Results are written only into the rows of this chunk that do not have a
# longitude yet, so rows geocoded by an earlier chunk are not looked up again
# and values outside the chunk are not reset to NaN.
# (Rows whose lookup returned nothing are retried.)
chunk = df.Full_Address.head(500)
if 'Longitude' not in df.columns:
    df['Longitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Longitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Longitude'] = pending.apply(get_lon)

In [None]:
# Apply get_lat function to last 500 rows (for total 1000 rows).
# The results are written only into those rows: assigning the partial result to
# the whole 'Latitude' column would reset the latitudes already geocoded for
# the first 500 rows to NaN. Rows that already have a latitude are skipped.
chunk = df.Full_Address.tail(500)
if 'Latitude' not in df.columns:
    df['Latitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Latitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Latitude'] = pending.apply(get_lat)

In [None]:
# Apply get_lon function to last 500 rows (for total 1000 rows).
# The results are written only into those rows: assigning the partial result to
# the whole 'Longitude' column would reset the longitudes already geocoded for
# the first 500 rows to NaN. Rows that already have a longitude are skipped.
chunk = df.Full_Address.tail(500)
if 'Longitude' not in df.columns:
    df['Longitude'] = np.nan
pending = chunk[df.loc[chunk.index, 'Longitude'].isna()]
if not pending.empty:
    df.loc[pending.index, 'Longitude'] = pending.apply(get_lon)

In [None]:
# Create a new, empty dataframe with a single 'station' column for the subway stations
stationsdf = pd.DataFrame(columns=['station'])

In [None]:
# Input subway stations of interest (e.g. the busiest ones)
stations_of_interest = [
    'GRD CNTRL-42 ST',
    'PATH NEW WTC',
    '34 ST-PENN STA',
    'FLUSHING-MAIN',
    '14 ST-UNION SQ',
    '59 ST COLUMBUS',
    'JKSN HT-ROOSVLT',
    '34 ST-HERALD SQ',
    '86 ST',
]
stationsdf['station'] = stations_of_interest

In [None]:
# Look up the latitude of every station by passing its name to get_lat
stationsdf['Latitude'] = stationsdf['station'].map(get_lat)

In [None]:
# Look up the longitude of every station by passing its name to get_lon
stationsdf['Longitude'] = stationsdf['station'].map(get_lon)

In [None]:
# Check the stations dataframe (station names with their latitude and longitude)
stationsdf

In [None]:
# PLOT COORDINATES USING BOKEH VISUALISATION LIBRARY
# Draws the subway stations and the NYC tech companies together on one Google Map.
# More info:
# https://bokeh.pydata.org/en/latest/docs/user_guide/geo.html#google-maps-support

# Map view: latitude/longitude, road-map style and zoom level
map_options = GMapOptions(lat=40.7527, lng=-73.9772, map_type="roadmap", zoom=15)

plot = GMapPlot(
    x_range=DataRange1d(),
    y_range=DataRange1d(),
    map_options=map_options,
)
plot.title.text = "Target Subway Stations (red) & NYC Tech Companies (blue)"

# For GMaps to function, Google requires you obtain and enable an API key:
#
#     https://developers.google.com/maps/documentation/javascript/get-api-key
#
# Replace the value below with your personal API key:
plot.api_key = "ENTER GOOGLEMAPS API KEY"

# Properties shared by both circle glyphs: plot "lon" on x and "lat" on y,
# slightly transparent fill, no outline
shared_circle_props = {"x": "lon", "y": "lat", "fill_alpha": 0.8, "line_color": None}

# Layer 1: tech companies as blue circles
source1 = ColumnDataSource(
    data={
        "lat": df['Latitude'],
        "lon": df['Longitude'],
    }
)
circle1 = Circle(size=12, fill_color="blue", **shared_circle_props)
plot.add_glyph(source1, circle1)

# Layer 2: subway stations as larger red circles, added after the companies
source2 = ColumnDataSource(
    data={
        "lat": stationsdf['Latitude'],
        "lon": stationsdf['Longitude'],
        "names": stationsdf['station'],
    }
)
circle2 = Circle(size=15, fill_color="red", **shared_circle_props)
plot.add_glyph(source2, circle2)

# Station-name labels on an opaque white background, offset 5 units from each marker
labels = LabelSet(
    x="lon",
    y="lat",
    text='names',
    level='glyph',
    x_offset=5,
    y_offset=5,
    source=source2,
    render_mode='canvas',
    background_fill_color='white',
    background_fill_alpha=1.0,
)
plot.add_layout(labels)

plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())

# Write the map to gmap_plot.html and show it
output_file("gmap_plot.html")
show(plot)