In [18]:
# Basic Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the config file
import config

# For text:
import re

# For times:
import time

# Fix NumPy's legacy global RNG state so any random imputation is repeatable.
#  Reference:  https://numpy.org/doc/stable/reference/random/generated/numpy.random.seed.html
np.random.seed(seed=42)

# for Google Geocoding API
'''
Setting up the API key and getting started was enabled with help from the following sources:
* https://developers.google.com/maps/documentation/geocoding/?csw=1
* https://developers.google.com/maps/documentation/geocoding/overview
* https://developers.google.com/maps/get-started#api-key
* https://developers.google.com/maps/documentation/geocoding/get-api-key
* https://github.com/googlemaps/google-maps-services-python
'''
import googlemaps
from datetime import datetime

# Import the Beautiful Soup and Requests packages for web scraping:
import requests
from bs4 import BeautifulSoup

# Find Proximity to Subway ('T') Stations

## Get a List of All the T Stations/Stops

Scrape the MBTA subway stop listing at https://www.mbta.com/stops/subway to get the list of stations. Then visit the URL for each station and scrape its address; each address is then submitted in a Google Geocoding request.

In [89]:
# Store the base url from which all T-stop data will be retrieved
base_url = 'https://www.mbta.com/stops/subway'

# Connect to the web page.  A timeout is supplied because requests will
# otherwise wait indefinitely on an unresponsive server.
res = requests.get(base_url, timeout=30)

# Fail fast on a 4xx/5xx response instead of silently parsing an error page
res.raise_for_status()

# Pull the website text HTML string out
html = res.text

# Create the Beautiful Soup instance, using 'lxml' as the parser
soup = BeautifulSoup(html, 'lxml')

# Display the HTTP status code as the cell output
res.status_code

200

In [93]:
# Each stop on the listing page is an <a> tag whose class attribute is exactly
# the string below (taken from the site's HTML).
a_list = soup.find_all('a', class_='btn button stop-btn m-detailed-stop')

# The href of each anchor is the site-relative url of that stop's page.
# NOTE(review): the displayed output contains repeated paths (e.g.
# '/stops/place-asmnl' appears twice) -- consider de-duplicating before
# issuing one request per entry.
stn_urls = [anchor['href'] for anchor in a_list]

# Show how many stop urls were collected, then the urls themselves
print(len(stn_urls))
stn_urls

132


['/stops/place-alfcl',
 '/stops/place-andrw',
 '/stops/place-asmnl',
 '/stops/place-brntn',
 '/stops/place-brdwy',
 '/stops/place-cntsq',
 '/stops/place-chmnl',
 '/stops/place-davis',
 '/stops/place-dwnxg',
 '/stops/place-fldcr',
 '/stops/place-harsq',
 '/stops/place-jfk',
 '/stops/place-knncl',
 '/stops/place-nqncy',
 '/stops/place-pktrm',
 '/stops/place-portr',
 '/stops/place-qamnl',
 '/stops/place-qnctr',
 '/stops/place-shmnl',
 '/stops/place-smmnl',
 '/stops/place-sstat',
 '/stops/place-wlsta',
 '/stops/place-asmnl',
 '/stops/place-butlr',
 '/stops/place-capst',
 '/stops/place-cedgr',
 '/stops/place-cenav',
 '/stops/place-matt',
 '/stops/place-miltt',
 '/stops/place-valrd',
 '/stops/place-astao',
 '/stops/place-bbsta',
 '/stops/place-chncl',
 '/stops/place-ccmnl',
 '/stops/place-dwnxg',
 '/stops/place-forhl',
 '/stops/place-grnst',
 '/stops/place-haecl',
 '/stops/place-jaksn',
 '/stops/place-mlmnl',
 '/stops/place-masta',
 '/stops/place-north',
 '/stops/place-ogmnl',
 '/stops/place

In [105]:
# Create the url for the station

# The hrefs scraped from the listing page already start with '/stops/...',
# so they must be appended to the site root rather than to the
# '/stops/subway' listing url.  Strip that suffix to get the site root.
base_url_new = base_url.replace('/stops/subway', '')
print(base_url_new)

# Create the url for the first T-stop
stn_url = base_url_new + stn_urls[0]
print(stn_url)

# Connect to T-stop page.  The timeout keeps the cell from hanging forever
# on an unresponsive server.
res_stn = requests.get(stn_url, timeout=30)

# Fail fast on a 4xx/5xx response instead of parsing an error page
res_stn.raise_for_status()

# Pull the website text HTML string out for the station/t-stop
html_stn = res_stn.text

# Create the soup object
soup_stn = BeautifulSoup(html_stn, 'lxml')

# Extract the address text: it is the first <div class="h3"> on the page.
# Raise a descriptive error if the page layout changed and no such element
# exists, rather than failing with an opaque IndexError.
address_div = soup_stn.find('div', {'class': 'h3'})
if address_div is None:
    raise ValueError('No address element found on ' + stn_url)
address_div.text

https://www.mbta.com
https://www.mbta.com/stops/place-alfcl


'Alewife Brook Pkwy and Cambridge Park Dr, Cambridge, MA 02140'

In [108]:
# Preview the first five station paths as a dry run for the full scraping loop.
# TODO: the per-request wait time mentioned for the real loop is not added yet.

for station in stn_urls[:5]:
    print(station)

/stops/place-alfcl
/stops/place-andrw
/stops/place-asmnl
/stops/place-brntn
/stops/place-brdwy


# Geocoding API

In [2]:
# Read the Google API key from the local config module so the key itself
# never appears in the notebook.
api_key = config.api_key

In [6]:
# Raw Geocoding API request url, following the format documented at:
#  https://developers.google.com/maps/documentation/geocoding/get-api-key
#
# Documented template:
#   'https://maps.googleapis.com/maps/api/geocode/json?address=1600+Amphitheatre+Parkway,+Mountain+View,+CA&key=YOUR_API_KEY'
#
# NOTE: the resulting string embeds the API key -- do not display it in cell output.
url = (
    'https://maps.googleapis.com/maps/api/geocode/json'
    '?address=1600+Amphitheatre+Parkway,+Mountain+View,+CA'
    '&key='
    + api_key
)

In [8]:
# Adapted from the googlemaps client README:
#   https://github.com/googlemaps/google-maps-services-python

# Client object authenticated with the API key; used for the calls below
gmaps = googlemaps.Client(key=api_key)

# Forward geocoding: look up the scraped station address
geocode_result = gmaps.geocode(
    address='Alewife Brook Pkwy and Cambridge Park Dr, Cambridge, MA 02140'
)

# Reverse geocoding: look up an address from a (lat, lng) pair
reverse_geocode_result = gmaps.reverse_geocode(
    latlng=(40.714224, -73.961452)
)

In [10]:
# Display the raw geocoding response (a list of match dicts) to inspect its structure
geocode_result

[{'address_components': [{'long_name': 'Alewife Brook Parkway & Cambridgepark Drive',
    'short_name': 'Alewife Brook Pkwy & Cambridgepark Dr',
    'types': ['intersection']},
   {'long_name': 'North Cambridge',
    'short_name': 'North Cambridge',
    'types': ['neighborhood', 'political']},
   {'long_name': 'Cambridge',
    'short_name': 'Cambridge',
    'types': ['locality', 'political']},
   {'long_name': 'Middlesex County',
    'short_name': 'Middlesex County',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'Massachusetts',
    'short_name': 'MA',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'United States',
    'short_name': 'US',
    'types': ['country', 'political']},
   {'long_name': '02140', 'short_name': '02140', 'types': ['postal_code']}],
  'formatted_address': 'Alewife Brook Pkwy & Cambridgepark Dr, Cambridge, MA 02140, USA',
  'geometry': {'location': {'lat': 42.3947455, 'lng': -71.14048149999999},
   'locati

In [109]:
# Choosing which coordinates to use was informed by:
#   https://stackoverflow.com/questions/43194789/whats-the-difference-between-location-and-viewport-coordinates-when-geocoding-wi

# Take the first match in the response and read its 'location' lat/lng pair
first_match = geocode_result[0]
first_match['geometry']['location']

{'lat': 42.3947455, 'lng': -71.14048149999999}