# MBTA Scraper
*Finnley Autumn Rogers* | 2024-07-26

Please see README.md for complete description of project and goals. 

This is the code as I write it for the web scraper and data preprocessing of the MBTA station locations dataset.

In [56]:
import pandas as pd
import numpy as np
import requests as re

from bs4 import BeautifulSoup
from selenium import webdriver

from sys import stdout
from time import sleep

from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.by import By

from time import sleep

In [9]:
base_url = "https://www.mbta.com/stops/"

In [18]:
driver = webdriver.Chrome()
driver.get(f"{base_url}subway")

WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'm-detailed-stop')))

html_source = driver.page_source
driver.quit()

In [19]:
subway_soup = BeautifulSoup(html_source, "html.parser")

In [20]:
stops_sections = subway_soup.find_all('div', class_= "stops-for-route")

In [24]:
def address_getter(mbta_url):
    ''' 
    # address_getter

    takes a url path to an MBTA stop page and returns the listed address

    ## Parameters

    - mbta_url (string): complete link to an MBTA stop information page

    ## Outputs

    - parsed_address (string): stop address listed on the listed MBTA page

    '''

    # get the page
    driver = webdriver.Chrome()
    driver.get(mbta_url)
    
    # ensure that the link I want is actually loaded before closing out.
    # I was pulling my hair out over this earlier lol
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'c-call-to-action')))

    html_source = driver.page_source
    driver.quit()

    stop_soup = BeautifulSoup(html_source, 'html.parser')

    # try to grab the address
    # some of these return nulls for an unknown reason.
    try:
        return((stop_soup.find('a', class_='c-call-to-action').text, stop_soup.find('a', class_='c-call-to-action')['href']))
    except:
        return((np.nan, np.nan))

In [30]:
# empty pandas df
subway_df = pd.DataFrame({
    'transit_mode': [],
    'line': [],
    'subline': [],
    'station_name': [],
    'station_url': []
})

In [37]:
lines = ['Red', 'Orange', 'Blue', 'Green']

for line in lines: 

    cur_line = stops_sections[0].find('div', id = f'stops-{line}')
    button_groups = cur_line.find_all('div', class_='button-group')
    
    # section names
    subline_names = cur_line.find_all('h3')

    # in case there are multiple button groups 
    # (looking at you red line...)
    for group in button_groups:

        subline = subline_names[button_groups.index(group)].text

        cur_urls = []
        cur_names = []

        for a in group.find_all('a'):
            cur_urls.append(a['href'])
            cur_names.append(a.text.split("\n")[1])
        
        # make pandas df section and concat
        section_df = pd.DataFrame({
            'transit_mode': ['subway'] * len(cur_names),
            'line': [line] * len(cur_names),
            'subline': [subline] * len(cur_names),
            'station_name': cur_names,
            'station_url': cur_urls
        })

        subway_df = pd.concat([subway_df, section_df])
    

In [38]:
subway_df.head()

Unnamed: 0,transit_mode,line,subline,station_name,station_url
0,subway,Red,Stations,Alewife,/stops/place-alfcl
1,subway,Red,Stations,Andrew,/stops/place-andrw
2,subway,Red,Stations,Ashmont,/stops/place-asmnl
3,subway,Red,Stations,Braintree,/stops/place-brntn
4,subway,Red,Stations,Broadway,/stops/place-brdwy


In [39]:
subway_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 264 entries, 0 to 69
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   transit_mode  264 non-null    object
 1   line          264 non-null    object
 2   subline       264 non-null    object
 3   station_name  264 non-null    object
 4   station_url   264 non-null    object
dtypes: object(5)
memory usage: 12.4+ KB


In [40]:
subway_df[['line', 'station_name']].groupby('line').count()

Unnamed: 0_level_0,station_name
line,Unnamed: 1_level_1
Blue,24
Green,140
Orange,40
Red,60


Since many stops are connections, focus on getting addresses for unique stops

In [50]:
unique_stops = subway_df[['station_name', 'station_url']].drop_duplicates().reset_index().drop(columns=['index'])

In [51]:
unique_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   station_name  125 non-null    object
 1   station_url   125 non-null    object
dtypes: object(2)
memory usage: 2.1+ KB


In [52]:
# expand url column
unique_stops['full_url'] = unique_stops['station_url'].apply(lambda x: "https://www.mbta.com" + x)

In [53]:
unique_stops

Unnamed: 0,station_name,station_url,full_url
0,Alewife,/stops/place-alfcl,https://www.mbta.com/stops/place-alfcl
1,Andrew,/stops/place-andrw,https://www.mbta.com/stops/place-andrw
2,Ashmont,/stops/place-asmnl,https://www.mbta.com/stops/place-asmnl
3,Braintree,/stops/place-brntn,https://www.mbta.com/stops/place-brntn
4,Broadway,/stops/place-brdwy,https://www.mbta.com/stops/place-brdwy
...,...,...,...
120,Waban,/stops/place-waban,https://www.mbta.com/stops/place-waban
121,Warren Street,/stops/place-wrnst,https://www.mbta.com/stops/place-wrnst
122,Washington Square,/stops/place-bcnwa,https://www.mbta.com/stops/place-bcnwa
123,Washington Street,/stops/place-wascm,https://www.mbta.com/stops/place-wascm


In [70]:
# apply existing function over full url 
subway_address = []
subway_link = []

for i in unique_stops.full_url:
    outs = address_getter(i)

    num = unique_stops.index[unique_stops.full_url == i].to_list()[0]
    denom = len(unique_stops.full_url)
    print(f"{str(num)}/{str(denom)} ({num/denom * 100}%)")

    subway_address.append(outs[0])
    subway_link.append(outs[1])

    sleep((np.random.rand(1,1)[0] * 5)[0])
    print("\n")

0/125 (0.0%)


1/125 (0.8%)


2/125 (1.6%)


3/125 (2.4%)


4/125 (3.2%)


5/125 (4.0%)


6/125 (4.8%)


7/125 (5.6000000000000005%)


8/125 (6.4%)


9/125 (7.199999999999999%)


10/125 (8.0%)


11/125 (8.799999999999999%)


12/125 (9.6%)


13/125 (10.4%)


14/125 (11.200000000000001%)


15/125 (12.0%)


16/125 (12.8%)


17/125 (13.600000000000001%)


18/125 (14.399999999999999%)


19/125 (15.2%)


20/125 (16.0%)


21/125 (16.8%)


22/125 (17.599999999999998%)


23/125 (18.4%)


24/125 (19.2%)


25/125 (20.0%)


26/125 (20.8%)


27/125 (21.6%)


28/125 (22.400000000000002%)


29/125 (23.200000000000003%)


30/125 (24.0%)


31/125 (24.8%)


32/125 (25.6%)


33/125 (26.400000000000002%)


34/125 (27.200000000000003%)


35/125 (28.000000000000004%)


36/125 (28.799999999999997%)


37/125 (29.599999999999998%)


38/125 (30.4%)


39/125 (31.2%)


40/125 (32.0%)


41/125 (32.800000000000004%)


42/125 (33.6%)


43/125 (34.4%)


44/125 (35.199999999999996%)


45/125 (36.0%)


46/125 (36.8%)



In [71]:
unique_stops['address'] = subway_address
unique_stops['location_link'] = subway_link

In [72]:
unique_stops

Unnamed: 0,station_name,station_url,full_url,address,location_link
0,Alewife,/stops/place-alfcl,https://www.mbta.com/stops/place-alfcl,"Alewife Brook Pkwy and Cambridge Park Dr, Camb...",https://www.google.com/maps/search/?api=1&&que...
1,Andrew,/stops/place-andrw,https://www.mbta.com/stops/place-andrw,"Dorchester Ave and Southhampton St, South Bost...",https://www.google.com/maps/search/?api=1&&que...
2,Ashmont,/stops/place-asmnl,https://www.mbta.com/stops/place-asmnl,"Dorchester Ave and Ashmont St, Boston, MA 02124",https://www.google.com/maps/search/?api=1&&que...
3,Braintree,/stops/place-brntn,https://www.mbta.com/stops/place-brntn,"197 Ivory St, Braintree, MA 02184",https://www.google.com/maps/search/?api=1&&que...
4,Broadway,/stops/place-brdwy,https://www.mbta.com/stops/place-brdwy,"Dorchester Ave and Broadway, Boston, MA",https://www.google.com/maps/search/?api=1&&que...
...,...,...,...,...,...
120,Waban,/stops/place-waban,https://www.mbta.com/stops/place-waban,"Beacon St and Waban Sq, Newton, MA 02468",https://www.google.com/maps/search/?api=1&&que...
121,Warren Street,/stops/place-wrnst,https://www.mbta.com/stops/place-wrnst,"Commonwealth Ave and Warren St, Boston, MA",https://www.google.com/maps/search/?api=1&&que...
122,Washington Square,/stops/place-bcnwa,https://www.mbta.com/stops/place-bcnwa,"Beacon St and Washington St, Brookline, MA",https://www.google.com/maps/search/?api=1&&que...
123,Washington Street,/stops/place-wascm,https://www.mbta.com/stops/place-wascm,"Commonwealth Ave and Washington St, Boston, MA",https://www.google.com/maps/search/?api=1&&que...


In [73]:
sum(unique_stops.address.isna())

0

In [74]:
unique_stops.location_link.iloc[0].split("=")[-1]

'-71.141287,42.39583'

In [78]:
unique_stops['latitude'] = unique_stops.location_link.apply(lambda x: pd.to_numeric(x.split("=")[-1].split(",")[1]))
unique_stops['longitude'] = unique_stops.location_link.apply(lambda x: pd.to_numeric(x.split("=")[-1].split(",")[0]))

In [79]:
unique_stops.head()

Unnamed: 0,station_name,station_url,full_url,address,location_link,latitude,longitude
0,Alewife,/stops/place-alfcl,https://www.mbta.com/stops/place-alfcl,"Alewife Brook Pkwy and Cambridge Park Dr, Camb...",https://www.google.com/maps/search/?api=1&&que...,42.39583,-71.141287
1,Andrew,/stops/place-andrw,https://www.mbta.com/stops/place-andrw,"Dorchester Ave and Southhampton St, South Bost...",https://www.google.com/maps/search/?api=1&&que...,42.330154,-71.057655
2,Ashmont,/stops/place-asmnl,https://www.mbta.com/stops/place-asmnl,"Dorchester Ave and Ashmont St, Boston, MA 02124",https://www.google.com/maps/search/?api=1&&que...,42.28452,-71.063777
3,Braintree,/stops/place-brntn,https://www.mbta.com/stops/place-brntn,"197 Ivory St, Braintree, MA 02184",https://www.google.com/maps/search/?api=1&&que...,42.207854,-71.001138
4,Broadway,/stops/place-brdwy,https://www.mbta.com/stops/place-brdwy,"Dorchester Ave and Broadway, Boston, MA",https://www.google.com/maps/search/?api=1&&que...,42.342622,-71.056967


In [80]:
unique_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   station_name   125 non-null    object 
 1   station_url    125 non-null    object 
 2   full_url       125 non-null    object 
 3   address        125 non-null    object 
 4   location_link  125 non-null    object 
 5   latitude       125 non-null    float64
 6   longitude      125 non-null    float64
dtypes: float64(2), object(5)
memory usage: 7.0+ KB


In [81]:
final = subway_df.merge(unique_stops, how = 'left', on = ['station_name', 'station_url'])

In [82]:
final = final.drop(columns=['full_url', 'location_link'])

In [83]:
final.head()

Unnamed: 0,transit_mode,line,subline,station_name,station_url,address,latitude,longitude
0,subway,Red,Stations,Alewife,/stops/place-alfcl,"Alewife Brook Pkwy and Cambridge Park Dr, Camb...",42.39583,-71.141287
1,subway,Red,Stations,Andrew,/stops/place-andrw,"Dorchester Ave and Southhampton St, South Bost...",42.330154,-71.057655
2,subway,Red,Stations,Ashmont,/stops/place-asmnl,"Dorchester Ave and Ashmont St, Boston, MA 02124",42.28452,-71.063777
3,subway,Red,Stations,Braintree,/stops/place-brntn,"197 Ivory St, Braintree, MA 02184",42.207854,-71.001138
4,subway,Red,Stations,Broadway,/stops/place-brdwy,"Dorchester Ave and Broadway, Boston, MA",42.342622,-71.056967


In [84]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   transit_mode  264 non-null    object 
 1   line          264 non-null    object 
 2   subline       264 non-null    object 
 3   station_name  264 non-null    object 
 4   station_url   264 non-null    object 
 5   address       264 non-null    object 
 6   latitude      264 non-null    float64
 7   longitude     264 non-null    float64
dtypes: float64(2), object(6)
memory usage: 16.6+ KB


In [85]:
final.to_csv("data/mbta_subway_stations_geocoded_20240728.csv")