In [3]:
import requests
import json
import pandas as pd
import calendar

In [4]:
def get_mbta_df(url):
    '''
    parameters: url (str): the url for the mbta api call
    returns: ridership_df (dataframe): a cleaned dataframe of all the ridership data
    does: uses the given url to get, clean, and put the mbta api data into a dataframe
    '''
    # MBTA Monthly Ridership
    # Call api and create dataframe
    ridership_response = requests.get(url).text
    ridership_json = json.loads(ridership_response)
    ridership_df = pd.json_normalize(ridership_json['features'])

    # conver the date column from milliseconds to datetime
    ridership_df['Date'] = pd.to_datetime(ridership_df['properties.service_date'], unit='ms')

    # change the names of the columns
    ridership_df.rename(columns={'properties.route_or_line':'Route/Line', 'properties.average_monthly_weekday_ridersh':'Avg Monthly Weekday Ridership',
                    'properties.average_monthly_ridership':'Avg Monthly Ridership'}, inplace=True)

    # clean the dataframe values
    # convert the datetime into two columns: month and year
    ridership_df['Year'] = pd.DatetimeIndex(ridership_df['Date']).year
    ridership_df['month_num'] = pd.DatetimeIndex(ridership_df['Date']).month
    ridership_df['Month'] = ridership_df['month_num'].apply(lambda x: calendar.month_name[x])
    ridership_df.drop(columns=['Date'])
    
    # reorder columns
    ridership_df = ridership_df[['Year', 'Month', 'Route/Line',	'Avg Monthly Weekday Ridership', 'Avg Monthly Ridership', 'month_num']]
    
    # drop all data that is unrelated to rail ridership
    ridership_df = ridership_df[ridership_df['Route/Line'].isin(['Green Line', 'Orange Line', 'Red Line', 'Blue Line'])]

    return ridership_df

In [None]:
# get ridership data
url = 'https://services1.arcgis.com/ceiitspzDAHrdGO1/arcgis/rest/services/MBTA_Monthly_Ridership_By_Mode/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson'
ridership_df = get_mbta_df(url)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def ridership_by_year(ridership_df, year_range):
    '''
    parameters: 
        ridership_df (dataframe): dataframe of ridership data
        year_range (list of integers): list of all the years we want to collect data for
    returns:
        ridership_years : new dataframe of the same ridership data for the specified years
    does: returns a subset of data with only the years specified.
    '''
    ridership_years = pd.DataFrame(columns=['Route/Line', 'year', 'month', 'month_num', 'avg_monthly_weeday_ridership', 'avg_monthly_ridership'])
    for year in year_range:
        year_df = ridership_df[ridership_df['Year'] == year].sort_values(by='month_num', ascending=True)
        year_df.rename(columns={'Route/Line' : 'Route/Line', 'Year': 'year', 'Month' : 'month', 'month_num' : 'month_num',
                                     'Avg Monthly Weekday Ridership' : 'avg_monthly_weeday_ridership', 
                                     'Avg Monthly Ridership' : 'avg_monthly_ridership'}, inplace=True)
        ridership_years = pd.concat([ridership_years, year_df])

    return ridership_years

In [6]:
ridership_years = ridership_by_year(ridership_df, [2017, 2018, 2019, 2020, 2021, 2022, 2023])
ridership_years.head()

  ridership_years = pd.concat([ridership_years, year_df])


Unnamed: 0,Route/Line,year,month,month_num,avg_monthly_weeday_ridership,avg_monthly_ridership
82,Blue Line,2017,January,1,61669.0,50078.0
88,Green Line,2017,January,1,176111.0,145421.0
89,Orange Line,2017,January,1,206018.0,159300.0
90,Red Line,2017,January,1,245430.0,189336.0
93,Blue Line,2017,February,2,61185.0,50253.0
