In [1]:
import requests
from bs4 import BeautifulSoup

# Landing page of the feed's version history. Its pagination widget is used
# to work out how many listing pages exist.
url = "https://transitfeeds.com/p/mta/79"
# requests has no default timeout; without one a stalled connection would
# hang the cell indefinitely.
response = requests.get(url, timeout=30)
total_pages = 0

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    pagination = soup.find("ul", class_="pagination")
    if pagination is None:
        # No pagination widget on the page - presumably the whole history
        # fits on a single listing page (TODO confirm against the site).
        total_pages = 1
    else:
        # Use the largest numeric link label instead of a fixed position:
        # the last <a> is not guaranteed to be a page number (it could be a
        # "next" arrow), and int() on such text would raise ValueError.
        page_numbers = [
            int(link.text.strip())
            for link in pagination.find_all("a")
            if link.text.strip().isdecimal()
        ]
        total_pages = max(page_numbers, default=1)
    print(f"Total number of pages: {total_pages}")
else:
    # total_pages stays 0, so a later loop over range(1, total_pages + 1)
    # does nothing.
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")


Total number of pages: 6


In [6]:
from datetime import datetime

# Dates of every listed feed version, formatted as YYYYMMDD strings.
# Reset on each run so re-executing the cell does not append duplicates.
a_text_list = []

# Loop through pages 1 to total_pages
for page_num in range(1, total_pages + 1):
    url = f'https://transitfeeds.com/p/mta/79?p={page_num}'
    # Explicit timeout so one stalled request cannot hang the whole scrape.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Do not try to parse an error page; report it and move on.
        print(f'Failed to retrieve data from {url}. Status code: {response.status_code}')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the first tbody element; it holds one row per feed version.
    tbody_element = soup.find('tbody')
    if tbody_element is None:
        print(f'No table body found on {url}; skipping this page')
        continue

    # Loop through each tr element within the tbody
    for tr in tbody_element.find_all('tr'):
        # The first cell's link text is the version date, e.g. "29 November 2022".
        td_element = tr.find('td')
        a_element = td_element.find('a') if td_element is not None else None
        if a_element is None:
            # Row without a dated link (e.g. header/filler row) - nothing to record.
            continue
        a_text = a_element.text.strip()

        # Convert the date string to the YYYYMMDD form used in download URLs
        date_obj = datetime.strptime(a_text, '%d %B %Y')
        formatted_date = date_obj.strftime('%Y%m%d')
        a_text_list.append(formatted_date)

print(a_text_list)

['20221129', '20221121', '20220615', '20211210', '20211109', '20210713', '20210615', '20210503', '20210315', '20201102', '20201001', '20200910', '20200814', '20200430', '20200109', '20191231', '20191112', '20191003', '20190909', '20190509', '20190423', '20190420', '20181221', '20181113', '20181004', '20180908', '20180708', '20180622', '20180615', '20180109', '20171109', '20171106', '20170919', '20170712', '20170619', '20170130', '20170124', '20170113', '20161222', '20161210', '20161107', '20161103', '20160627', '20160502', '20151207', '20150914', '20150909', '20150901', '20150616', '20141204', '20140919', '20140822', '20140801', '20140715', '20140626', '20140326', '20140204', '20131030']


In [9]:
import os
import zipfile

# Define the base URL
base_url = 'https://transitfeeds.com/p/mta/79/'

# Create the download and extraction directories. exist_ok=True makes this a
# no-op on re-runs and avoids the check-then-create race.
os.makedirs('files/zip_files', exist_ok=True)
os.makedirs('files/extracted', exist_ok=True)

# Loop through each date in a_text_list
for date_text in a_text_list:
    # Construct the URL for the current date
    url = f'{base_url}{date_text}/download'

    # Define the filenames and directories for the downloaded and extracted data
    zip_filename = f'files/zip_files/{date_text}.zip'
    extract_dir = f'files/extracted/{date_text}'

    # Check if the ZIP file already exists (acts as a download cache)
    if os.path.exists(zip_filename):
        print(f'{zip_filename} already exists')
    else:
        # Download the data; a network failure for one date must not abort
        # the remaining downloads.
        try:
            response = requests.get(url, timeout=60)
        except requests.RequestException as exc:
            print(f'Error downloading data for {date_text}: {exc}')
            continue
        if response.status_code != 200:
            print(f'Error downloading data for {date_text}')
            continue

        # Write to a temporary name first and only promote it to the final
        # name once it is verified to be a ZIP archive. Otherwise a truncated
        # write or an HTML error body served with status 200 would be cached
        # and break every later run.
        partial_zip = f'{zip_filename}.part'
        with open(partial_zip, 'wb') as f:
            f.write(response.content)
        if not zipfile.is_zipfile(partial_zip):
            os.remove(partial_zip)
            print(f'Error downloading data for {date_text}: response is not a ZIP archive')
            continue
        os.replace(partial_zip, zip_filename)
        print(f'Successfully downloaded data for {date_text}')

    # Check if the target directory already exists
    if os.path.exists(extract_dir):
        print(f'{extract_dir} already exists')
    else:
        # Extract into a scratch directory and rename it on success, so an
        # interrupted extraction is never mistaken for a complete one by the
        # existence check above.
        partial_dir = f'{extract_dir}.partial'
        try:
            # Pre-create the directory: extractall() creates nothing for an
            # archive with no members, which would make the rename fail.
            os.makedirs(partial_dir, exist_ok=True)
            with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
                zip_ref.extractall(partial_dir)
        except zipfile.BadZipFile as exc:
            # A previously cached archive may be corrupt; report and move on.
            print(f'Error extracting data for {date_text}: {exc}')
            continue
        os.rename(partial_dir, extract_dir)
        print(f'Successfully extracted data for {date_text}')


Successfully downloaded data for 20221129
Successfully extracted data for 20221129
Successfully downloaded data for 20221121
Successfully extracted data for 20221121
Successfully downloaded data for 20220615
Successfully extracted data for 20220615
Successfully downloaded data for 20211210
Successfully extracted data for 20211210
Successfully downloaded data for 20211109
Successfully extracted data for 20211109
Successfully downloaded data for 20210713
Successfully extracted data for 20210713
Successfully downloaded data for 20210615
Successfully extracted data for 20210615
Successfully downloaded data for 20210503
Successfully extracted data for 20210503
Successfully downloaded data for 20210315
Successfully extracted data for 20210315
Successfully downloaded data for 20201102
Successfully extracted data for 20201102
Successfully downloaded data for 20201001
Successfully extracted data for 20201001
Successfully downloaded data for 20200910
Successfully extracted data for 20200910
Succ