In [None]:
import urllib3
import warnings

# Suppress only urllib3's InsecureRequestWarning, which the requests made with
# verify=False later in this notebook would otherwise trigger.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Keep the filter scoped to that single category. A blanket
# warnings.filterwarnings("ignore") would also hide deprecation and runtime
# warnings coming from every other library used in the notebook.
warnings.filterwarnings("ignore", category=urllib3.exceptions.InsecureRequestWarning)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json


# Start a Firefox session; the scraping helpers below use this module-level `driver`.
driver = webdriver.Firefox()

# Feed page to scrape (currently the RTD Denver feed on transitfeeds.com).
# To scrape another agency, replace this with that agency's feed page URL.
# TODO: keep a list of agencies and their feed URLs instead of hard-coding one.
url = 'https://transitfeeds.com/p/rtd-denver/188'


# Open the feed page in the browser.
driver.get(url)

# Function to extract download links
def get_download_links():
    """Collect the download URLs from the page currently loaded in `driver`.

    Returns:
        list: The ``href`` of every ``<a>`` element whose class attribute
        contains "btn btn-xs btn-primary", in the order the driver returns
        them. Anchors whose ``href`` is missing or empty are skipped.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@class, "btn btn-xs btn-primary")]')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    return [href for href in hrefs if href]

# Function to navigate to the next page
def go_to_next_page(page_num):
    """Click the pagination link labelled `page_num` and wait for the new page.

    Args:
        page_num: Page number whose pagination link should be clicked.

    Returns:
        bool: True once the link was clicked and the new page shows at least
        one "Download" button; False if any step raised (the error is printed).
    """
    download_xpath = '//a[contains(@class, "btn btn-xs btn-primary")]'
    try:
        # Remember a "Download" button from the page we are about to leave.
        # Waiting only for the *presence* of such a button is satisfied at once
        # by the old page, so the caller could scrape the same page twice.
        old_buttons = driver.find_elements(By.XPATH, download_xpath)

        # Look inside the pagination list only, so an unrelated anchor whose
        # text happens to equal the page number is never clicked.
        next_page_button = driver.find_element(
            By.XPATH,
            f'//ul[contains(@class, "pagination")]//a[text()="{page_num}"]',
        )
        next_page_button.click()

        wait = WebDriverWait(driver, 10)
        if old_buttons:
            # The old element becomes stale once the document is replaced.
            wait.until(EC.staleness_of(old_buttons[0]))
        # Then wait for the "Download" buttons of the new page.
        wait.until(EC.presence_of_element_located((By.XPATH, download_xpath)))
    except Exception as e:
        print(f"Could not navigate to page {page_num}: {e}")
        return False
    return True

# Function to find the last page number
def find_last_page_number():
    """Return the highest page number shown in the page's pagination list.

    Returns:
        int: The largest all-digit link label found under the first ``<ul>``
        whose class contains "pagination". Falls back to 1, after printing the
        reason, when the list cannot be found or has no numeric labels.
    """
    try:
        pager = driver.find_element(By.XPATH, '//ul[contains(@class, "pagination")]')
        labels = [anchor.text for anchor in pager.find_elements(By.XPATH, './/li/a')]
        # Non-numeric labels (e.g. previous/next arrows) are ignored.
        numbers = [int(label) for label in labels if label.isdigit()]
        return max(numbers)
    except Exception as e:
        print(f"Could not determine the last page number: {e}")
        return 1

# Start scraping the links
all_download_links = []

# Always close the browser, even if scraping fails part-way through.
try:
    # Find the last page number
    last_page_number = find_last_page_number()
    print(f"Last page number: {last_page_number}")

    # Scrape the links from all pages
    for page in range(1, last_page_number + 1):
        print(f"Scraping page {page}")
        # Get download links from the current page
        all_download_links += get_download_links()

        # Navigate to the next page (skip for the last page). Stop if the
        # navigation fails: the browser is still showing the current page, so
        # continuing would collect the same links again.
        if page < last_page_number and not go_to_next_page(page + 1):
            print(f"Stopping early: pages {page + 1}-{last_page_number} were not scraped")
            break
finally:
    # Close the browser
    driver.quit()

# Write the collected transitfeeds download links to a JSON file
links_file = 'transitfeeds_links.json'
with open(links_file, 'w') as json_file:
    json.dump(all_download_links, json_file, indent=4)

print(f"Collected links have been written to {links_file}")

In [None]:
# Number of download links collected by the scraping cell
print(len(all_download_links))

In [None]:
from pprint import pprint
# Print every collected download link for manual inspection
pprint(all_download_links)

Downloading every feed in the next cell takes around 20 hours on a 23 Mbps downlink.

In [None]:
import os
import requests

# Directory to save the downloaded files (created if it does not exist yet)
save_dir = 'denver'
os.makedirs(save_dir, exist_ok=True)

# Initialize an empty list to store AWS links: download_file() appends the
# `response.url` of each file it downloads, and the list is written to
# aws_links.json at the end of this cell
aws_links = []

# Function to get the file size from the server using GET request with stream=True
def get_remote_file_size(url, timeout=30):
    """Return the size in bytes that the server reports for `url`.

    A streamed GET is issued so the body is not downloaded; only the status
    code and the Content-Length header are inspected.

    Args:
        url: URL of the remote file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so an
            unresponsive server cannot block the notebook forever.

    Returns:
        int: The Content-Length value, or 0 when the response status is not
        200 or the header is missing or not an integer. The reason is printed.
    """
    response = requests.get(url, stream=True, verify=False, timeout=timeout)
    try:
        if response.status_code != 200:
            print(f"Failed to get headers for URL: {url} with status code: {response.status_code}")
            return 0

        content_length = response.headers.get('Content-Length')
        if content_length is None:
            print(f"Content-Length header is missing for URL: {url}")
            return 0

        try:
            return int(content_length)
        except ValueError:
            print(f"Content-Length header is not a number for URL: {url}: {content_length!r}")
            return 0
    finally:
        # With stream=True the connection stays open until the body is consumed
        # or the response is closed; close it so connections are not leaked.
        response.close()

# Function to download a file from a URL and save it to a specified directory
def download_file(url, save_path, timeout=30):
    """Download `url` to `save_path`, skipping files that are already complete.

    A file that already exists is kept only when the server reports a known
    size that matches the local size; otherwise it is downloaded again. The
    final ``response.url`` of every successful download is printed and
    appended to the module-level ``aws_links`` list.

    Args:
        url: Download URL.
        save_path: Destination path of the file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
    """
    # The remote size is only needed to validate an existing local copy, so no
    # extra request is made for files that are not on disk yet.
    if os.path.exists(save_path):
        local_file_size = os.path.getsize(save_path)
        remote_file_size = get_remote_file_size(url)
        # get_remote_file_size() returns 0 when the size is unknown, so a match
        # on 0 proves nothing (e.g. an empty file left behind by a failed run).
        if remote_file_size > 0 and local_file_size == remote_file_size:
            print(f"File already downloaded: {save_path}")
            return
        print('local file size:', local_file_size)
        print('remote file size:', remote_file_size)
        if remote_file_size > 0:
            print(f"File size mismatch for {save_path}. Redownloading...")
        else:
            print(f"Remote file size unknown for {save_path}. Redownloading...")

    response = requests.get(url, stream=True, verify=False, timeout=timeout)
    try:
        if response.status_code != 200:
            print(f"Failed to download: {url}")
            return

        # Save the AWS link to the list
        print(response.url)
        aws_links.append(response.url)
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {save_path}")
    finally:
        # Release the streamed connection whether or not the download succeeded.
        response.close()


# Function to extract the part of the URL immediately before "download"
def extract_part_from_url(url):
    """Return the second-to-last '/'-separated segment of `url`.

    For a link of the form ".../<segment>/download" this is ``<segment>``.

    Raises:
        IndexError: If `url` contains no '/'.
    """
    # Only the last two separators matter, so split from the right.
    segments = url.rsplit('/', 2)
    return segments[-2]

# Download every collected link
for link in all_download_links:
    part_str = extract_part_from_url(link)
    file_name = f"gtfs_{part_str}.zip"
    save_path = os.path.join(save_dir, file_name)
    try:
        download_file(link, save_path)
    except (requests.RequestException, OSError) as exc:
        # A single network or disk error should not abort the whole run; the
        # file is retried the next time this cell is executed.
        print(f"Failed to download: {link} ({exc})")

# Write the collected AWS links to a JSON file
# NOTE: aws_links only holds the URLs of files downloaded during this run;
# download_file() returns early for files that are already on disk, so a
# resumed run overwrites this file with a partial list.
aws_links_file = os.path.join(save_dir, 'aws_links.json')
with open(aws_links_file, 'w') as json_file:
    json.dump(aws_links, json_file, indent=4)

print(f"Collected AWS links have been written to {aws_links_file}")

# Distill GTFS

Do not run the following cell until you are using OneBusAway's GTFS merge module: it moves feeds out of `denver/` into the `bad/` and `extra/` subfolders.

In [None]:
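# This cell keeps the earliest GTFS feed of each quarter in place and moves every other feed
# (including all feeds from the years in `extra_years`) to a dir called "extra";
# ZIP files whose name does not end in a valid YYYYMMDD date are moved to a dir called "bad".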
import os
import shutil
from datetime import datetime

# Directory containing GTFS ZIP files
gtfs_dir = 'denver'
# Destination for ZIP files whose name does not end in a valid YYYYMMDD date
bad_dir = os.path.join(gtfs_dir, 'bad')
# Destination for feeds that are not kept: every feed from `extra_years` and
# all but the earliest feed of each remaining quarter
extra_dir = os.path.join(gtfs_dir, 'extra')

# Create directories if they don't exist
os.makedirs(bad_dir, exist_ok=True)
os.makedirs(extra_dir, exist_ok=True)

# Years whose feeds are all moved to `extra_dir`, regardless of quarter
extra_years = ['2015', '2016', '2017', '2018']

# Function to check if a string is a valid date in the format YYYYMMDD
def is_valid_date(date_str):
    """Return True if `date_str` is a real calendar date written as YYYYMMDD.

    Args:
        date_str: Candidate date string.

    Returns:
        bool: True only for an eight-digit string that parses as a date with
        the format ``%Y%m%d``; False for anything else, including non-strings.
    """
    # strptime on its own also accepts non-zero-padded input such as "202119"
    # (parsed as 2021-01-09), which would break the fixed-width slicing that
    # get_quarter() performs. Require exactly eight digits first.
    if not isinstance(date_str, str) or len(date_str) != 8 or not date_str.isdigit():
        return False
    try:
        datetime.strptime(date_str, '%Y%m%d')
        return True
    except ValueError:
        return False

# Dictionary to store the earliest file per quarter: maps a quarter label
# returned by get_quarter() to a (date_str, file_name) tuple
earliest_files = {}

# Function to get the quarter from a date string
def get_quarter(date_str):
    """Return the quarter label of a YYYYMMDD string, e.g. "20210927" -> "2021Q3".

    Args:
        date_str: Date string; characters 0-3 are used as the year and
            characters 4-5 as the month.

    Returns:
        str: ``"<year>Q<quarter>"``.
    """
    year, month = date_str[:4], int(date_str[4:6])
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    return f"{year}Q{(month + 2) // 3}"

# Iterate over all GTFS ZIP files in the directory
for file_name in os.listdir(gtfs_dir):
    # Entries that are not ZIP files (such as the "bad" and "extra" folders)
    # are left untouched.
    if not file_name.endswith('.zip'):
        continue

    file_path = os.path.join(gtfs_dir, file_name)
    # The date is whatever follows the last underscore, minus the extension.
    date_str = file_name.split('_')[-1].replace('.zip', '')

    # Files without a valid date are set aside in bad_dir.
    if not is_valid_date(date_str):
        shutil.move(file_path, os.path.join(bad_dir, file_name))
        continue

    year = date_str[:4]
    quarter = get_quarter(date_str)

    # Feeds from the extra years are never kept.
    if year in extra_years:
        shutil.move(file_path, os.path.join(extra_dir, file_name))
        continue

    # First feed seen for this quarter: provisionally the earliest.
    if quarter not in earliest_files:
        earliest_files[quarter] = (date_str, file_name)
        continue

    current_earliest_date, current_earliest_file = earliest_files[quarter]

    # Not earlier than the feed already kept: this file goes to extra.
    if not date_str < current_earliest_date:
        shutil.move(file_path, os.path.join(extra_dir, file_name))
        continue

    # Earlier than the feed already kept: retire that one and keep this file.
    shutil.move(os.path.join(gtfs_dir, current_earliest_file), os.path.join(extra_dir, current_earliest_file))
    earliest_files[quarter] = (date_str, file_name)

print("Processing complete.")

# GraphQL Interface

Do not run the following until the OTP instance is live.

# OTP GraphQL

In [None]:
import requests
import json

# GraphQL endpoint the query is posted to
url = 'http://localhost:9999/otp/gtfs/v1'

# Alternative `date` values kept for reference:
#   date: "2021-09-27"
#   date: "2024-10-02"
query = """
query {
  plan(
    fromPlace: "39.79062,-104.9114"
    toPlace: "39.68415,-105.0370"
    date: "2021-09-27"
    time: "15:23:10"
    numItineraries: 10
    transportModes: [{ mode: WALK }, { mode: BUS }, { mode: RAIL }, { mode: TRAM }]
  ) {
    itineraries {
      duration
      legs {
        startTime
        endTime
        mode
        distance
        from {
          name
        }
        to {
          name
        }
      }
    }
  }
}
"""

# The request body carries the query text under the "query" key.
payload = {"query": query}

headers = {
    'Content-Type': 'application/json',
    'OTPTimeout': '180000',
}

response = requests.post(url, json=payload, headers=headers)

# Anything other than HTTP 200 is reported as an error; otherwise the JSON
# response is pretty-printed.
if response.status_code != 200:
    print(f"Error: {response.status_code}, {response.text}")
else:
    data = response.json()
    print(json.dumps(data, indent=2))

# Lastly:
# TODO: find the itinerary with the shortest duration and print that duration in a human-readable format
