In [None]:
#Hello

In [5]:
import requests
import random
import logging
from datetime import datetime, timedelta
import time

# 🔧 Config
import os  # only needed for the environment lookups below

# Credentials come from the environment so the key pair never lives in the
# notebook (or in any export / commit of it). Set SOCRATA_API_KEY_ID and
# SOCRATA_API_KEY_SECRET before starting the kernel; with the empty-string
# fallback the API is expected to reject the requests, which the fetch code logs.
# The key pair that used to be hardcoded here should be treated as leaked and rotated.
API_KEY_ID = os.environ.get("SOCRATA_API_KEY_ID", "")
API_KEY_SECRET = os.environ.get("SOCRATA_API_KEY_SECRET", "")
BASE_URL = "https://data.ny.gov/resource/vxuj-8kew.json"
NUM_SAMPLES = 5  # Number of random dates to fetch
START_DATE = datetime(2020, 3, 1)  # Dataset starts March 2020
END_DATE = datetime.today()  # Evaluated once, when this cell runs

# 🪵 Logging setup
# basicConfig does nothing if the root logger already has handlers, so
# re-running this cell in the same kernel keeps the first configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def get_random_dates(n):
    """Return `n` distinct random dates between START_DATE and END_DATE.

    Day offsets are drawn without replacement from ``range(span_days)``, where
    ``span_days`` is the whole number of days from START_DATE to END_DATE, so
    offset 0 (START_DATE itself) can be picked but offset ``span_days`` cannot.

    Args:
        n: How many dates to pick.

    Returns:
        list[str]: The picked dates formatted as ``YYYY-MM-DD``, in draw order.

    Raises:
        ValueError: Propagated from ``random.sample`` when `n` is negative or
            larger than the number of available days.
    """
    span_days = (END_DATE - START_DATE).days
    offsets = random.sample(range(span_days), n)

    date_strings = []
    for offset in offsets:
        picked_day = START_DATE + timedelta(days=offset)
        date_strings.append(picked_day.strftime("%Y-%m-%d"))
    return date_strings

def fetch_data_for_date(date_str):
    """Fetch ridership data for a specific date using basic auth.

    Args:
        date_str: Day to query, formatted ``YYYY-MM-DD``.

    Returns:
        The decoded JSON body of a 200 response (may be empty), or ``[]`` when
        the API answers with any other status or the request raises.
        Errors are logged, never raised to the caller.
    """
    # Let requests build and percent-encode the query string instead of
    # splicing the value into the URL by hand.
    params = {"date": f"{date_str}T00:00:00.000"}

    try:
        response = requests.get(
            BASE_URL,
            params=params,
            auth=(API_KEY_ID, API_KEY_SECRET),
            # Without a timeout a stalled connection would block the cell forever.
            timeout=30,
        )
        if response.status_code != 200:
            logging.error(f"❌ Failed to fetch {date_str}: {response.status_code} - {response.text}")
            return []

        data = response.json()
        if data:
            logging.info(f"✅ Fetched {len(data)} records for {date_str}")
        else:
            logging.warning(f"⚠️ No data found for {date_str}")
        return data
    except Exception as e:
        # Deliberately best-effort: a failed day is logged and reported as empty
        # so the sampling loop can move on to the next date.
        logging.error(f"❌ Error fetching data for {date_str}: {e}")
        return []

def sample_and_fetch_random_days(n):
    """Pick `n` random dates and request the ridership data for each of them.

    The fetched records are not stored or returned here; the outcome of every
    request is only reported through the log messages of the fetch helper.

    Args:
        n: Number of dates to sample.
    """
    logging.info("🚀 Starting random full-day sampling from MTA ridership dataset...")
    chosen_dates = get_random_dates(n)
    logging.info(f"🎯 Randomly selected dates: {chosen_dates}")

    for day in chosen_dates:
        fetch_data_for_date(day)
        # Short pause after every request so we stay clear of rate limits.
        time.sleep(0.3)

# Entry point: run one sampling pass with the configured NUM_SAMPLES.
# In a notebook kernel __name__ is "__main__", so this fires every time the cell runs.
if __name__ == "__main__":
    sample_and_fetch_random_days(NUM_SAMPLES)


2025-08-02 20:31:07,133 - INFO - 🚀 Starting random full-day sampling from MTA ridership dataset...
2025-08-02 20:31:07,135 - INFO - 🎯 Randomly selected dates: ['2024-03-27', '2021-06-28', '2020-03-10', '2020-05-11', '2024-11-27']
2025-08-02 20:31:08,051 - INFO - ✅ Fetched 1 records for 2024-03-27
2025-08-02 20:31:08,946 - INFO - ✅ Fetched 1 records for 2021-06-28
2025-08-02 20:31:09,893 - INFO - ✅ Fetched 1 records for 2020-03-10
2025-08-02 20:31:10,610 - INFO - ✅ Fetched 1 records for 2020-05-11
2025-08-02 20:31:11,327 - INFO - ✅ Fetched 1 records for 2024-11-27


In [10]:
import requests
import random
import logging
from datetime import datetime, timedelta
import time
import csv

# 🔧 Config
import os  # only needed for the environment / home-directory lookups below

# Credentials come from the environment so the key pair never lives in the
# notebook (or in any export / commit of it). Set SOCRATA_API_KEY_ID and
# SOCRATA_API_KEY_SECRET before starting the kernel; with the empty-string
# fallback the API is expected to reject the requests, which the fetch code logs.
# The key pair that used to be hardcoded here should be treated as leaked and rotated.
API_KEY_ID = os.environ.get("SOCRATA_API_KEY_ID", "")
API_KEY_SECRET = os.environ.get("SOCRATA_API_KEY_SECRET", "")
BASE_URL = "https://data.ny.gov/resource/wujg-7c2s.json"
NUM_SAMPLES = 5  # Number of random dates to fetch
START_DATE = datetime(2020, 3, 1)  # Earliest date that can be sampled
END_DATE = datetime.today()  # Evaluated once, when this cell runs
# Defaults to <home>/Desktop/mta_random_days.csv for whoever runs the notebook
# (instead of one specific user's home directory); override with MTA_OUTPUT_FILE.
OUTPUT_FILE = os.environ.get(
    "MTA_OUTPUT_FILE",
    os.path.join(os.path.expanduser("~"), "Desktop", "mta_random_days.csv"),
)

# 🪵 Logging setup
# basicConfig does nothing if the root logger already has handlers, so
# re-running this cell in the same kernel keeps the first configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def get_random_dates(n):
    """Return `n` distinct random dates between START_DATE and END_DATE.

    Offsets are sampled without replacement from ``range(span_days)``, with
    ``span_days`` the whole number of days from START_DATE to END_DATE.

    Args:
        n: How many dates to pick.

    Returns:
        list[str]: ``YYYY-MM-DD`` strings in the order they were drawn.

    Raises:
        ValueError: Propagated from ``random.sample`` when `n` is negative or
            exceeds the number of available days.
    """
    span_days = (END_DATE - START_DATE).days
    result = []
    for day_offset in random.sample(range(span_days), n):
        picked_day = START_DATE + timedelta(days=day_offset)
        result.append(picked_day.strftime("%Y-%m-%d"))
    return result

def fetch_data_for_date(date_str, max_retries=3, retry_delay=2):
    """Download every record of one calendar day, paging through the API.

    Pages of 1000 records are requested until a short page marks the end of
    the day. If a page fails (non-200 status or an exception) the download is
    resumed from the same offset on the next attempt.

    Args:
        date_str: Day to fetch, formatted ``YYYY-MM-DD``.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait before the second attempt; doubled after
            every further failed attempt (exponential backoff).

    Returns:
        list: All records of the day, or ``[]`` when the day came back empty
        or could not be downloaded completely within `max_retries` attempts.
        A partially downloaded day is never returned.
    """
    logging.info(f"📅 Fetching all records for {date_str}...")
    limit = 1000  # page size
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    next_date = (datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
    query = {
        # Half-open window: [date_str 00:00, next day 00:00)
        "$where": f"transit_timestamp >= '{date_str}T00:00:00' AND transit_timestamp < '{next_date}T00:00:00'",
        # Offset paging is only reliable with a fixed row order.
        "$order": ":id",
        "$limit": limit,
    }

    all_records = []
    offset = 0
    succeeded = False

    for attempt in range(1, max_retries + 1):
        page_failed = False
        try:
            while True:
                response = requests.get(
                    BASE_URL,
                    params={**query, "$offset": offset},
                    auth=(API_KEY_ID, API_KEY_SECRET),
                    timeout=request_timeout,
                )
                if response.status_code != 200:
                    logging.warning(f"⚠️ API error {response.status_code}: {response.text}")
                    page_failed = True
                    break

                batch = response.json()
                logging.info(f"📦 Retrieved {len(batch)} records at offset {offset}")
                all_records.extend(batch)

                if len(batch) < limit:
                    break  # short page: nothing left for this day
                offset += limit
                time.sleep(0.25)  # politeness delay between pages
        except Exception as e:
            logging.error(f"❌ Exception on attempt {attempt} for {date_str}: {e}")
            page_failed = True

        if not page_failed and all_records:
            succeeded = True
            break

        # Either a page failed (offset still points at it, so the next attempt
        # resumes there) or the day came back empty (offset is still 0).
        if attempt < max_retries:
            if page_failed:
                logging.warning(
                    f"🕒 Attempt {attempt}: Download of {date_str} interrupted at offset {offset}. "
                    f"Retrying after {retry_delay} seconds..."
                )
            else:
                logging.warning(f"🕒 Attempt {attempt}: Received 0 records for {date_str}. Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # exponential backoff

    if not succeeded:
        if all_records:
            logging.error(
                f"🚫 Incomplete data for {date_str} after {max_retries} attempts; "
                f"discarding {len(all_records)} partial records."
            )
        else:
            logging.error(f"🚫 No data retrieved for {date_str} after {max_retries} attempts.")
        return []

    logging.info(f"✅ Finished fetching {len(all_records)} records for {date_str}")
    return all_records



def write_to_csv(data_list, output_file):
    """Write a list of record dicts to `output_file` as a CSV file.

    The header row is the sorted union of the keys of *all* records, so a
    record that lacks some column simply gets an empty cell there.

    Args:
        data_list: Records to write, one dict per row.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        None. Nothing is written when `data_list` is empty, and failures
        while opening or writing the file are logged instead of raised.
    """
    if not data_list:
        logging.warning("⚠️ No data to write.")
        return

    # Collect every key that occurs in any record, then fix a stable column order.
    columns = set()
    for record in data_list:
        columns.update(record.keys())
    fieldnames = sorted(columns)

    try:
        with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            csv_writer.writeheader()
            csv_writer.writerows(data_list)
        logging.info(f"📄 Successfully wrote {len(data_list)} rows to '{output_file}'")
    except Exception as e:
        logging.error(f"❌ Failed to write CSV: {e}")

def sample_and_fetch_random_days(n):
    """Sample `n` random dates, fetch each day's records and save them as CSV.

    Days for which the fetch returns nothing are skipped; the records of all
    other days are concatenated and handed to ``write_to_csv`` together with
    OUTPUT_FILE.

    Args:
        n: Number of dates to sample.
    """
    logging.info("🚀 Starting random full-day sampling from MTA ridership dataset...")
    chosen_dates = get_random_dates(n)
    logging.info(f"🎯 Randomly selected dates: {chosen_dates}")

    combined_records = []
    for day in chosen_dates:
        records = fetch_data_for_date(day)
        if records:
            combined_records += records
        # Pause after every day, successful or not, to avoid hammering the API.
        time.sleep(0.3)

    write_to_csv(combined_records, OUTPUT_FILE)

# Entry point: sample NUM_SAMPLES days, fetch them and write the CSV.
# In a notebook kernel __name__ is "__main__", so this fires every time the cell runs.
if __name__ == "__main__":
    sample_and_fetch_random_days(NUM_SAMPLES)


2025-08-03 10:37:31,869 - INFO - 🚀 Starting random full-day sampling from MTA ridership dataset...
2025-08-03 10:37:31,875 - INFO - 🎯 Randomly selected dates: ['2021-06-02', '2020-12-14', '2023-01-22', '2020-05-06', '2023-03-20']
2025-08-03 10:37:31,875 - INFO - 📅 Fetching all records for 2021-06-02...
2025-08-03 10:37:35,041 - INFO - 📦 Retrieved 1000 records at offset 0
2025-08-03 10:37:36,264 - INFO - 📦 Retrieved 1000 records at offset 1000
2025-08-03 10:37:37,033 - INFO - 📦 Retrieved 1000 records at offset 2000
2025-08-03 10:37:38,819 - INFO - 📦 Retrieved 1000 records at offset 3000
2025-08-03 10:37:39,741 - INFO - 📦 Retrieved 1000 records at offset 4000
2025-08-03 10:37:41,380 - INFO - 📦 Retrieved 1000 records at offset 5000
2025-08-03 10:37:42,194 - INFO - 📦 Retrieved 1000 records at offset 6000
2025-08-03 10:37:43,019 - INFO - 📦 Retrieved 1000 records at offset 7000
2025-08-03 10:37:44,039 - INFO - 📦 Retrieved 1000 records at offset 8000
2025-08-03 10:37:44,857 - INFO - 📦 Retriev

In [11]:
import requests
import random
import logging
from datetime import datetime, timedelta
import time
import csv

# 🔧 Config
import os  # only needed for the environment / home-directory lookups below

# Credentials come from the environment so the key pair never lives in the
# notebook (or in any export / commit of it). Set SOCRATA_API_KEY_ID and
# SOCRATA_API_KEY_SECRET before starting the kernel; with the empty-string
# fallback the API is expected to reject the requests, which the fetch code logs.
# The key pair that used to be hardcoded here should be treated as leaked and rotated.
API_KEY_ID = os.environ.get("SOCRATA_API_KEY_ID", "")
API_KEY_SECRET = os.environ.get("SOCRATA_API_KEY_SECRET", "")
BASE_URL = "https://data.ny.gov/resource/wujg-7c2s.json"
NUM_SAMPLES = 75  # Number of days with data to collect
START_DATE = datetime(2021, 3, 1)  # Earliest date that can be sampled
END_DATE = datetime.today()  # Evaluated once, when this cell runs
# Defaults to <home>/Desktop/mta_random_days.csv for whoever runs the notebook
# (instead of one specific user's home directory); override with MTA_OUTPUT_FILE.
OUTPUT_FILE = os.environ.get(
    "MTA_OUTPUT_FILE",
    os.path.join(os.path.expanduser("~"), "Desktop", "mta_random_days.csv"),
)

# 🪵 Logging setup
# basicConfig does nothing if the root logger already has handlers, so
# re-running this cell in the same kernel keeps the first configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def get_random_date():
    """Return one random date between START_DATE and END_DATE as ``YYYY-MM-DD``.

    ``random.randint`` includes both endpoints, so the day offset ranges from
    0 up to and including the whole number of days between the two dates.
    Repeated calls are independent and may return the same date again.
    """
    span_days = (END_DATE - START_DATE).days
    day_offset = random.randint(0, span_days)
    picked_day = START_DATE + timedelta(days=day_offset)
    return picked_day.strftime("%Y-%m-%d")

def fetch_data_for_date(date_str, max_retries=3, retry_delay=2):
    """Download every record of one calendar day, paging through the API.

    Pages of 1000 records are requested until a short page marks the end of
    the day. If a page fails (non-200 status or an exception) the download is
    resumed from the same offset on the next attempt.

    Args:
        date_str: Day to fetch, formatted ``YYYY-MM-DD``.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait before the second attempt; doubled after
            every further failed attempt (exponential backoff).

    Returns:
        list: All records of the day, or ``[]`` when the day came back empty
        or could not be downloaded completely within `max_retries` attempts.
        A partially downloaded day is never returned.
    """
    logging.info(f"📅 Fetching all records for {date_str}...")
    limit = 1000  # page size
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    next_date = (datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
    query = {
        # Half-open window: [date_str 00:00, next day 00:00)
        "$where": f"transit_timestamp >= '{date_str}T00:00:00' AND transit_timestamp < '{next_date}T00:00:00'",
        # Offset paging is only reliable with a fixed row order.
        "$order": ":id",
        "$limit": limit,
    }

    all_records = []
    offset = 0
    succeeded = False

    for attempt in range(1, max_retries + 1):
        page_failed = False
        try:
            while True:
                response = requests.get(
                    BASE_URL,
                    params={**query, "$offset": offset},
                    auth=(API_KEY_ID, API_KEY_SECRET),
                    timeout=request_timeout,
                )
                if response.status_code != 200:
                    logging.warning(f"⚠️ API error {response.status_code}: {response.text}")
                    page_failed = True
                    break

                batch = response.json()
                logging.info(f"📦 Retrieved {len(batch)} records at offset {offset}")
                all_records.extend(batch)

                if len(batch) < limit:
                    break  # short page: nothing left for this day
                offset += limit
                time.sleep(0.25)  # politeness delay between pages
        except Exception as e:
            logging.error(f"❌ Exception on attempt {attempt} for {date_str}: {e}")
            page_failed = True

        if not page_failed and all_records:
            succeeded = True
            break

        # Either a page failed (offset still points at it, so the next attempt
        # resumes there) or the day came back empty (offset is still 0).
        if attempt < max_retries:
            if page_failed:
                logging.warning(
                    f"🕒 Attempt {attempt}: Download of {date_str} interrupted at offset {offset}. "
                    f"Retrying after {retry_delay} seconds..."
                )
            else:
                logging.warning(f"🕒 Attempt {attempt}: Received 0 records for {date_str}. Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)
            retry_delay *= 2  # exponential backoff

    if not succeeded:
        if all_records:
            logging.error(
                f"🚫 Incomplete data for {date_str} after {max_retries} attempts; "
                f"discarding {len(all_records)} partial records."
            )
        else:
            logging.error(f"🚫 No data retrieved for {date_str} after {max_retries} attempts.")
        return []

    logging.info(f"✅ Finished fetching {len(all_records)} records for {date_str}")
    return all_records

def write_to_csv(data_list, output_file):
    """Write a list of record dicts to `output_file` as a CSV file.

    The header row is the sorted union of the keys found in all records;
    records missing a column get an empty cell for it.

    Args:
        data_list: Records to write, one dict per row.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        None. An empty `data_list` writes nothing, and failures while opening
        or writing the file are logged rather than raised.
    """
    if not data_list:
        logging.warning("⚠️ No data to write.")
        return

    # Gather every key used by any record and sort for a stable column order.
    seen_keys = set()
    for row in data_list:
        seen_keys.update(row.keys())
    fieldnames = sorted(seen_keys)

    try:
        with open(output_file, mode="w", newline="", encoding="utf-8") as out:
            dict_writer = csv.DictWriter(out, fieldnames=fieldnames)
            dict_writer.writeheader()
            dict_writer.writerows(data_list)
        logging.info(f"📄 Successfully wrote {len(data_list)} rows to '{output_file}'")
    except Exception as e:
        logging.error(f"❌ Failed to write CSV: {e}")

def sample_and_fetch_random_days(n, max_attempts=None):
    """Collect data for `n` distinct random days and write it to OUTPUT_FILE.

    Random dates are drawn one at a time; a date whose fetch returns nothing
    is replaced by another draw. Sampling stops as soon as `n` days succeeded
    or no untried date is left, so the loop always terminates even when the
    API is down or fewer than `n` days have data.

    Args:
        n: Number of days with data to collect.
        max_attempts: Optional cap on the number of distinct dates to try.
            ``None`` (default) tries every date in the sampling range if needed.

    Returns:
        None. Whatever was collected is passed to ``write_to_csv``.
    """
    logging.info("🚀 Starting random full-day sampling from MTA ridership dataset...")

    # get_random_date() draws an offset in 0..(END_DATE - START_DATE).days inclusive,
    # so this is the number of distinct dates it can ever return.
    candidate_days = (END_DATE - START_DATE).days + 1
    attempt_budget = candidate_days if max_attempts is None else min(max_attempts, candidate_days)

    successful_dates = set()
    tried_dates = set()
    all_data = []

    while len(successful_dates) < n and len(tried_dates) < attempt_budget:
        date = get_random_date()
        if date in tried_dates:
            continue  # already attempted; draw again

        tried_dates.add(date)
        day_data = fetch_data_for_date(date)
        if day_data:
            all_data.extend(day_data)
            successful_dates.add(date)
        else:
            logging.info(f"🔁 Will sample another day to replace failed date: {date}")

        time.sleep(0.3)  # avoid hammering the API

    if len(successful_dates) < n:
        logging.warning(
            f"⚠️ Only {len(successful_dates)} of {n} requested days could be retrieved "
            f"after trying {len(tried_dates)} dates."
        )

    logging.info(f"🏁 Successfully retrieved data for {len(successful_dates)} unique days: {sorted(successful_dates)}")
    write_to_csv(all_data, OUTPUT_FILE)

# Entry point: collect NUM_SAMPLES days with data and write the CSV.
# In a notebook kernel __name__ is "__main__", so this fires every time the cell runs.
if __name__ == "__main__":
    sample_and_fetch_random_days(NUM_SAMPLES)



2025-08-03 10:48:05,441 - INFO - 🚀 Starting random full-day sampling from MTA ridership dataset...
2025-08-03 10:48:05,442 - INFO - 📅 Fetching all records for 2022-03-30...
2025-08-03 10:48:06,313 - INFO - 📦 Retrieved 1000 records at offset 0
2025-08-03 10:48:07,644 - INFO - 📦 Retrieved 1000 records at offset 1000
2025-08-03 10:48:08,556 - INFO - 📦 Retrieved 1000 records at offset 2000
2025-08-03 10:48:09,413 - INFO - 📦 Retrieved 1000 records at offset 3000
2025-08-03 10:48:10,205 - INFO - 📦 Retrieved 1000 records at offset 4000
2025-08-03 10:48:11,015 - INFO - 📦 Retrieved 1000 records at offset 5000
2025-08-03 10:48:12,042 - INFO - 📦 Retrieved 1000 records at offset 6000
2025-08-03 10:48:12,763 - INFO - 📦 Retrieved 1000 records at offset 7000
2025-08-03 10:48:13,475 - INFO - 📦 Retrieved 1000 records at offset 8000
2025-08-03 10:48:14,302 - INFO - 📦 Retrieved 1000 records at offset 9000
2025-08-03 10:48:15,054 - INFO - 📦 Retrieved 1000 records at offset 10000
2025-08-03 10:48:15,801 - 