In [12]:
# I simply downloaded the data directly from NYC Open Data.


In [2]:
import csv
import logging
import os
import random
import time
from datetime import datetime, timedelta

import requests

# 🔧 Config
# Socrata credentials are read from the environment so they never live in the
# notebook or its version history. Set both variables before starting the
# kernel, e.g.
#   export SOCRATA_API_KEY_ID=...
#   export SOCRATA_API_KEY_SECRET=...
# A missing variable raises KeyError here, before any request is made.
# NOTE(review): the key pair that was previously hardcoded in this cell has
# been exposed and should be revoked and replaced.
API_KEY_ID = os.environ["SOCRATA_API_KEY_ID"]
API_KEY_SECRET = os.environ["SOCRATA_API_KEY_SECRET"]
BASE_URL = "https://data.ny.gov/resource/wujg-7c2s.json"
NUM_SAMPLES = 20
START_DATE = datetime(2021, 3, 1)
END_DATE = datetime.today()
# Relative path (resolved against the kernel's working directory) keeps the
# notebook portable; sample_and_fetch_random_days() writes its CSV here.
OUTPUT_FILE = f"mta_random_days_{NUM_SAMPLES}.csv"

# 🪵 Logging setup
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def get_random_date(start=None, end=None, rng=None):
    """Pick a random calendar day from an inclusive date range.

    Args:
        start: First candidate day (``datetime``). Defaults to ``START_DATE``.
        end: Last candidate day (``datetime``). Defaults to ``END_DATE``.
        rng: Object exposing ``randint(a, b)``, e.g. ``random.Random(seed)``
            for reproducible sampling. Defaults to the ``random`` module.

    Returns:
        The chosen day formatted as ``"YYYY-MM-DD"``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    start = START_DATE if start is None else start
    end = END_DATE if end is None else end
    rng = random if rng is None else rng

    total_days = (end - start).days
    if total_days < 0:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")

    # randint is inclusive on both ends, so both `start` and `end` can be drawn.
    random_day = rng.randint(0, total_days)
    return (start + timedelta(days=random_day)).strftime("%Y-%m-%d")

def fetch_data_for_date(date_str, max_retries=3, retry_delay=2, timeout=30):
    """Download every record whose transit_timestamp falls on one calendar day.

    The endpoint is paged 1000 rows at a time with an explicit ``$order`` so
    that ``$offset`` paging walks a stable row sequence. A failed page (non-200
    response or exception) is retried from the same offset on the next
    attempt, keeping the pages already downloaded.

    Args:
        date_str: Day to fetch, formatted ``"YYYY-MM-DD"``.
        max_retries: Total number of attempts allowed for the whole day.
        retry_delay: Seconds to wait before the next attempt; doubled after
            every failed attempt.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The list of decoded JSON records for the complete day, or ``[]`` when
        the day came back empty or could not be downloaded completely within
        ``max_retries`` attempts. A partially downloaded day is never returned.
    """
    logging.info(f"📅 Fetching all records for {date_str}...")
    limit = 1000

    next_date = (datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
    base_params = {
        "$where": f"transit_timestamp >= '{date_str}T00:00:00' AND transit_timestamp < '{next_date}T00:00:00'",
        # Without an explicit order, consecutive pages may overlap or skip rows.
        "$order": ":id",
        "$limit": limit,
    }

    all_records = []
    offset = 0
    complete = False  # True only once a short (final) page has been received

    for attempt in range(1, max_retries + 1):
        try:
            while not complete:
                response = requests.get(
                    BASE_URL,
                    params={**base_params, "$offset": offset},
                    auth=(API_KEY_ID, API_KEY_SECRET),
                    timeout=timeout,
                )
                if response.status_code != 200:
                    logging.warning(f"⚠️ API error {response.status_code}: {response.text}")
                    break

                batch = response.json()
                logging.info(f"📦 Retrieved {len(batch)} records at offset {offset}")
                all_records.extend(batch)

                if len(batch) < limit:
                    complete = True
                else:
                    # offset only advances after the page is stored, so a
                    # later failure resumes at the first missing page.
                    offset += limit
                    time.sleep(0.25)  # politeness delay
        except Exception as e:
            logging.error(f"❌ Exception on attempt {attempt} for {date_str}: {e}")

        if complete and all_records:
            logging.info(f"✅ Finished fetching {len(all_records)} records for {date_str}")
            return all_records

        if attempt < max_retries:
            reason = "Received 0 records" if complete else f"Download incomplete at offset {offset}"
            logging.warning(
                f"🕒 Attempt {attempt}: {reason} for {date_str}. Retrying after {retry_delay} seconds..."
            )
            time.sleep(retry_delay)
            retry_delay *= 2

        # An empty-but-complete day is queried again from the start.
        complete = False

    if all_records:
        logging.error(
            f"🚫 Incomplete download for {date_str} after {max_retries} attempts; "
            f"discarding {len(all_records)} partial records."
        )
    else:
        logging.error(f"🚫 No data retrieved for {date_str} after {max_retries} attempts.")
    return []

def write_to_csv(data_list, output_file):
    """Dump a list of record dicts to a CSV file.

    The header row is the sorted union of every key found across the records;
    a record that lacks a column gets an empty cell for it. When ``data_list``
    is empty nothing is written and a warning is logged. Any exception raised
    while writing is logged instead of propagated.

    Args:
        data_list: List of dicts, one per output row.
        output_file: Destination path; an existing file is overwritten.
    """
    if not data_list:
        logging.warning("⚠️ No data to write.")
        return

    # Collect the column set first so the header covers heterogeneous records.
    columns = set()
    for record in data_list:
        columns.update(record.keys())
    header = sorted(columns)

    try:
        with open(output_file, mode="w", newline="", encoding="utf-8") as handle:
            csv_writer = csv.DictWriter(handle, fieldnames=header)
            csv_writer.writeheader()
            for record in data_list:
                csv_writer.writerow(record)
    except Exception as err:
        logging.error(f"❌ Failed to write CSV: {err}")
    else:
        logging.info(f"📄 Successfully wrote {len(data_list)} rows to '{output_file}'")

def sample_and_fetch_random_days(n, output_file=None):
    """Fetch full-day data for ``n`` distinct random days and write one CSV.

    Days are drawn with get_random_date(); a day whose fetch returns no data
    is replaced by another draw. Sampling stops early, with a warning, once
    every day between START_DATE and END_DATE has been tried, so the loop
    always terminates.

    Args:
        n: Number of distinct days with data to collect.
        output_file: CSV path to write. Defaults to the module-level
            ``OUTPUT_FILE`` when it is defined, otherwise to
            ``mta_random_days_<n>.csv`` in the working directory.
    """
    # Resolve the destination before downloading anything, so a missing
    # OUTPUT_FILE cannot fail the run after all the data has been fetched.
    if output_file is None:
        output_file = globals().get("OUTPUT_FILE", f"mta_random_days_{n}.csv")

    logging.info("🚀 Starting random full-day sampling from MTA ridership dataset...")

    # Number of distinct days get_random_date() can return (range is inclusive).
    candidate_days = (END_DATE - START_DATE).days + 1

    successful_dates = set()
    tried_dates = set()
    all_data = []

    while len(successful_dates) < n and len(tried_dates) < candidate_days:
        date = get_random_date()
        if date in tried_dates:
            continue

        tried_dates.add(date)
        day_data = fetch_data_for_date(date)
        if day_data:
            all_data.extend(day_data)
            successful_dates.add(date)
        else:
            logging.info(f"🔁 Will sample another day to replace failed date: {date}")

        time.sleep(0.3)

    if len(successful_dates) < n:
        logging.warning(
            f"⚠️ Only {len(successful_dates)} of {n} requested days returned data "
            f"after trying all {candidate_days} candidate days."
        )

    logging.info(f"🏁 Successfully retrieved data for {len(successful_dates)} unique days: {sorted(successful_dates)}")
    write_to_csv(all_data, output_file)

# Entry point: starts the sampling run for NUM_SAMPLES days when this code is
# executed as a script or as a notebook cell (where __name__ is also
# "__main__"). Running it performs the network downloads immediately.
if __name__ == "__main__":
    sample_and_fetch_random_days(NUM_SAMPLES)


2025-08-03 12:47:42,578 - INFO - 🚀 Starting random full-day sampling from MTA ridership dataset...
2025-08-03 12:47:42,587 - INFO - 📅 Fetching all records for 2021-05-14...
2025-08-03 12:47:44,288 - INFO - 📦 Retrieved 1000 records at offset 0
2025-08-03 12:47:45,403 - INFO - 📦 Retrieved 1000 records at offset 1000
2025-08-03 12:47:46,511 - INFO - 📦 Retrieved 1000 records at offset 2000
2025-08-03 12:47:47,235 - INFO - 📦 Retrieved 1000 records at offset 3000
2025-08-03 12:47:48,038 - INFO - 📦 Retrieved 1000 records at offset 4000
2025-08-03 12:47:48,977 - INFO - 📦 Retrieved 1000 records at offset 5000
2025-08-03 12:47:49,672 - INFO - 📦 Retrieved 1000 records at offset 6000
2025-08-03 12:47:50,302 - INFO - 📦 Retrieved 1000 records at offset 7000
2025-08-03 12:47:51,125 - INFO - 📦 Retrieved 1000 records at offset 8000
2025-08-03 12:47:51,819 - INFO - 📦 Retrieved 1000 records at offset 9000
2025-08-03 12:47:52,658 - INFO - 📦 Retrieved 1000 records at offset 10000
2025-08-03 12:47:53,478 - 