Scrapped from: https://samples.adsbexchange.com/readsb-hist/2024/

In [1]:
import requests
import gzip
import json
from urllib.parse import urljoin

In [2]:
def get_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

def parse_links(html, base_url):
    links = set()
    for line in html.splitlines():
        if 'href="' in line and '.json.gz' in line:
            start_index = line.find('href="') + len('href="')
            end_index = line.find('.json.gz') + len('.json.gz')
            link = urljoin(base_url, line[start_index:end_index])
            links.add(link)
    return links

def download_json(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"Error processing {url}: {e}")
        return None

def scrape_directory(start_url, limit=300):
    html = get_page(start_url)
    if html is None:
        return []

    links = parse_links(html, start_url)
    all_data = []
    count = 0

    for link in links:
        if count >= limit:
            break
        print(f"Processing: {link}")
        json_data = download_json(link)
        if json_data:
            all_data.append(json_data)
            count += 1
    
    return all_data

In [3]:
if __name__ == "__main__":
    start_url = "https://samples.adsbexchange.com/readsb-hist/2024/02/01/"
    all_data = scrape_directory(start_url)
    print(f"Scraped {len(all_data)} JSON files.")
    with open('scraped_data.json', 'w') as f:
        json.dump(all_data, f, indent=4)

Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/092105Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/063250Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/190300Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/085310Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/044605Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/195835Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/053345Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/234710Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/102425Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/045900Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/204550Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/072805Z.

Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/003400Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/171545Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/194815Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/141620Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/140910Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/154150Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/085750Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/192120Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/021155Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/121510Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/051110Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/132525Z.

Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/175625Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/231555Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/004950Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/033200Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/000935Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/041120Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/151650Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/154145Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/052715Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/192440Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/124055Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/035335Z.

Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/224100Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/013945Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/141525Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/173400Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/085130Z.json.gz
Processing: https://samples.adsbexchange.com/readsb-hist/2024/02/01/150605Z.json.gz
Scraped 300 JSON files.
