# Wayback Machine


## Setup


In [2]:
import requests

## Utilities


In [4]:
def fetch_wayback_versions(
    url, from_timestamp="", to_timestamp="", max_versions=10, include_content=False
):
    """
    Fetch past versions of a URL from the Wayback Machine and optionally the content.

    Parameters:
    - url (str): The URL to fetch past versions for.
    - from_timestamp (str): Start timestamp in format YYYYmmddHHMMSS. Empty means no start limit.
    - to_timestamp (str): End timestamp in format YYYYmmddHHMMSS. Empty means no end limit.
    - max_versions (int): Maximum number of versions to fetch.
    - include_content (bool): Whether to fetch the content of the archived URL.

    Returns:
    - list of dicts: Each dict contains 'timestamp', 'archive_url', and optionally 'archive_content'.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"
    params = {
        "url": url,
        "output": "json",
        "from": from_timestamp,
        "to": to_timestamp,
        "limit": max_versions,
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an HTTPError if the response was an error
        data = response.json()

        if not data or len(data) < 2:
            return []

        versions = []
        for item in data[1:]:
            version_info = {
                "timestamp": item[1],
                "archive_url": f"https://web.archive.org/web/{item[1]}/{url}",
            }
            if include_content:
                content_response = requests.get(version_info["archive_url"])
                content_response.raise_for_status()  # Raise an HTTPError if the response was an error
                version_info["archive_content"] = content_response.text
            versions.append(version_info)

        return versions
    except requests.RequestException as e:
        print(f"Error fetching data from Wayback Machine: {e}")
        return []

## Main


In [5]:
url_to_check = "http://rss.cnn.com/rss/cnn_latest.rss"
versions = fetch_wayback_versions(url_to_check, max_versions=5)
for version in versions:
    print(version)

{'timestamp': '20050209094655', 'archive_url': 'https://web.archive.org/web/20050209094655/http://rss.cnn.com/rss/cnn_latest.rss'}
{'timestamp': '20050209094655', 'archive_url': 'https://web.archive.org/web/20050209094655/http://rss.cnn.com/rss/cnn_latest.rss'}
{'timestamp': '20050403232743', 'archive_url': 'https://web.archive.org/web/20050403232743/http://rss.cnn.com/rss/cnn_latest.rss'}
{'timestamp': '20050403232743', 'archive_url': 'https://web.archive.org/web/20050403232743/http://rss.cnn.com/rss/cnn_latest.rss'}
{'timestamp': '20050407112433', 'archive_url': 'https://web.archive.org/web/20050407112433/http://rss.cnn.com/rss/cnn_latest.rss'}
