# Wayback Machine


## Setup


In [11]:
from pathlib import Path
from urllib.parse import urlparse
import requests

## Utilities


In [24]:
def fetch_wayback_versions(
    url, from_timestamp="", to_timestamp="", max_versions=10, include_content=False
):
    """
    Fetch past versions of a URL from the Wayback Machine and optionally the content.

    Parameters:
    - url (str): The URL to fetch past versions for.
    - from_timestamp (str): Start timestamp in format YYYYmmddHHMMSS. Empty means no start limit.
    - to_timestamp (str): End timestamp in format YYYYmmddHHMMSS. Empty means no end limit.
    - max_versions (int): Maximum number of versions to fetch.
    - include_content (bool): Whether to fetch the content of the archived URL.

    Returns:
    - list of dicts: Each dict contains 'timestamp', 'archive_url', and optionally 'archive_content'.
    """
    base_url = "http://web.archive.org/cdx/search/cdx"
    params = {
        "url": url,
        "output": "json",
        "from": from_timestamp,
        "to": to_timestamp,
        "limit": max_versions,
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()

        if not data or len(data) < 2:
            return []

        versions = []
        for item in data[1:]:
            version_info = {
                "timestamp": item[1],
                "url": url,
                "archive_url": f"https://web.archive.org/web/{item[1]}/{url}",
            }
            if include_content:
                content_response = requests.get(version_info["archive_url"])
                content_response.raise_for_status()
                version_info["archive_content"] = content_response.text
            versions.append(version_info)

        return versions
    except requests.RequestException as e:
        print(f"Error fetching data from Wayback Machine: {e}")
        return []


def save_content(file_path, content):
    """
    Saves content to a file, creating parent directories if they don't exist.

    Parameters:
    - file_path (str): The path to the file where content will be saved.
    - content (str): The content to save to the file.
    """
    # Convert the file_path string to a Path object
    path = Path(file_path)

    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open(mode="w", encoding="utf-8") as file:
        file.write(content)


def url_to_filepath(url, base_dir="web_content", timestamp=None):
    """
    Converts a URL to a file path, using a base directory, and optionally includes a timestamp
    for versioning.

    Parameters:
    - url (str): The URL to convert.
    - base_dir (str): The base directory where the content will be stored.
    - timestamp (str, optional): A timestamp string to include in the file path for versioning.

    Returns:
    - Path: A pathlib.Path object representing the file path.
    """
    parsed_url = urlparse(url)

    domain_name = parsed_url.netloc
    path = parsed_url.path

    clean_path = path.strip("/")

    path_parts = clean_path.split("/") if clean_path else []

    # Construct the file path from the base directory, domain name, and path parts
    if path_parts:
        # Extract the last part as the filename
        filename = path_parts.pop()
        # Identify the file extension
        name, dot, extension = filename.partition(".")
        # Insert the timestamp before the extension (if any)
        if timestamp:
            filename = (
                f"{name}{dot}{timestamp}{dot}{extension}"
                if dot
                else f"{name}{dot}{timestamp}"
            )
        else:
            filename = f"{name}{dot}{extension}"
        path_parts.append(filename)

    filepath = Path(base_dir, domain_name, *path_parts)

    return filepath

## Main


In [13]:
url_to_check = "http://rss.cnn.com/rss/cnn_latest.rss"
versions = fetch_wayback_versions(url_to_check, max_versions=5, include_content=True)
for version in versions:
    print(version)

{'timestamp': '20050209094655', 'url': 'http://rss.cnn.com/rss/cnn_latest.rss', 'archive_url': 'https://web.archive.org/web/20050209094655/http://rss.cnn.com/rss/cnn_latest.rss', 'archive_content': '<?xml version="1.0" encoding="ISO-8859-1"?>\n<rss version="2.0">\n<channel>\n<title>CNN.com Recently Published/Updated</title>\n<link>http://www.cnn.com/rssclick/?section=cnn_latest</link>\n<description>CNN.com delivers up-to-the-minute news and information on the latest top stories, weather, entertainment, politics and more.</description>\n<language>en-us</language>\n<copyright>© 2005 Cable News Network LP, LLLP.</copyright>\n<pubDate>Wed, 09 Feb 2005 04:46:00 EST</pubDate>\n<ttl>10</ttl>\n<image>\n<title>CNN.com Recently Published/Updated</title>\n<link>http://www.cnn.com/rssclick/?section=cnn_latest</link>\n<url>http://i.cnn.net/cnn/.element/img/1.0/logo/cnn.logo.rss.gif</url>\n<width>144</width>\n<height>33</height>\n<description>CNN.com delivers up-to-the-minute news and information on

In [27]:
for version in versions:
    url = version["url"]
    timestamp = version["timestamp"]
    content = version["archive_content"]
    filepath = url_to_filepath(url, base_dir="data", timestamp=timestamp)
    print(f"URL: {url}\nFile Path: {filepath}")
    save_content(filepath, content)

URL: http://rss.cnn.com/rss/cnn_latest.rss
File Path: data/rss.cnn.com/rss/cnn_latest.20050209094655.rss
URL: http://rss.cnn.com/rss/cnn_latest.rss
File Path: data/rss.cnn.com/rss/cnn_latest.20050209094655.rss
URL: http://rss.cnn.com/rss/cnn_latest.rss
File Path: data/rss.cnn.com/rss/cnn_latest.20050403232743.rss
URL: http://rss.cnn.com/rss/cnn_latest.rss
File Path: data/rss.cnn.com/rss/cnn_latest.20050403232743.rss
URL: http://rss.cnn.com/rss/cnn_latest.rss
File Path: data/rss.cnn.com/rss/cnn_latest.20050407112433.rss
