# Wayback Machine


## Setup


In [1]:
from pathlib import Path
from urllib.parse import urlparse
import glob
import time
import pandas as pd
import requests

In [2]:
SOURCES = [
    "http://abcnews.go.com/abcnews/topstories",
    "http://feeds.bbci.co.uk/news/rss.xml",
    "http://rss.cnn.com/rss/cnn_latest.rss",
    "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "http://time.com/feed/",
    "https://feeds.nbcnews.com/nbcnews/public/news",
    "https://moxie.foxnews.com/google-publisher/latest.xml",
    "https://nypost.com/feed/",
    "https://www.cbsnews.com/latest/rss/main",
    "https://www.cnbc.com/id/100003114/device/rss/rss.html",
]
DOWNLOAD_ARCHIVES = False

## Utilities


In [3]:
def fetch_wayback_versions(
    url,
    from_timestamp="",
    to_timestamp="",
    max_versions=10,
    include_content=False,
    initial_throttle_seconds=1,  # Initial throttle time in seconds
    max_retries=5,  # Maximum number of retries for HTTP requests
):
    """
    Fetch past versions of a URL from the Wayback Machine and optionally the content,
    with exponential backoff for handling request retries.

    Parameters:
    - url (str): The URL to fetch past versions for.
    - from_timestamp (str): Start timestamp in format YYYYmmddHHMMSS. Empty means no start limit.
    - to_timestamp (str): End timestamp in format YYYYmmddHHMMSS. Empty means no end limit.
    - max_versions (int): Maximum number of versions to fetch.
    - include_content (bool): Whether to fetch the content of the archived URL.
    - initial_throttle_seconds (float): Initial number of seconds to wait before retrying a request. Default is 1.
    - max_retries (int): Maximum number of retries for a request. Default is 5.

    Returns:
    - list of dicts: Each dict contains 'timestamp', 'archive_url', and optionally 'archive_content'.
    """
    print(
        f"Fetching versions for URL: {url} from {from_timestamp} to {to_timestamp} with max versions {max_versions}"
    )

    base_url = "http://web.archive.org/cdx/search/cdx"
    params = {
        "url": url,
        "output": "json",
        "from": from_timestamp,
        "to": to_timestamp,
        "limit": max_versions,
    }
    versions = []

    def make_request_with_exponential_backoff(
        url,
        params=None,
        max_retries=max_retries,
        initial_delay=initial_throttle_seconds,
    ):
        for attempt in range(max_retries):
            try:
                print(
                    f"Attempting request to {url} (Attempt {attempt + 1}/{max_retries})"
                )
                response = requests.get(url, params=params)
                response.raise_for_status()  # Raise an HTTPError if the response was an error
                print("Request successful")
                return response  # Success
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                if attempt < max_retries - 1:
                    sleep_time = initial_delay * (2**attempt)  # Exponential backoff
                    print(f"Retrying in {sleep_time} seconds...")
                    time.sleep(sleep_time)
                else:
                    print("Max retries reached, giving up.")
                    raise  # Reraise the last exception

    try:
        response = make_request_with_exponential_backoff(base_url, params=params)
        data = response.json()

        if not data or len(data) < 2:
            print("No data found for the given URL and timestamps.")
            return []

        for item in data[1:]:
            version_info = {
                "timestamp": item[1],
                "url": url,
                "archive_url": f"https://web.archive.org/web/{item[1]}/{url}",
            }
            print(f"Found version: {version_info['timestamp']}")
            if include_content:
                try:
                    print(f"Fetching content for {version_info['archive_url']}")
                    content_response = make_request_with_exponential_backoff(
                        version_info["archive_url"]
                    )
                    version_info["archive_content"] = content_response.text
                    print("Content fetched successfully")
                except requests.RequestException as e:
                    print(
                        f"Error fetching content for {version_info['archive_url']}: {e}"
                    )
                    version_info["archive_content"] = "Error fetching content"
            versions.append(version_info)

        return versions
    except requests.RequestException as e:
        print(f"Error fetching data from Wayback Machine: {e}")

        return versions


def load_feeds_as_dataframe(glob_pattern):
    """
    Load all RSS feeds matching a glob pattern into a single Pandas DataFrame.

    Parameters:
    - glob_pattern (str): The glob pattern to match files.

    Returns:
    - DataFrame: A Pandas DataFrame containing the concatenated data from all matched XML files.
    """

    files = glob.glob(glob_pattern)

    dfs = []

    for file in files:
        df = pd.read_xml(file, xpath=".//item")

        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    return combined_df


def save_content(file_path, content):
    """
    Saves content to a file, creating parent directories if they don't exist.

    Parameters:
    - file_path (str): The path to the file where content will be saved.
    - content (str): The content to save to the file.
    """
    # Convert the file_path string to a Path object
    path = Path(file_path)

    path.parent.mkdir(parents=True, exist_ok=True)

    with path.open(mode="w", encoding="utf-8") as file:
        file.write(content)


def url_to_filepath(url, base_dir="web_content", timestamp=None):
    """
    Converts a URL to a file path, using a base directory, and optionally includes a timestamp
    for versioning.

    Parameters:
    - url (str): The URL to convert.
    - base_dir (str): The base directory where the content will be stored.
    - timestamp (str, optional): A timestamp string to include in the file path for versioning.

    Returns:
    - Path: A pathlib.Path object representing the file path.
    """
    parsed_url = urlparse(url)

    domain_name = parsed_url.netloc
    path = parsed_url.path

    clean_path = path.strip("/")

    path_parts = clean_path.split("/") if clean_path else []

    # Construct the file path from the base directory, domain name, and path parts
    if path_parts:
        # Extract the last part as the filename
        filename = path_parts.pop()
        # Identify the file extension
        name, dot, extension = filename.partition(".")
        # Insert the timestamp before the extension (if any)
        if timestamp:
            filename = (
                f"{name}{dot}{timestamp}{dot}{extension}"
                if dot
                else f"{name}{dot}{timestamp}"
            )
        else:
            filename = f"{name}{dot}{extension}"
        path_parts.append(filename)

    filepath = Path(base_dir, domain_name, *path_parts)

    return filepath

## Main


### Download Archives


In [4]:
if DOWNLOAD_ARCHIVES:
    versions = []
    for source in SOURCES:
        source_versions = fetch_wayback_versions(
            source,
            max_versions=5,
            include_content=True,
            initial_throttle_seconds=2,
            max_retries=2,
        )
        versions.extend(source_versions)
    for version in versions:
        url = version["url"]
        timestamp = version["timestamp"]
        content = version["archive_content"]
        filepath = url_to_filepath(url, base_dir="data", timestamp=timestamp)
        save_content(filepath, content)

### Show Headlines


In [9]:
abc_feeds = load_feeds_as_dataframe("./data/abcnews.go.com/abcnews/*")
bbc_feeds = load_feeds_as_dataframe("./data/feeds.bbci.co.uk/news/*.xml")
cnn_feeds = load_feeds_as_dataframe("./data/rss.cnn.com/rss/*.rss")

XMLSyntaxError: Start tag expected, '<' not found, line 1, column 1 (<string>, line 1)

In [None]:
cnn_feeds

Unnamed: 0,title,link,description,pubDate
0,"Australia, Indonesia eye security",http://www.cnn.com/rssclick/2005/WORLD/asiapcf...,Indonesian President Susilo Bambang Yudhoyono ...,"Sun, 03 Apr 2005 19:27:06 EDT"
1,Your e-mails: Europe,http://www.cnn.com/rssclick/2005/WORLD/europe/...,CNN.com asked its users to share their views o...,"Sun, 03 Apr 2005 18:34:21 EDT"
2,Your e-mails: South America,http://www.cnn.com/rssclick/2005/WORLD/europe/...,CNN.com asked its users to share their views o...,"Sun, 03 Apr 2005 18:32:18 EDT"
3,Marseille deny PSG ninth victory,http://www.cnn.com/rssclick/2005/SPORT/footbal...,Read full story for latest details.,"Sun, 03 Apr 2005 18:30:27 EDT"
4,Pope John Paul II: CNN video log,http://www.cnn.com/rssclick/2005/WORLD/europe/...,Follow CNN's video coverage of the death of Po...,"Sun, 03 Apr 2005 18:23:42 EDT"
5,Gera dents Everton European hopes,http://www.cnn.com/rssclick/2005/SPORT/footbal...,Hungarian Zoltan Gera scored a superb headed w...,"Sun, 03 Apr 2005 18:19:53 EDT"
6,Poll: U.S. Catholics would support changes,http://www.cnn.com/rssclick/2005/US/04/03/pope...,A majority of U.S. Catholics surveyed want the...,"Sun, 03 Apr 2005 18:14:27 EDT"
7,Collins swings Test to West Indies,http://www.cnn.com/rssclick/2005/SPORT/04/03/c...,Read full story for latest details.,"Sun, 03 Apr 2005 18:13:04 EDT"
8,Barcelona salvage Real Betis draw,http://www.cnn.com/rssclick/2005/SPORT/footbal...,Read full story for latest details.,"Sun, 03 Apr 2005 17:56:46 EDT"
9,'Sin City' beats up chick-flick at box office,http://www.cnn.com/rssclick/2005/SHOWBIZ/Movie...,Read full story for latest details.,"Sun, 03 Apr 2005 17:48:07 EDT"
