In [1]:
import pandas as pd
import ast
from tqdm import tqdm
# Do not truncate cell contents when frames are displayed (the snapshot URLs below are long).
pd.set_option("display.max_colwidth", None)
# Data directory, relative to this notebook; file names are appended to it by plain
# string concatenation, so the trailing slash is required.
data_path = "./../../data/"

In [2]:
# Load the snapshot lookup results and keep only rows that actually have a snapshot
# (rows whose `archived_snapshots` cell is the literal string '{}' are dropped).
TARGET_PATH = data_path + "archived_snapshots_urls.csv"
recorded_urls = pd.read_csv(TARGET_PATH)
has_snapshot = recorded_urls["archived_snapshots"] != '{}'
recorded_urls = recorded_urls.loc[has_snapshot].reset_index(drop=True)

# The column holds Python-literal dict strings: parse them, then flatten the nested
# dicts into dotted columns such as `closest.url`.
recorded_urls["archived_snapshots"] = recorded_urls["archived_snapshots"].apply(ast.literal_eval)
archived_snapshots_expanded = pd.json_normalize(recorded_urls["archived_snapshots"])

# Swap the raw dict column for its flattened form and keep one row per snapshot URL.
recorded_urls = (
    recorded_urls
    .drop(columns=["archived_snapshots"])
    .join(archived_snapshots_expanded)
    .drop_duplicates("closest.url")
    .reset_index(drop=True)
)


import re

def parse_about_url(u: str) -> str:
    """Rewrite a Wayback Machine snapshot URL so that it targets the site's ``/about`` page.

    The input is expected to embed the archived target after the snapshot prefix,
    e.g. ``http://web.archive.org/web/20221122080738/https://mastodon.xyz/``.
    The result for that input is
    ``http://web.archive.org/web/20221122080738if_/https://mastodon.xyz/about``.

    Args:
        u: Snapshot URL containing an embedded ``/http:/`` or ``/https:/`` target.

    Returns:
        The URL with ``if_`` appended to the ``/web/<digits>`` segment (only when that
        segment ends in at least two digits), trailing slashes removed, and ``/about``
        appended unless the URL already ends with it.

    Raises:
        ValueError: If ``u`` contains no embedded ``/http:/`` or ``/https:/`` target.
    """
    m = re.search(r"/https?:/", u)
    if m is None:
        # Without this guard the failure is an opaque AttributeError on `m.group`.
        raise ValueError(f"No embedded '/http:/' or '/https:/' target found in URL: {u!r}")

    token = m.group(0)  # "/http:/" or "/https:/"
    # Split around the first (leftmost) occurrence of the token.
    a, b = u[:m.start()], u[m.end():]

    # add if_ only if the /web/<segment> ends with >=2 digits
    if re.search(r"/web/\d{2,}$", a):
        a += "if_"

    out = (a + token + b).rstrip("/")
    return out if out.endswith("/about") else out + "/about"

# Derive, for every snapshot, the "if_"-style URL that points at the /about page.
closest_snapshot_urls = recorded_urls["closest.url"]
recorded_urls["if_format_url"] = closest_snapshot_urls.apply(parse_about_url)

# Persist the enriched table, then display it as the cell result.
snapshots_csv_path = data_path + "snapshots_urls (web.archive.org).csv"
recorded_urls.to_csv(snapshots_csv_path)
recorded_urls

Unnamed: 0,url,timestamp,closest.status,closest.available,closest.url,closest.timestamp,if_format_url
0,mastodon.xyz,20220101000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738,http://web.archive.org/web/20221122080738if_/https://mastodon.xyz/about
1,mastodon.xyz,20230101000000,200,True,http://web.archive.org/web/20230101003413/https://mastodon.xyz/,20230101003413,http://web.archive.org/web/20230101003413if_/https://mastodon.xyz/about
2,mastodon.xyz,20230701000000,200,True,http://web.archive.org/web/20230701151143/https://mastodon.xyz/,20230701151143,http://web.archive.org/web/20230701151143if_/https://mastodon.xyz/about
3,mastodon.xyz,20231001000000,200,True,http://web.archive.org/web/20230924065620/https://mastodon.xyz/,20230924065620,http://web.archive.org/web/20230924065620if_/https://mastodon.xyz/about
4,mastodon.xyz,20240101000000,200,True,http://web.archive.org/web/20231229195858/https://mastodon.xyz/,20231229195858,http://web.archive.org/web/20231229195858if_/https://mastodon.xyz/about
...,...,...,...,...,...,...,...
3638,fluffy.family,20250401000000,200,True,http://web.archive.org/web/20250318155136/https://fluffy.family/,20250318155136,http://web.archive.org/web/20250318155136if_/https://fluffy.family/about
3639,fluffy.family,20250701000000,200,True,http://web.archive.org/web/20250623052106/https://fluffy.family/,20250623052106,http://web.archive.org/web/20250623052106if_/https://fluffy.family/about
3640,fluffy.family,20251001000000,200,True,http://web.archive.org/web/20250917194532/https://fluffy.family/,20250917194532,http://web.archive.org/web/20250917194532if_/https://fluffy.family/about
3641,toot.garden,20220101000000,200,True,http://web.archive.org/web/20221119084052/https://toot.garden/,20221119084052,http://web.archive.org/web/20221119084052if_/https://toot.garden/about


In [3]:
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait


def make_driver(headless: bool = True) -> webdriver.Chrome:
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When truthy, Chrome is started with ``--headless=new``.

    Returns:
        A Chrome driver with a fixed window size, a desktop Chrome user-agent string
        and a page-load timeout of 60.
    """
    user_agent_flag = (
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    # Flags are applied in this order; the headless flag, when requested, comes first.
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags += [
        "--window-size=1400,900",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        user_agent_flag,
    ]

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(60)
    return browser


def _wait_for_about_ui(driver, timeout=45):
    """Block until the about page UI is present in the DOM.

    The page counts as ready once a ``.scrollable.about`` element exists and at
    least two ``.about__section__title`` elements are found. The wait is bounded
    by ``timeout``.
    """
    about_ui_ready_js = """
        const container = document.querySelector('.scrollable.about');
        const titles = document.querySelectorAll('.about__section__title');
        return !!container && titles.length >= 2;
    """

    def _about_ui_ready(d):
        # Evaluated repeatedly by WebDriverWait until it returns a truthy value.
        return d.execute_script(about_ui_ready_js)

    WebDriverWait(driver, timeout).until(_about_ui_ready)

def _scroll_about_container_to_bottom(driver, settle_pause=0.35, max_steps=60):
    """Scroll the ``.scrollable.about`` container down until it stops moving.

    Each step sets the container's ``scrollTop`` to its ``scrollHeight`` and then
    sleeps for ``settle_pause`` seconds. Scrolling ends as soon as the page reports
    that the bottom is reached, that the position did not change, or that the
    container does not exist; it also ends after ``max_steps`` attempts.
    """
    for _attempt in range(max_steps):
        finished = driver.execute_script("""
            const sc = document.querySelector('.scrollable.about');
            if (!sc) return true;
            const prevTop = sc.scrollTop;
            sc.scrollTop = sc.scrollHeight;
            const atBottom = (sc.scrollTop + sc.clientHeight >= sc.scrollHeight - 2);
            const stuck = (sc.scrollTop === prevTop);
            return atBottom || stuck;
        """)
        # Pause after every attempt, including the final one, before deciding.
        time.sleep(settle_pause)
        if finished:
            return


def _ensure_section_active_and_loaded(driver, title_text: str, timeout=25):
    """Expand the about-page section whose title contains ``title_text`` and wait for its body.

    Steps: scroll the about container to the bottom, click the section title unless
    the section already carries the ``active`` class, wait (bounded by ``timeout``)
    until the section is active and its body holds more than 10 characters of
    trimmed text, then set the section's class attribute to exactly
    ``about__section active``.

    Raises:
        RuntimeError: If the section is not active and no title element containing
            ``title_text`` could be found to click.
    """
    # The container is scrolled to its end before any title is clicked.
    _scroll_about_container_to_bottom(driver)

    # Ask the page whether the section is already expanded; if so, no click is sent.
    probe_active_js = """
        const title = arguments[0];
        const el = Array.from(document.querySelectorAll('.about__section__title'))
          .find(x => (x.innerText || '').trim().includes(title));
        const sec = el?.closest('.about__section');
        return !!(sec && sec.classList.contains('active'));
    """
    already_active = driver.execute_script(probe_active_js, title_text)

    if not already_active:
        click_title_js = """
            const title = arguments[0];
            const el = Array.from(document.querySelectorAll('.about__section__title'))
              .find(x => (x.innerText || '').trim().includes(title));
            if (!el) return false;
            el.scrollIntoView({block:'center'});
            el.click();
            return true;
        """
        was_clicked = driver.execute_script(click_title_js, title_text)
        if not was_clicked:
            raise RuntimeError(f"Could not find/click section title: {title_text}")

    section_loaded_js = """
        const title = arguments[0];
        const el = Array.from(document.querySelectorAll('.about__section__title'))
          .find(x => (x.innerText || '').trim().includes(title));
        if (!el) return false;
        const sec = el.closest('.about__section');
        if (!sec) return false;
        const body = sec.querySelector('.about__section__body');
        if (!body) return false;
        return sec.classList.contains('active') && (body.innerText || '').trim().length > 10;
    """

    def _section_loaded(d):
        # Truthy once the section is active and its body has rendered some text.
        return d.execute_script(section_loaded_js, title_text)

    WebDriverWait(driver, timeout).until(_section_loaded)

    # Replace the section's class attribute with the exact string below.
    pin_class_js = """
        const title = arguments[0];
        const el = Array.from(document.querySelectorAll('.about__section__title'))
          .find(x => (x.innerText || '').trim().includes(title));
        const sec = el?.closest('.about__section');
        if (sec) sec.className = 'about__section active';
    """
    driver.execute_script(pin_class_js, title_text)


def fetch_wayback_rendered_html(
    original_about_url: str,  # loaded by the browser as-is, e.g. a Wayback "if_" replay URL of an /about page
    section_titles=("Server rules", "Moderated servers"),
    per_section_sleep=1.0,
    max_captures_to_try=8,
    debug_prefix="wayback_debug",
):
    """Render an about page in headless Chrome and return its HTML with sections expanded.

    A fresh driver is created for every call and always quit before returning or
    raising.

    Args:
        original_about_url: URL passed unchanged to ``driver.get``.
        section_titles: Titles (matched by substring) of the sections to expand, in order.
        per_section_sleep: Seconds to sleep after each section has been expanded.
        max_captures_to_try: Currently unused; kept so existing callers keep working.
        debug_prefix: Path prefix for the ``_capture.html`` / ``_capture.png`` debug
            files written when rendering fails. The file names are fixed, so every
            failure overwrites the previous dump.

    Returns:
        ``driver.page_source`` after all requested sections are active and loaded.

    Raises:
        RuntimeError: If any step fails. The original exception is attached as
            ``__cause__`` so its traceback is preserved.
    """
    driver = make_driver(headless=True)
    try:
        driver.get(original_about_url)

        WebDriverWait(driver, 60).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # If the about UI never appears this capture is unusable; the wait raises
        # and the failure is reported to the caller (only one URL is tried per call).
        _wait_for_about_ui(driver, timeout=40)

        # required scroll before clicks
        _scroll_about_container_to_bottom(driver)

        for title in section_titles:
            _ensure_section_active_and_loaded(driver, title, timeout=30)
            time.sleep(per_section_sleep)

        return driver.page_source

    except Exception as e:
        # Best-effort debug dump: a failure here must not mask the real error.
        try:
            with open(f"{debug_prefix}_capture.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(f"{debug_prefix}_capture.png")
        except Exception:
            pass
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"All captures failed. Last error: {e}") from e
    finally:
        driver.quit()


In [4]:
# Preview the first three snapshot rows.
recorded_urls.head(3)

Unnamed: 0,url,timestamp,closest.status,closest.available,closest.url,closest.timestamp,if_format_url
0,mastodon.xyz,20220101000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738,http://web.archive.org/web/20221122080738if_/https://mastodon.xyz/about
1,mastodon.xyz,20230101000000,200,True,http://web.archive.org/web/20230101003413/https://mastodon.xyz/,20230101003413,http://web.archive.org/web/20230101003413if_/https://mastodon.xyz/about
2,mastodon.xyz,20230701000000,200,True,http://web.archive.org/web/20230701151143/https://mastodon.xyz/,20230701151143,http://web.archive.org/web/20230701151143if_/https://mastodon.xyz/about


In [None]:
import json

# `data_path` already ends with "/", so no extra separator is added (matches the
# other `data_path + <file name>` joins in this notebook).
out_path = data_path + "about_pages.jsonl"

# Append one JSON line per snapshot. Opened in append mode, so re-running this cell
# adds to an existing file instead of replacing it.
with open(out_path, "a", encoding="utf-8") as f:
    # `total` gives tqdm a real progress bar and ETA instead of a bare iteration counter.
    for row in tqdm(recorded_urls.itertuples(), total=len(recorded_urls)):
        original_url = row.url
        get_url = row.if_format_url
        baseline_timestamp = row.timestamp
        # Success and failure records share the same identifying keys, so "url"
        # always means the instance host and "get_url" the fetched Wayback URL.
        rec = {"url": original_url, "get_url": get_url, "baseline_timestamp": baseline_timestamp}
        try:
            rec["html"] = fetch_wayback_rendered_html(get_url)
        except Exception as e:
            # still write a row so you know which ones failed
            rec["error"] = str(e)
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        # Each page takes seconds to render; flush so an interrupted or killed run
        # keeps every record written so far.
        f.flush()


5it [00:39,  9.18s/it]