In [1]:
import pandas as pd
import ast
import requests
from bs4 import BeautifulSoup
# from IPython.core.display import display, HTML
# Disable column-width truncation so long string cells (e.g. full snapshot URLs)
# are shown in their entirety when frames/series are displayed.
pd.set_option("display.max_colwidth", None)

In [2]:
# CSV of recorded snapshot lookups, addressed relative to this notebook's directory.
TARGET_PATH = "./../../data/archived_snapshots_urls.csv"

recorded_urls = pd.read_csv(TARGET_PATH)

# Discard rows whose snapshot payload is the literal string '{}' (nothing recorded),
# then renumber so the positional join further down lines up.
has_snapshot = recorded_urls["archived_snapshots"] != '{}'
recorded_urls = recorded_urls.loc[has_snapshot].reset_index(drop=True)

# The payload column holds Python-literal text; parse each cell back into an object.
recorded_urls["archived_snapshots"] = recorded_urls["archived_snapshots"].apply(ast.literal_eval)

# Flatten the parsed payloads into their own columns and swap them in for the raw column.
archived_snapshots_expanded = pd.json_normalize(recorded_urls["archived_snapshots"])
recorded_urls = recorded_urls.drop(columns=["archived_snapshots"]).join(archived_snapshots_expanded)

In [3]:
# Display the flattened frame to eyeball the result of the load/normalize step.
recorded_urls

Unnamed: 0,url,timestamp,closest.status,closest.available,closest.url,closest.timestamp
0,mastodon.xyz,20220101000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738
1,mastodon.xyz,20220401000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738
2,mastodon.xyz,20220701000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738
3,mastodon.xyz,20221001000000,200,True,http://web.archive.org/web/20221122080738/https://mastodon.xyz/,20221122080738
4,mastodon.xyz,20230101000000,200,True,http://web.archive.org/web/20230101003413/https://mastodon.xyz/,20230101003413
...,...,...,...,...,...,...
3238,libretooth.gr,20240101000000,200,True,http://web.archive.org/web/20231231121630/https://libretooth.gr/,20231231121630
3239,fandom.ink,20230101000000,200,True,http://web.archive.org/web/20230105152730/https://fandom.ink/,20230105152730
3240,fandom.ink,20241001000000,200,True,http://web.archive.org/web/20240922220024/https://fandom.ink/,20240922220024
3241,mastodon.triggerphra.se,20230701000000,200,True,http://web.archive.org/web/20230615102836/https://mastodon.triggerphra.se/,20230615102836


In [6]:
import time
import requests
from urllib.parse import quote

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait



def make_driver(headless: bool = True) -> webdriver.Chrome:
    """Build a Chrome WebDriver with the flags this notebook uses for scraping.

    Parameters
    ----------
    headless : bool
        When true, Chrome is started with ``--headless=new``.

    Returns
    -------
    webdriver.Chrome
        A started driver whose page-load timeout is set to 60 seconds.
    """
    chrome_flags = [
        "--window-size=1400,900",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        # trims some of the obvious "automation" fingerprints (not perfect)
        "--disable-blink-features=AutomationControlled",
        # ordinary desktop UA; some sites and IA interstitials behave better with one
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36",
    ]
    if headless:
        # the "new" headless mode is more compatible with modern sites
        chrome_flags.insert(0, "--headless=new")

    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ("useAutomationExtension", False),
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    for option_name, option_value in experimental:
        chrome_options.add_experimental_option(option_name, option_value)

    # Selenium 4.6+ can auto-manage drivers via Selenium Manager
    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(60)
    return browser


def _xpath_literal(text):
    """Return ``text`` encoded as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so a value containing both quote kinds
    has to be assembled with ``concat()``.
    """
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    pieces = (f"'{piece}'" for piece in text.split("'"))
    return "concat(" + ", \"'\", ".join(pieces) + ")"


def open_sections_and_get_html(driver, section_titles=("Server rules", "Moderated servers"), wait_after_click=1.0, timeout=30):
    """Click each named ``.about__section__title`` element and return the page HTML.

    Parameters
    ----------
    driver
        A Selenium WebDriver that already has the target page loaded.
    section_titles : iterable of str
        Visible title text of the sections to click, matched with XPath
        ``contains()`` against the title's whitespace-normalized text.
    wait_after_click : float
        Seconds to sleep after each click so the section content can load.
    timeout : float
        Maximum seconds each explicit wait may take (default 30, the value
        that used to be hard-coded).

    Returns
    -------
    str
        ``driver.page_source`` after all sections have been clicked.

    Raises
    ------
    Whatever ``WebDriverWait.until`` raises when no section title appears or a
    requested title never becomes clickable within ``timeout``.
    """
    wait = WebDriverWait(driver, timeout)

    # Wait until at least one about section title exists
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".about__section__title")))

    def title_el_for(text):
        # XPath: match a .about__section__title whose normalized visible text contains the target.
        # Works even with the <i> chevron inside. The title goes through _xpath_literal so an
        # apostrophe in it cannot terminate the string literal and break the expression.
        xpath = (
            "//div[contains(@class,'about__section__title') and contains(normalize-space(.),"
            f"{_xpath_literal(text)})]"
        )
        return wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

    for title in section_titles:
        el = title_el_for(title)

        # Scroll into view (helps in headless)
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
        time.sleep(0.1)

        # Deliberate best-effort: if the native click fails for any reason
        # (e.g. another element intercepts it), fall back to a JS click.
        try:
            el.click()
        except Exception:
            driver.execute_script("arguments[0].click();", el)

        # Wait a bit for content to load
        time.sleep(wait_after_click)

        # Optional: ensure the parent section is marked active. classList.add keeps
        # any other classes on the section instead of overwriting className wholesale.
        driver.execute_script("""
          const t = arguments[0];
          const section = t.closest('.about__section');
          if (section) section.classList.add('active');
        """, el)

    # Finally, return the HTML
    return driver.page_source


def fetch_wayback_rendered_html(original_url, extra_wait_sec = 2.0):
    """Render a page in a fresh headless Chrome, expand its About sections, return the HTML.

    Parameters
    ----------
    original_url : str
        URL passed to ``driver.get``. Despite the name, the notebook calls this
        with Wayback Machine snapshot URLs.
    extra_wait_sec : float
        Seconds to pause after ``document.readyState`` reports ``"complete"`` and
        before interacting with the page, giving client-side rendering time to
        settle. ``None`` or a non-positive value skips the pause. (This argument
        used to be accepted but silently ignored.)

    Returns
    -------
    str
        The page source returned by ``open_sections_and_get_html``.

    Notes
    -----
    A new driver is created per call and is always quit, even when loading or
    section expansion raises; the exception then propagates to the caller.
    """
    driver = make_driver(headless=True)
    try:
        driver.get(original_url)
        # Wait for DOM ready
        WebDriverWait(driver, 45).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        # Honour the caller-requested settle delay before touching the page.
        if extra_wait_sec and extra_wait_sec > 0:
            time.sleep(extra_wait_sec)
        return open_sections_and_get_html(driver, ("Server rules", "Moderated servers"), wait_after_click=1.0)
    finally:
        driver.quit()

In [8]:
# Preview the snapshot URL column; the next cell feeds its first entry to the renderer.
recorded_urls['closest.url']

0                  http://web.archive.org/web/20221122080738/https://mastodon.xyz/
1                  http://web.archive.org/web/20221122080738/https://mastodon.xyz/
2                  http://web.archive.org/web/20221122080738/https://mastodon.xyz/
3                  http://web.archive.org/web/20221122080738/https://mastodon.xyz/
4                  http://web.archive.org/web/20230101003413/https://mastodon.xyz/
                                           ...                                    
3238              http://web.archive.org/web/20231231121630/https://libretooth.gr/
3239                 http://web.archive.org/web/20230105152730/https://fandom.ink/
3240                 http://web.archive.org/web/20240922220024/https://fandom.ink/
3241    http://web.archive.org/web/20230615102836/https://mastodon.triggerphra.se/
3242    http://web.archive.org/web/20240329055752/https://mastodon.triggerphra.se/
Name: closest.url, Length: 3243, dtype: object

In [None]:
# Smoke test: render the first recorded snapshot and save the HTML for manual inspection.
original = recorded_urls['closest.url'].iloc[0]
html = fetch_wayback_rendered_html(original_url=original, extra_wait_sec=3.0)

# Report the URL that was fetched; the label previously printed the whole HTML body.
print("Wayback URL used:", original)

# Use a context manager so the file is flushed and closed deterministically.
with open("test.html", "w", encoding="utf-8") as fh:
    fh.write(html)