In [1]:
# infobox scraper

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re

# Site root of English Wikipedia; relative hrefs scraped from the category page
# are appended to it to form absolute article URLs.
WIKI_BASE = "https://en.wikipedia.org"
# Category listing page that get_event_links() downloads to discover event articles.
CATEGORY_URL = f"{WIKI_BASE}/wiki/Category:SummerSlam"

def get_event_links(timeout=30):
    """
    Collect the SummerSlam event pages linked from the category listing.

    Args:
        timeout: Seconds to wait for the category page. Without a timeout,
            requests blocks indefinitely on a stalled connection.

    Returns:
        A list of (title, absolute_url) tuples, de-duplicated and sorted by
        title (URL breaks ties, so the order is deterministic).

    Raises:
        requests.HTTPError: if the category page answers with an error status.
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    resp = requests.get(CATEGORY_URL, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    links = set()
    for a in soup.select('.mw-category a[href*="SummerSlam_"]'):
        title = a.get_text(strip=True)
        # Keep only titles that mention SummerSlam and carry a parenthesised
        # qualifier, e.g. "SummerSlam (1990)".
        if '(' in title and ')' in title and 'SummerSlam' in title:
            links.add((title, WIKI_BASE + a['href']))

    # Sorting the full tuple keeps the original by-title order while making
    # the position of equal titles independent of set iteration order.
    return sorted(links)

def get_infobox(soup):
    """
    Read the first infobox table of a parsed page into a flat dict.

    Every table row that has both a header cell and a data cell contributes
    one entry (header text -> data text). A label that occurs more than once
    keeps the value of its last row. Returns an empty dict when the page has
    no infobox table.
    """
    def _has_infobox_class(value):
        return value and 'infobox' in value

    table = soup.find('table', {'class': _has_infobox_class})
    fields = {}
    if not table:
        return fields

    for row in table.find_all('tr'):
        label_cell, value_cell = row.th, row.td
        if not (label_cell and value_cell):
            # Section headers / image rows have only one kind of cell.
            continue
        label = label_cell.get_text(" ", strip=True)
        fields[label] = value_cell.get_text(" ", strip=True)
    return fields

def merge_tagline(box):
    """
    Collapse every tagline-style key into a single "Tagline" entry.

    Keys are compared case-insensitively with non-breaking spaces removed;
    "Tagline", "Tagline(s)", "Tagline (s)" and similar all count. The value
    stored under the exact key "Tagline" wins when it is non-blank; otherwise
    the first non-blank value among the other variants is used. All variant
    keys are removed and "Tagline" is (re)inserted last, so the result always
    has a "Tagline" key (possibly the empty string).

    Args:
        box: Infobox dict with string keys and string values. Modified in place.

    Returns:
        The same dict object, for call chaining.
    """
    # One pattern is enough: the optional "(s)" suffix covers both the bare
    # and the pluralised label.
    tagline_pattern = re.compile(r'^tagline(\s*\(\s*s\s*\))?$')
    tagline_keys = [
        k for k in box
        if tagline_pattern.match(k.lower().replace('\xa0', ''))
    ]

    main_tagline = box.get("Tagline", "").strip()
    # First non-blank value among the non-canonical variants, in dict order.
    other_tagline = next(
        (box[k].strip() for k in tagline_keys
         if k != "Tagline" and box[k].strip()),
        "",
    )

    for k in tagline_keys:
        box.pop(k, None)
    box["Tagline"] = main_tagline or other_tagline
    return box

def relabel_keys(box):
    """
    Relabel "Brand(s)" / "Brands" variants to "brand" and "Buy rate"
    variants to "buy_rate"; every other key is copied through unchanged.

    Matching ignores case and all whitespace (regular, non-breaking or any
    other Unicode space), so "Brand (s)", "BRANDS", "Buy rate" and "buyrate"
    are all recognised. A bare singular "Brand" is not relabelled.

    Args:
        box: Infobox dict with string keys. Not modified.

    Returns:
        A new dict in the same key order. If several variants of one label
        are present, the value of the last one wins.
    """
    brand_labels = {'brand(s)', 'brands'}
    buy_rate_labels = {'buyrate'}

    new_box = {}
    for key, value in box.items():
        # str.split() with no argument splits on every Unicode whitespace
        # character, so joining the pieces removes all of them.
        norm = ''.join(key.lower().split())
        if norm in brand_labels:
            new_box["brand"] = value
        elif norm in buy_rate_labels:
            new_box["buy_rate"] = value
        else:
            new_box[key] = value  # keep original label
    # No clean-up pass is needed: a variant key is never copied into new_box
    # under its original name.
    return new_box

if __name__ == "__main__":
    # Driver: download every SummerSlam event page, normalise its infobox and
    # write one CSV row per event.
    event_links = get_event_links()
    all_infoboxes = []

    print(f"Found {len(event_links)} SummerSlam event pages...")

    for event_title, event_url in tqdm(event_links, desc="Processing events"):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            resp = requests.get(event_url, timeout=30)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            infobox = get_infobox(soup)
            infobox["Event"] = event_title
            infobox = merge_tagline(infobox)
            infobox = relabel_keys(infobox)
            all_infoboxes.append(infobox)
        except Exception as e:
            # Best effort: report the failing page and continue with the rest.
            print(f"Error processing {event_title}: {e}")

    # Union of all labels seen after tagline merging and key relabelling.
    # (merge_tagline has already removed every "Tagline(s)" variant.)
    all_fields = set()
    for box in all_infoboxes:
        all_fields.update(box)

    # Event and Tagline up front, then brand and buy_rate if present, then
    # the remaining labels alphabetically.
    field_order = ["Event", "Tagline"]
    field_order += [f for f in ("brand", "buy_rate") if f in all_fields]
    field_order += sorted(all_fields.difference(field_order))

    # Passing columns= fixes the column order and keeps the frame well-formed
    # (header only) when no page could be processed.
    df = pd.DataFrame(
        [{f: box.get(f, "") for f in field_order} for box in all_infoboxes],
        columns=field_order,
    ).sort_values("Event")
    df.to_csv("summerslam_infobox_master.csv", index=False)

    print("DONE! See file: summerslam_infobox_master.csv")


Found 39 SummerSlam event pages...


Processing events: 100%|███████████████████████████████████████████████████████████████| 39/39 [00:10<00:00,  3.55it/s]

DONE! See file: summerslam_infobox_master.csv





In [12]:
# results scraper

import requests
from bs4 import BeautifulSoup, Tag
import pandas as pd
from tqdm import tqdm
import re

# Site root of English Wikipedia; relative hrefs scraped from the category page
# are appended to it to form absolute article URLs.
WIKI_BASE = "https://en.wikipedia.org"
# Category listing page that get_event_links() downloads to discover event articles.
CATEGORY_URL = f"{WIKI_BASE}/wiki/Category:SummerSlam"
# Columns populated for every match row; canonical_name() maps table headers
# onto exactly these names.
DESIRED_COLS = ["No.", "Results", "Stipulations", "Times"]

def clean_header(header):
    """
    Normalise a table header for comparison.

    Non-breaking spaces become regular spaces, bracketed footnote markers
    such as "[1]" are removed, surrounding whitespace is trimmed, inner
    whitespace runs collapse to one space, and the result is lower-cased.
    """
    text = header.replace('\xa0', ' ')
    without_refs = re.sub(r"\[.*?\]", "", text)
    trimmed = without_refs.strip()
    single_spaced = re.sub(r"\s+", " ", trimmed)
    return single_spaced.lower()

def canonical_name(header):
    """
    Map a raw table header onto one of the output column names.

    Returns "No.", "Results", "Stipulations" or "Times" when the cleaned
    header matches, and None for any other header. Checks run in that order,
    so the first matching rule wins.
    """
    cleaned = clean_header(header)
    if cleaned == "no." or cleaned == "no":
        return "No."
    if "result" in cleaned:
        return "Results"
    for prefix, label in (("stipul", "Stipulations"), ("time", "Times")):
        if cleaned.startswith(prefix):
            return label
    return None

def get_event_links(timeout=30):
    """
    Collect the SummerSlam event pages linked from the category listing,
    ordered chronologically.

    Args:
        timeout: Seconds to wait for the category page. Without a timeout,
            requests blocks indefinitely on a stalled connection.

    Returns:
        A list of (title, absolute_url) tuples, de-duplicated and sorted by
        the first four-digit number found in the title (0 when there is
        none). Title and URL break ties, so the order is deterministic.

    Raises:
        requests.HTTPError: if the category page answers with an error status.
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    resp = requests.get(CATEGORY_URL, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    links = set()
    for a in soup.select('.mw-category a[href*="SummerSlam_"]'):
        title = a.get_text(strip=True)
        # Keep only titles that mention SummerSlam and carry a parenthesised
        # qualifier, e.g. "SummerSlam (1990)".
        if '(' in title and ')' in title and 'SummerSlam' in title:
            links.add((title, WIKI_BASE + a['href']))

    def year_key(link):
        m = re.search(r"(\d{4})", link[0])
        year = int(m.group(1)) if m else 0
        # Appending the tuple itself orders same-year entries by title/URL
        # instead of by set iteration order.
        return (year,) + link

    return sorted(links, key=year_key)

def find_results_table_1990(soup):
    """
    Return the first 'wikitable' that closely follows the 'Results' heading.

    The heading is the first h2/h3 whose text contains "Results". From there
    at most 15 following elements are inspected; the first <table> carrying
    the "wikitable" class is returned. Returns None when there is no such
    heading, the document ends, or no table turns up within that window.
    """
    def _is_results_heading(tag):
        return tag.name in ['h2', 'h3'] and 'Results' in tag.get_text()

    heading = soup.find(_is_results_heading)
    if not heading:
        return None

    cursor = heading
    remaining_steps = 15
    while remaining_steps > 0:
        remaining_steps -= 1
        cursor = cursor.find_next()
        if not cursor:
            return None
        if not isinstance(cursor, Tag):
            continue
        css_classes = cursor.get('class') or []
        if cursor.name == "table" and "wikitable" in css_classes:
            return cursor
    return None

def find_results_table(soup, year):
    """
    Locate the match-results table of an event page.

    For year "1990" the heading-based finder is tried first. For every other
    year, or when that finder comes back empty, the first 'wikitable' whose
    header cells include both a "No." and a "Results" column is returned.
    Returns None when no table qualifies.
    """
    if year == "1990":
        by_heading = find_results_table_1990(soup)
        if by_heading:
            return by_heading

    def _has_results_columns(candidate):
        names = {
            canonical_name(th.get_text(" ", strip=True))
            for th in candidate.find_all("th")
        }
        return "No." in names and "Results" in names

    wikitables = soup.find_all("table", class_="wikitable")
    return next((t for t in wikitables if _has_results_columns(t)), None)

def table_to_rows(table):
    """
    Convert a results table into a list of row dicts keyed by DESIRED_COLS.

    The first <tr> supplies the headers; each later row's cells are matched
    to headers by position. Columns whose header is not recognised are
    ignored, and rows with no text are skipped. A row is kept only when its
    "No." value consists solely of digits.
    """
    all_trs = table.find_all('tr')
    if not all_trs:
        return []

    header_tr, body_trs = all_trs[0], all_trs[1:]
    column_names = [
        canonical_name(cell.get_text(' ', strip=True))
        for cell in header_tr.find_all(['th', 'td'])
    ]

    records = []
    for body_tr in body_trs:
        cells = body_tr.find_all(['td', 'th'])
        if not any(cell.get_text(strip=True) for cell in cells):
            continue

        record = dict.fromkeys(DESIRED_COLS, "")
        # zip() stops at the shorter sequence, so surplus cells are ignored.
        for column, cell in zip(column_names, cells):
            if column:
                record[column] = cell.get_text(" ", strip=True)

        # NOTE(review): a value like "1 P" (digit plus marker) fails isdigit()
        # and the row is dropped — confirm this is intended.
        number = record["No."].replace('\xa0', '').strip()
        if not number.isdigit():
            continue
        record["No."] = number
        records.append(record)
    return records

def extract_results_from_list(soup):
    """
    Fallback for pages whose results are a bullet list rather than a table.

    Finds the first h2/h3 whose text contains "Results", then the first
    following sibling <ul> that has at least one <li>. Each <li> becomes a
    row numbered from 1, with its text under "Results" and empty
    "Stipulations" / "Times". Returns [] when the heading or list is missing.
    """
    def _is_results_heading(tag):
        return tag.name in ['h2', 'h3'] and 'Results' in tag.get_text()

    def _is_populated_list(tag):
        return tag.name == "ul" and tag.find("li")

    heading = soup.find(_is_results_heading)
    if not heading:
        return []
    bullet_list = heading.find_next_sibling(_is_populated_list)
    if not bullet_list:
        return []

    return [
        {
            "No.": str(position),
            "Results": item.get_text(" ", strip=True),
            "Stipulations": "",
            "Times": "",
        }
        for position, item in enumerate(bullet_list.find_all("li"), 1)
    ]

def get_event_name(soup, default_name):
    """
    Return the event name from the page's infobox caption.

    Falls back to default_name when the page has no infobox table, the
    infobox has no <caption>, or the caption text is empty.
    """
    infobox = soup.find("table", class_="infobox")
    caption = infobox.find("caption") if infobox else None
    caption_text = caption.get_text(strip=True) if caption else ""
    return caption_text or default_name

if __name__ == "__main__":
    # Driver: download every SummerSlam event page, extract its match results
    # and write one CSV row per match.
    event_links = get_event_links()
    all_rows = []
    print(f"Found {len(event_links)} SummerSlam event pages...")

    for event_title, event_url in tqdm(event_links, desc="Processing events"):
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            resp = requests.get(event_url, timeout=30)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            event_name = get_event_name(soup, event_title)
            m = re.search(r"(\d{4})", event_name)
            event_year = m.group(1) if m else None

            # Prefer a results table; fall back to a bullet list under the
            # Results heading when no table is found.
            table = find_results_table(soup, event_year)
            if table:
                rows = table_to_rows(table)
            else:
                rows = extract_results_from_list(soup)
            for row in rows:
                row["Event"] = event_name
            all_rows.extend(rows)
        except Exception as e:
            # Best effort: report the failing page and continue with the rest.
            print(f"Error processing {event_title}: {e}")

    out_order = ["Event"] + DESIRED_COLS
    # columns= fixes the column order and still yields a header-only frame
    # when no rows were collected.
    df = pd.DataFrame(all_rows, columns=out_order)
    df.to_csv("summerslam_results_master.csv", index=False)
    print(f"Wrote {len(df)} matches from {df['Event'].nunique()} events.")


Found 39 SummerSlam event pages...


Processing events: 100%|███████████████████████████████████████████████████████████████| 39/39 [00:14<00:00,  2.74it/s]

Wrote 307 matches from 37 events.



