In [None]:
'''
author: Alexander Staub
Date: 2025-04-21
Description: Scraper for the UK charts published on "officialcharts.com"

'''

In [None]:
"""
Scrape Official Charts (UK) – Singles & Albums, 1980-2000
========================================================
Outputs a single CSV with the columns

    chart_type        "singles" | "albums"
    chart_week_start  YYYY-MM-DD (Sunday shown in the URL)
    position          1-100 (or 1-50 if the page shows only 50)*
    last_week         integer or NaN
    weeks_on_chart    integer
    song_title
    artist_name
    song_url          URL of the title link (for later label lookup)

The script is **checkpoint-aware**: if you stop it midway you can rerun it
and it will skip the weeks already in the CSV.

Requirement summary
-------------------
1) Two separate scrapes handled by the same script (`chart_type` param)  
2) Every available position each week (page displays up to 100)  
3) `song_url` retained instead of fetching the label now  
4) Produces **one big CSV** in `data/` (relative path)  
5) Works locally & remotely – all paths are relative  
6) Robust to interruptions, rate-limits, and missing weeks  
7) Random 1-3 s delay between *HTTP* requests

© 2025 – academic-use only.  ↝ MIT licence if you wish.
"""

In [2]:
# installing required packages
from __future__ import annotations

import csv
import random
import time
from datetime import date, timedelta
from pathlib import Path
from typing import Iterable, List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_random_exponential,
)

In [3]:
# ──────────────────────────────────────────────────────────────
# CONSTANTS & CONFIG
# ──────────────────────────────────────────────────────────────
# Root of every chart URL; the chart slug, date and numeric id are appended per request.
BASE_URL = "https://www.officialcharts.com/charts"
# Numeric chart id used as the last URL path segment for each chart type.
CHART_IDS = {
    "singles": 7501,  # /singles-chart/{date}/7501/
    "albums": 7502,   # /albums-chart/{date}/7502/
}

# Go three levels up from the current working directory
# (parents[2] is the great-grandparent; IndexError if the cwd is shallower than that).
base_dir = Path.cwd().parents[2]
CSV_PATH = base_dir / "data" / "raw_data" / "country_chart_data" / "uk_charts_1980_2000.csv"
# Create the output directory at import time so later appends find it in place.
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)


# Default headers sent with every request made through the shared session.
HEADERS = {
    "User-Agent": "chart-research/1.0 (+https://github.com/YOUR_GH_HANDLE)",
    "Accept-Language": "en-GB,en;q=0.9",
}

START_DATE = date(1979, 12, 30)   # Sunday of the week that contains 1 Jan 1980
# NOTE(review): 1999-12-26 is the last Sunday of 1999, so no week of 2000 is
# scraped; if the year 2000 is meant to be included, the end date must change.
END_DATE   = date(1999, 12, 26) # last Sunday of 1999 (inclusive upper bound)

REQUEST_DELAY_RANGE = (1, 3.0)  # (min, max) seconds of random sleep – polite crawling

In [4]:
# ──────────────────────────────────────────────────────────────
# HTTP helpers
# ──────────────────────────────────────────────────────────────
# Module-level session used by fetch(); HEADERS are installed once here so
# every request sent through it carries them.
session = requests.Session()
session.headers.update(HEADERS)


def _is_transient_error(exc: BaseException) -> bool:
    """Return True when *exc* is a failure that a retry could plausibly fix.

    Retried: connection errors, timeouts, truncated responses, and HTTP
    408 / 429 / 5xx. Any other HTTP status (e.g. 404 for a week that has
    no chart page) is permanent, so retrying would only waste time.
    """
    if isinstance(exc, requests.HTTPError):
        response = exc.response
        if response is None:
            return False
        status = response.status_code
        return status in (408, 429) or status >= 500
    return isinstance(
        exc,
        (
            requests.ConnectionError,
            requests.Timeout,
            requests.exceptions.ChunkedEncodingError,
        ),
    )


@retry(
    reraise=True,
    retry=retry_if_exception(_is_transient_error),
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=20),
)
def fetch(url: str) -> str:
    """GET a URL and return the response body as text.

    Transient failures (see ``_is_transient_error``) are retried up to five
    attempts with randomised exponential back-off capped at 20 s. Permanent
    failures are raised immediately.

    Args:
        url: Absolute URL to download.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: The final response had a 4xx/5xx status.
        requests.RequestException: Any other request failure, re-raised
            unchanged (``reraise=True``) once retries are exhausted or when
            the failure is not retryable.
    """
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return response.text


In [None]:
# ──────────────────────────────────────────────────────────────
# Core scraping logic
# ──────────────────────────────────────────────────────────────
def weekly_dates(start: date, end: date) -> Iterable[date]:
    """Generate dates spaced seven days apart, from *start* up to and including *end*.

    The first value is *start* itself; nothing is produced when *end* lies
    before *start*. The sequence only lands on Sundays if *start* is one.
    """
    # Whole weeks that fit between the bounds (negative when end < start,
    # which makes the range below empty).
    full_weeks = (end - start).days // 7
    for offset in range(full_weeks + 1):
        yield start + timedelta(weeks=offset)


def _parse_meta(meta_text: str) -> dict[str, str]:
    """Split a meta line such as "LW: 3 | Peak: 1 | Weeks: 7" into {key: value}.

    Keys are lower-cased; keys and values are stripped of whitespace. Both
    "|" and "," are accepted as separators, and chunks without a ":" are
    ignored instead of raising.
    """
    fields: dict[str, str] = {}
    for chunk in meta_text.lower().replace("|", ",").split(","):
        key, colon, value = chunk.partition(":")
        if colon:
            fields[key.strip()] = value.strip()
    return fields


def _to_int(text: str | None) -> int | None:
    """Return *text* as an int when it consists only of decimal digits, else None."""
    return int(text) if text and text.isdecimal() else None


def parse_chart(html: str) -> List[dict]:
    """Extract one dict per chart entry from a weekly chart page.

    Args:
        html: Full HTML of the chart page.

    Returns:
        A list of dicts with the keys ``position``, ``song_title``,
        ``artist_name``, ``last_week``, ``weeks_on_chart`` and ``song_url``.
        ``last_week`` and ``weeks_on_chart`` are None when the value is
        missing or not numeric (e.g. a new entry); ``song_url`` is None when
        the title has no link. Entries lacking any of the expected tags, or
        whose position is not a number, are skipped. The list is empty when
        no entry could be parsed.
    """
    soup = BeautifulSoup(html, "lxml")

    # list view is the default – every entry lives inside <div class="chart-positions">
    # but some historical pages use <div class="chart-positions-list">.
    # NOTE(review): selectors and class names are taken as given – confirm
    # against the live markup.
    entries = soup.select("div.chart-positions > div") or soup.select(
        "div.chart-positions-list > div"
    )

    rows = []
    for entry in entries:
        pos_tag = entry.select_one(".position")
        title_tag = entry.select_one(".title")
        artist_tag = entry.select_one(".artist")
        meta_tag = entry.select_one(".meta")  # contains LW & Weeks

        if not all((pos_tag, title_tag, artist_tag, meta_tag)):
            # malformed entry – skip quietly
            continue

        position = _to_int(pos_tag.get_text(strip=True))
        if position is None:
            # A non-numeric position is treated like any other malformed
            # entry rather than aborting the whole scrape with ValueError.
            continue

        meta = _parse_meta(meta_tag.get_text(strip=True))
        link = title_tag.find("a")

        rows.append(
            {
                "position": position,
                "song_title": title_tag.get_text(strip=True),
                "artist_name": artist_tag.get_text(strip=True),
                "last_week": _to_int(meta.get("lw")),
                "weeks_on_chart": _to_int(meta.get("weeks")),
                # .get() so an anchor without an href yields None, not KeyError
                "song_url": link.get("href") if link else None,
            }
        )
    return rows


def scrape_chart_for_week(chart_type: str, week: date) -> pd.DataFrame | None:
    """Download and parse a single weekly chart.

    Args:
        chart_type: Key into ``CHART_IDS`` ("singles" or "albums").
        week: Chart date placed in the URL as YYYYMMDD.

    Returns:
        A DataFrame whose first two columns are ``chart_type`` and
        ``chart_week_start`` followed by the parsed entry columns, or None
        when the download failed or the page yielded no rows (a message is
        printed in either case).
    """
    chart_id = CHART_IDS[chart_type]
    url = "/".join([BASE_URL, f"{chart_type}-chart", f"{week:%Y%m%d}", str(chart_id)]) + "/"

    try:
        html = fetch(url)
    except Exception as exc:
        # HTTP status failures get a warning with the code; anything else is
        # reported as an error. Both mean "skip this week".
        if isinstance(exc, requests.HTTPError):
            print(f"[WARN] HTTP error {exc.response.status_code} for {url} – skipping")
        else:
            print(f"[ERROR] {exc} – skipping {url}")
        return None

    parsed = parse_chart(html)
    if not parsed:  # page present but empty (rare)
        print(f"[WARN] No rows extracted from {url}")
        return None

    frame = pd.DataFrame(parsed)
    # Prepend the two identifying columns in their final left-to-right order.
    frame.insert(0, "chart_type", chart_type)
    frame.insert(1, "chart_week_start", pd.Timestamp(week))
    return frame

In [None]:
# ──────────────────────────────────────────────────────────────
# Progress / resume helpers
# ──────────────────────────────────────────────────────────────
def already_scraped_weeks() -> set[tuple[str, str]]:
    """Return the (chart_type, chart_week_start) pairs already present in the CSV.

    Only the two key columns are read, both as strings, so the pairs can be
    compared directly with ``(chart_type, str(week))``.

    Returns:
        A set of ``(chart_type, chart_week_start)`` string pairs; empty when
        the CSV does not exist yet or contains no data at all.
    """
    if not CSV_PATH.exists():
        return set()
    try:
        df = pd.read_csv(
            CSV_PATH,
            usecols=["chart_type", "chart_week_start"],
            dtype=str,
        )
    except pd.errors.EmptyDataError:
        # File exists but is empty (e.g. created by hand or by an aborted
        # run): nothing has been scraped yet.
        return set()
    # Each week contributes many rows; collapse them before building tuples.
    pairs = df.drop_duplicates()
    return set(zip(pairs["chart_type"], pairs["chart_week_start"]))


def append_to_csv(df: pd.DataFrame) -> None:
    """Append *df* to the output CSV, writing the header only for a fresh file.

    A file that exists but is still empty counts as fresh, so it receives a
    header too; without one the CSV could not be read back by column name.

    Args:
        df: Rows to append; written without the index.
    """
    needs_header = not CSV_PATH.exists() or CSV_PATH.stat().st_size == 0
    df.to_csv(
        CSV_PATH,
        mode="a",
        index=False,
        header=needs_header,
        quoting=csv.QUOTE_MINIMAL,
    )

In [None]:
# ──────────────────────────────────────────────────────────────
# Main driver
# ──────────────────────────────────────────────────────────────
def main() -> None:
    """Scrape every week between START_DATE and END_DATE for both chart types.

    Weeks whose (chart_type, week) pair is already in the CSV are skipped
    without a request or a delay. Every attempted week is followed by a
    random sleep drawn from REQUEST_DELAY_RANGE, whether or not it yielded
    rows. Weeks that fail are not recorded, so a rerun retries them.
    """
    done = already_scraped_weeks()
    # Materialise once: the same list drives the progress total and both passes.
    weeks = list(weekly_dates(START_DATE, END_DATE))
    total_weeks = len(weeks)
    for chart_type in ("singles", "albums"):
        print(f"\n→  Starting chart: {chart_type.upper()}")
        for i, week in enumerate(weeks, 1):
            key = (chart_type, str(week))
            if key in done:
                continue

            print(
                f"[{chart_type[:1].upper()}] "
                f"{week}  ({i}/{total_weeks}) … ",
                end="",
                flush=True,
            )
            df = scrape_chart_for_week(chart_type, week)
            if df is not None:
                append_to_csv(df)
                print(f"✓  {len(df)} rows")
            time.sleep(random.uniform(*REQUEST_DELAY_RANGE))

    # relative_to() raises ValueError when the CSV is not located under the
    # current working directory; fall back to the full path in that case
    # instead of crashing after the scrape has finished.
    try:
        shown_path = CSV_PATH.relative_to(Path.cwd())
    except ValueError:
        shown_path = CSV_PATH
    print("\nAll done – dataset saved to", shown_path)


if __name__ == "__main__":
    main()