In [None]:
'''
author: Alexander Staub
Date: 2025-04-21
Description: Script for scraper of "officialcharts.com", UK charts 

'''

In [None]:
"""
Scrape Official Charts (UK) – Singles & Albums, 1980-2000
========================================================
Outputs a single CSV with the columns

    chart_type        "singles" | "albums"
    chart_week_start  YYYY-MM-DD (Sunday shown in the URL)
    position          1-100 (or 1-50 if the page shows only 50)*
    last_week         integer or NaN
    weeks_on_chart    integer
    song_title
    artist_name
    song_url          URL of the title link (for later label lookup)

The script is **checkpoint-aware**: if you stop it midway you can rerun it
and it will skip the weeks already in the CSV.

Requirement summary
-------------------
1) Two separate scrapes handled by the same script (`chart_type` param)  
2) Every available position each week (page displays up to 100)  
3) `song_url` retained instead of fetching the label now  
4) Produces **one big CSV** in `data/` (relative path)  
5) Works locally & remotely – all paths are relative  
6) Robust to interruptions, rate-limits, and missing weeks  
7) Random 0.5-2 s delay between *HTTP* requests

© 2025 – academic-use only.  ↝ MIT licence if you wish.
"""

In [1]:
# installing required packages
from __future__ import annotations

import csv
import random
import time
import re # for regex
from datetime import date, timedelta
from pathlib import Path
from typing import Iterable, List
import logging
from tqdm.notebook import tqdm # for progress bar

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [2]:
# ──────────────────────────────────────────────────────────────
# CONSTANTS & CONFIG
# ──────────────────────────────────────────────────────────────
BASE_URL = "https://www.officialcharts.com/charts"
CHART_IDS = {
    "singles": 7501,  # /singles-chart/{date}/7501/
    "albums": 7502,   # /albums-chart/{date}/7502/
}

# Go three levels up from the current working directory
base_dir = Path.cwd().parents[2]
CSV_PATH = base_dir / "data" / "raw_data" / "country_chart_data" / "uk_charts_1980_2000.csv"
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)


HEADERS = {
    "User-Agent": "chart-research/1.0 (+https://github.com/deskreject)",
    "Accept-Language": "en-GB,en;q=0.9",
}

START_DATE = date(1979, 12, 30)   # first Sunday in 1980
END_DATE   = date(1999, 12, 26) # last Sunday in 2000

REQUEST_DELAY_RANGE = (1, 3.0)  # polite crawling

# --- Add Logging Configuration ---
LOG_FILE_PATH = base_dir / "code" / "logs" / "uk_charts_scraping.log" # Define log file path
LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True) # Create logs directory

logging.basicConfig(
    level=logging.INFO, # Log INFO, WARNING, ERROR, CRITICAL levels
    format="%(asctime)s [%(levelname)s] %(message)s", # Include timestamp and level
    handlers=[
        logging.FileHandler(LOG_FILE_PATH, mode='a'), # Append logs to this file
        logging.StreamHandler() # Also print logs to the console
    ]
)

In [3]:
# ──────────────────────────────────────────────────────────────
# HTTP helpers
# ──────────────────────────────────────────────────────────────
session = requests.Session()
session.headers.update(HEADERS)


@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=20),
)
def fetch(url: str) -> str:
    """GET a URL with retries & exponential back‑off (tenacity)."""
    resp = session.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text

In [4]:
# ──────────────────────────────────────────────────────────────
# Core scraping logic
# ──────────────────────────────────────────────────────────────
def weekly_dates(start: date, end: date) -> Iterable[date]:
    """Yield Sunday dates inclusive."""
    d = start
    one_week = timedelta(days=7)
    while d <= end:
        yield d
        d += one_week


def _int_from_text(text: str) -> Optional[int]:
    """Return the first int found in *text* or None."""
    m = re.search(r"(\d+)", text)
    return int(m.group(1)) if m else None


def parse_chart(html: str) -> List[dict]:
    """Extract rows from a weekly chart page (robust across 1979‑1999)."""
    soup = BeautifulSoup(html, "lxml")
    rows: List[dict] = []

    for item in soup.select(".chart-item"):
        if "chart-ad" in item.get("class", []):  # skip adverts
            continue

        rank_tag   = item.select_one(".chart-key strong")
        title_tag  = item.select_one("a.chart-name")
        artist_tag = item.select_one("a.chart-artist")
        if not all((rank_tag, title_tag, artist_tag)):
            continue  # malformed card

        # ── core fields ────────────────────────────────────────
        position     = int(rank_tag.text.strip())
        song_title   = title_tag.get_text(strip=True)
        artist_name  = artist_tag.get_text(strip=True)
        song_url     = title_tag.get("href")

        # ── last‑week & weeks‑on chart (new vs legacy layouts) ─
        last_week: Optional[int] = None
        weeks_on:  Optional[int] = None

        # modern layout (1990s‑present) — inside <div class="stats">
        lw_span    = item.select_one(".stats .movement span span")
        weeks_span = (
            item.select_one(".stats .weeks span span") or
            item.select_one(".stats .weeks span")
        )
        if lw_span and lw_span.text.strip().isdigit():
            last_week = int(lw_span.text.strip())
        if weeks_span and weeks_span.text.strip().isdigit():
            weeks_on = int(weeks_span.text.strip())

        # legacy fallback (older cached pages) — single <div class="meta">
        if last_week is None or weeks_on is None:
            meta = item.select_one(".meta")
            if meta:
                meta_txt = meta.get_text(" ", strip=True)
                if last_week is None:
                    last_week = _int_from_text(meta_txt.split("LW")[-1])
                if weeks_on is None:
                    weeks_on = _int_from_text(meta_txt.split("Weeks")[-1])

        rows.append(
            {
                "position": position,
                "song_title": song_title,
                "artist_name": artist_name,
                "last_week": last_week,
                "weeks_on_chart": weeks_on,
                "song_url": song_url,
            }
        )
    return rows


def scrape_chart_for_week(chart_type: str, week: date) -> pd.DataFrame | None:
    chart_id = CHART_IDS[chart_type]
    url = f"{BASE_URL}/{chart_type}-chart/{week:%Y%m%d}/{chart_id}/"
    try:
        logging.debug(f"Requesting URL: {url}") # Optional: Log URL being fetched at DEBUG level
        html = fetch(url)
    except requests.HTTPError as exc:
        # print(f"[WARN] HTTP {exc.response.status_code} for {url} – skipping")
        logging.warning(f"HTTP {exc.response.status_code} for {url} – skipping week")
        return None
    except Exception as exc:
        # print(f"[ERROR] {exc} – skipping {url}")
        logging.error(f"Failed to fetch or parse {url} - skipping week", exc_info=True)
        return None

    rows = parse_chart(html)
    if not rows:
        # print(f"[WARN] No rows extracted from {url}")
        logging.warning(f"No chart rows extracted from {url} - skipping week")
        return None

    df = pd.DataFrame(rows)
    df.insert(0, "chart_week_start", pd.Timestamp(week))
    df.insert(0, "chart_type", chart_type)
    return df

In [5]:
# ──────────────────────────────────────────────────────────────
# Progress / resume helpers
# ──────────────────────────────────────────────────────────────

def already_scraped_weeks() -> set[tuple[str, str]]:
    """Read existing CSV (if any) and return {(chart_type, ISO week‑start)}."""
    if not CSV_PATH.exists():
        return set()
    df = pd.read_csv(CSV_PATH, usecols=["chart_type", "chart_week_start"])
    return {(row.chart_type, row.chart_week_start) for row in df.itertuples()}


def append_to_csv(df: pd.DataFrame) -> None:
    header = not CSV_PATH.exists()
    df.to_csv(CSV_PATH, mode="a", index=False, header=header, quoting=csv.QUOTE_MINIMAL)

In [8]:
def main(full_range: bool = True) -> None:
    """Scrape singles & albums for either the full range or a test slice.

    Args
    ----
    full_range : if False, use the TRIAL_START / TRIAL_END globals
    """
    logging.info("="*20 + " Scraping Script Started " + "="*20)
    done = already_scraped_weeks()
    logging.info(f"Existing CSV contains {len(done):,} week/chart combos.")

    # choose date span
    span_start = START_DATE if full_range else TRIAL_START
    span_end   = END_DATE   if full_range else TRIAL_END

    for chart_type in ("singles", "albums"):
        weeks = list(weekly_dates(span_start, span_end))
        weeks_to_process = [w for w in weeks if (chart_type, str(w)) not in done]
        total_to_process = len(weeks_to_process)

        logging.info(
            f"{chart_type.upper()}  |  {span_start} → {span_end}  "
            f"({total_to_process} weeks still to fetch)"
        )

        processed_count = 0
        # tqdm progress bar ✨
        for week in tqdm(
            weeks_to_process,
            desc=f"{chart_type.upper()} ({total_to_process})",
            unit="wk",
            leave=False,
        ):
            logging.debug("Attempting %s  %s", chart_type, week)

            df = scrape_chart_for_week(chart_type, week)
            if df is not None:
                append_to_csv(df)
                processed_count += 1
                tqdm.write(f"[{chart_type[0].upper()}] {week}  ✓ {len(df)} rows")
            time.sleep(random.uniform(*REQUEST_DELAY_RANGE))

        logging.info(
            f"Finished {chart_type.upper()} | new rows this run: {processed_count:,}"
        )

    logging.info(f"Done. Dataset saved → {CSV_PATH}")
    logging.info("="*20 + " Scraping Script Finished " + "="*20)

# ---------------------------------------------------
# Run cell
# ---------------------------------------------------
main(full_range=True)        # <- set to False to re‑run a 1‑month trial

2025-05-02 16:07:01,107 [INFO] Existing CSV contains 0 week/chart combos.
2025-05-02 16:07:01,109 [INFO] SINGLES  |  1979-12-30 → 1999-12-26  (1044 weeks still to fetch)


SINGLES (1044):   0%|          | 0/1044 [00:00<?, ?wk/s]

[S] 1979-12-30  ✓ 75 rows
[S] 1980-01-06  ✓ 75 rows
[S] 1980-01-13  ✓ 75 rows
[S] 1980-01-20  ✓ 75 rows
[S] 1980-01-27  ✓ 75 rows
[S] 1980-02-03  ✓ 75 rows
[S] 1980-02-10  ✓ 75 rows
[S] 1980-02-17  ✓ 75 rows
[S] 1980-02-24  ✓ 75 rows
[S] 1980-03-02  ✓ 75 rows
[S] 1980-03-09  ✓ 75 rows
[S] 1980-03-16  ✓ 75 rows
[S] 1980-03-23  ✓ 75 rows
[S] 1980-03-30  ✓ 75 rows
[S] 1980-04-06  ✓ 75 rows
[S] 1980-04-13  ✓ 75 rows
[S] 1980-04-20  ✓ 75 rows
[S] 1980-04-27  ✓ 75 rows
[S] 1980-05-04  ✓ 75 rows
[S] 1980-05-11  ✓ 75 rows
[S] 1980-05-18  ✓ 75 rows
[S] 1980-05-25  ✓ 75 rows
[S] 1980-06-01  ✓ 75 rows
[S] 1980-06-08  ✓ 75 rows
[S] 1980-06-15  ✓ 75 rows
[S] 1980-06-22  ✓ 75 rows
[S] 1980-06-29  ✓ 75 rows
[S] 1980-07-06  ✓ 75 rows
[S] 1980-07-13  ✓ 75 rows
[S] 1980-07-20  ✓ 75 rows
[S] 1980-07-27  ✓ 75 rows
[S] 1980-08-03  ✓ 75 rows
[S] 1980-08-10  ✓ 75 rows
[S] 1980-08-17  ✓ 75 rows
[S] 1980-08-24  ✓ 75 rows
[S] 1980-08-31  ✓ 75 rows
[S] 1980-09-07  ✓ 75 rows
[S] 1980-09-14  ✓ 75 rows
[S] 1980-09-

2025-05-02 17:22:36,788 [INFO] Finished SINGLES | new rows this run: 1,044
2025-05-02 17:22:36,790 [INFO] ALBUMS  |  1979-12-30 → 1999-12-26  (1044 weeks still to fetch)


ALBUMS (1044):   0%|          | 0/1044 [00:00<?, ?wk/s]

[A] 1979-12-30  ✓ 75 rows
[A] 1980-01-06  ✓ 75 rows
[A] 1980-01-13  ✓ 75 rows
[A] 1980-01-20  ✓ 75 rows
[A] 1980-01-27  ✓ 75 rows
[A] 1980-02-03  ✓ 75 rows
[A] 1980-02-10  ✓ 75 rows
[A] 1980-02-17  ✓ 75 rows
[A] 1980-02-24  ✓ 75 rows
[A] 1980-03-02  ✓ 75 rows
[A] 1980-03-09  ✓ 75 rows
[A] 1980-03-16  ✓ 75 rows


2025-05-02 17:25:18,534 [ERROR] Failed to fetch or parse https://www.officialcharts.com/charts/albums-chart/19800323/7502/ - skipping week
Traceback (most recent call last):
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\site-packages\urllib3\connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\socket.py", line 976, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno 11001]

[A] 1980-04-06  ✓ 75 rows
[A] 1980-04-13  ✓ 75 rows
[A] 1980-04-20  ✓ 75 rows
[A] 1980-04-27  ✓ 75 rows
[A] 1980-05-04  ✓ 75 rows
[A] 1980-05-11  ✓ 75 rows
[A] 1980-05-18  ✓ 75 rows
[A] 1980-05-25  ✓ 75 rows
[A] 1980-06-01  ✓ 75 rows
[A] 1980-06-08  ✓ 75 rows
[A] 1980-06-15  ✓ 75 rows
[A] 1980-06-22  ✓ 75 rows
[A] 1980-06-29  ✓ 75 rows
[A] 1980-07-06  ✓ 75 rows
[A] 1980-07-13  ✓ 75 rows
[A] 1980-07-20  ✓ 75 rows
[A] 1980-07-27  ✓ 75 rows
[A] 1980-08-03  ✓ 75 rows
[A] 1980-08-10  ✓ 75 rows
[A] 1980-08-17  ✓ 75 rows
[A] 1980-08-24  ✓ 75 rows
[A] 1980-08-31  ✓ 75 rows
[A] 1980-09-07  ✓ 75 rows
[A] 1980-09-14  ✓ 75 rows
[A] 1980-09-21  ✓ 75 rows
[A] 1980-09-28  ✓ 75 rows
[A] 1980-10-05  ✓ 75 rows
[A] 1980-10-12  ✓ 75 rows
[A] 1980-10-19  ✓ 75 rows
[A] 1980-10-26  ✓ 75 rows
[A] 1980-11-02  ✓ 75 rows
[A] 1980-11-09  ✓ 75 rows
[A] 1980-11-16  ✓ 75 rows
[A] 1980-11-23  ✓ 75 rows
[A] 1980-11-30  ✓ 75 rows
[A] 1980-12-07  ✓ 75 rows
[A] 1980-12-14  ✓ 75 rows
[A] 1980-12-21  ✓ 75 rows
[A] 1980-12-

2025-05-02 19:21:47,670 [INFO] Finished ALBUMS | new rows this run: 1,042
2025-05-02 19:21:47,671 [INFO] Done. Dataset saved → c:\Users\astaub1\R_work\Research\music_data\data\raw_data\country_chart_data\uk_charts_1980_2000.csv


# Running a 1 month trial

In [6]:
#-------------
# setup of trial
#--------------

# 1 month trial period for testing

TRIAL_START = date(1990, 5, 6)   # first chart in May-1990
TRIAL_END   = date(1990, 5, 27)  # last chart in May-1990
weeks       = list(weekly_dates(TRIAL_START, TRIAL_END))

In [7]:
# testing on singles charts

trial_csv = CSV_PATH.parent / "uk_charts_1_month_trial.csv"

header = not trial_csv.exists()

for week in tqdm(weeks, desc="SINGLES"):
    df = scrape_chart_for_week("singles", week)
    if df is not None:
        df.to_csv(trial_csv, mode="a", index=False, header=header)
        header = False

SINGLES:   0%|          | 0/4 [00:00<?, ?it/s]