In [None]:
'''
author: Alexander Staub
Date: 2025-05-08
Description: Script for scraper of "offiziellecharts.de", DE charts 

'''

In [None]:
"""
Scrape Official Charts (DE) – Singles & Albums, 1980-2000
========================================================
Outputs a single CSV with the columns

    chart_type        "singles" | "albums"
    chart_week_start  YYYY-MM-DD (Sunday shown in the URL)
    position          1-100 (or 1-50 if the page shows only 50, 1-75 if page only shows 75)*
    last_week         integer or NaN
    weeks_on_chart    integer
    song_title
    artist_name
    record_label      

The script is **checkpoint-aware**: if you stop it midway you can rerun it
and it will skip the weeks already in the CSV.

Requirement summary
-------------------
1) Two separate scrapes handled by the same script (`chart_type` param)  
2) Every available position each week (page displays up to 100)  
4) Produces **one big CSV** in `data/` (relative path)  
5) Works locally & remotely – all paths are relative  
6) Robust to interruptions, rate-limits, and missing weeks  
7) Random 1-3 s delay between *HTTP* requests

© 2025 – academic-use only.  ↝ MIT licence if you wish.
"""

In [4]:
# installing required packages
from __future__ import annotations

import csv
import random
import time
import re # for regex
from datetime import date, timedelta
from pathlib import Path
from typing import Iterable, List
import logging
from tqdm.notebook import tqdm # for progress bar

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [26]:
# ──────────────────────────────────────────────────────────────
# CONSTANTS & CONFIG
# ──────────────────────────────────────────────────────────────
BASE_URLS = {                                                               # ###  DE CHANGE  ###
    "singles": "https://www.offiziellecharts.de/charts/single/for-date-{epoch}",  # ###  DE CHANGE  ###
    "albums":  "https://www.offiziellecharts.de/charts/album/for-date-{epoch}",   # ###  DE CHANGE  ###
}

# Go three levels up from the current working directory (same as notebook)
BASE_DIR  = Path.cwd().resolve().parents[2]
CSV_PATH  = BASE_DIR / "data" / "raw_data" / "country_chart_data" / "de_charts_1980_2000.csv"
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)

# Full scrape: Monday 31 Dec 1979 → Monday 03 Jan 2000
START_DATE = date(1979, 12, 31)
END_DATE   = date(2000, 1, 3)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/124.0 Safari/537.36"
}

REQUEST_DELAY_RANGE = (1, 3.0)  # polite crawling

# --- Logging Configuration (unchanged) ---
LOG_FILE_PATH = BASE_DIR / "code" / "logs" / "de_charts_scraping.log"
LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler(LOG_FILE_PATH, mode="a"),
              logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

In [27]:
# ──────────────────────────────────────────────────────────────
# HTTP helpers
# ──────────────────────────────────────────────────────────────
session = requests.Session()
session.headers.update(HEADERS)


@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=20),
)
def fetch(url: str) -> str:
    """GET a URL with retries & exponential back‑off (tenacity)."""
    resp = session.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text

In [28]:
# ──────────────────────────────────────────────────────────────
# Core scraping logic
# ──────────────────────────────────────────────────────────────
def weekly_dates(start: date, end: date) -> Iterable[date]:
    """Yield Sunday dates inclusive."""
    d = start
    one_week = timedelta(days=7)
    while d <= end:
        yield d
        d += one_week

def build_chart_url(chart_type: str, week_start: date) -> str:              # ###  DE CHANGE  ###
    """Return the OffizielleCharts URL for *chart_type* at *week_start*."""  # ###  DE CHANGE  ###
    epoch_ms = int(time.mktime(week_start.timetuple())) * 1000              # ###  DE CHANGE  ###
    return BASE_URLS[chart_type].format(epoch=epoch_ms)                     # ###  DE CHANGE  ###


def _int_from_text(text: str) -> Optional[int]:
    """Return the first int found in *text* or None."""
    m = re.search(r"(\d+)", text)
    return int(m.group(1)) if m else None


def _extract_weeks_on_chart(wrap) -> Optional[int]:                         # ###  DE CHANGE  ###
    tag = next((s for s in wrap.select("span.plus-data")                    # ###  DE CHANGE  ###
                if "In Charts" in s.text), None)                            # ###  DE CHANGE  ###
    if not tag:                                                             # ###  DE CHANGE  ###
        return None                                                         # ###  DE CHANGE  ###
    match = re.search(r"In Charts:\s*(\d+)", tag.text)                    # ###  DE CHANGE  ###
    return int(match.group(1)) if match else None                           # ###  DE CHANGE  ###

def parse_chart_page(html: str, chart_type: str) -> List[dict]:             # ###  DE CHANGE  ###
    """Extract all rows from a weekly DE chart page."""                      # ###  DE CHANGE  ###
    soup = BeautifulSoup(html, "lxml")                                      # ###  DE CHANGE  ###
    rows: List[dict] = []                                                   # ###  DE CHANGE  ###

    # chart start date (dd.mm.yyyy) is inside span.ch-header strong         # ###  DE CHANGE  ###
    header = soup.select_one("span.ch-header strong")                       # ###  DE CHANGE  ###
    if not header:                                                          # ###  DE CHANGE  ###
        logger.warning("Could not locate week header")                      # ###  DE CHANGE  ###
        return rows                                                         # ###  DE CHANGE  ###
    day, month, year = map(int, header.text.strip().split("."))             # ###  DE CHANGE  ###
    week_start_iso = date(year, month, day).isoformat()                     # ###  DE CHANGE  ###

    for row in soup.select("table.chart-table tr.drill-down-link"):         # ###  DE CHANGE  ###
        # Position & movement                                              # ###  DE CHANGE  ###
        pos = int(row.select_one("td.ch-pos span.this-week").text.strip())  # ###  DE CHANGE  ###
        lw_tag = row.select_one("td.ch-trend span.last-week")               # ###  DE CHANGE  ###
        last_week = int(lw_tag.text.strip()) if lw_tag and lw_tag.text.strip().isdigit() else None  # ###  DE CHANGE  ###

        wrap = row.select_one("td.ch-info div.wrap")                        # ###  DE CHANGE  ###
        entry = {
            "chart_type": chart_type,
            "chart_week_start": week_start_iso,
            "position": pos,
            "last_week": last_week,
            "weeks_on_chart": _extract_weeks_on_chart(wrap),
            "song_title": wrap.select_one("span.info-title").text.strip(),
            "artist_name": wrap.select_one("span.info-artist").text.strip(),
            "record_label": wrap.select_one("span.info-label").text.strip(),
        }
        rows.append(entry)
    return rows

#
def get_soup(url: str) -> str:
    """Fetch a URL and return the raw HTML string."""
    return fetch(url)        # ← fetch() is your retry‑capable helper

def scrape_chart_for_week(chart_type: str, week: date) -> Optional[pd.DataFrame]:
    """
    Single‑week wrapper that:
      • builds the German URL
      • downloads & parses HTML
      • returns a tidy DataFrame with the 8 required columns
    """
    url = build_chart_url(chart_type, week)          # from §3
    soup = get_soup(url)                             # <- STILL your helper
    rows = parse_chart_page(soup, chart_type)        # from §3
    if not rows:
        logger.warning("No rows parsed for %s  %s", chart_type, week)
        return None
    return pd.DataFrame(rows)

In [29]:
# ──────────────────────────────────────────────────────────────
# Progress / resume helpers
# ──────────────────────────────────────────────────────────────

def already_scraped_weeks() -> set[tuple[str, str]]:
    """Read existing CSV (if any) and return {(chart_type, ISO week‑start)}."""
    if not CSV_PATH.exists():
        return set()
    df = pd.read_csv(CSV_PATH, usecols=["chart_type", "chart_week_start"])
    return {(row.chart_type, row.chart_week_start) for row in df.itertuples()}


def append_to_csv(df: pd.DataFrame) -> None:
    header = not CSV_PATH.exists()
    df.to_csv(CSV_PATH, mode="a", index=False, header=header, quoting=csv.QUOTE_MINIMAL)

In [30]:
def main(full_range: bool = True) -> None:
    """Scrape singles & albums for either the full range or a test slice."""
    logger.info("="*20 + " Scraping Script Started " + "="*20)

    done = already_scraped_weeks()          # <-- unchanged
    logger.info("Existing CSV contains %,d week/chart combos.", len(done))

    # choose date span
    span_start = START_DATE if full_range else TRIAL_START
    span_end   = END_DATE   if full_range else TRIAL_END

    for chart_type in ("singles", "albums"):
        weeks = list(weekly_dates(span_start, span_end))
        weeks_to_process = [w for w in weeks if (chart_type, str(w)) not in done]
        total = len(weeks_to_process)

        logger.info("%s  |  %s → %s  (%d weeks still to fetch)",
                    chart_type.upper(), span_start, span_end, total)

        processed = 0
        for week in tqdm(weeks_to_process,
                         desc=f"{chart_type.upper()} ({total})",
                         unit="wk", leave=False):

            logger.debug("Attempting %s  %s", chart_type, week)
            try:
                df = scrape_chart_for_week(chart_type, week)
            except Exception:
                logger.exception("Fatal error for %s %s – skipping", chart_type, week)
                continue

            if df is not None:
                append_to_csv(df)           # <-- unchanged helper
                processed += 1
                tqdm.write(f"[{chart_type[0].upper()}] {week}  ✓ {len(df)} rows")

            time.sleep(random.uniform(*REQUEST_DELAY_RANGE))

        logger.info("Finished %s | new rows this run: %,d",
                    chart_type.upper(), processed)

    logger.info("Done. Dataset saved → %s", CSV_PATH)
    logger.info("="*20 + " Scraping Script Finished " + "="*20)

# ---------------------------------------------------
# Run cell
# ---------------------------------------------------
main(full_range=True)        # <- set to False to re‑run a 1‑month trial

--- Logging error ---
Traceback (most recent call last):
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 999, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 703, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 392, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
ValueError: unsupported format character ',' (0x2c) at index 23
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Us

SINGLES (1045):   0%|          | 0/1045 [00:00<?, ?wk/s]

[S] 1979-12-31  ✓ 50 rows
[S] 1980-01-07  ✓ 75 rows
[S] 1980-01-14  ✓ 75 rows
[S] 1980-01-21  ✓ 75 rows
[S] 1980-01-28  ✓ 75 rows
[S] 1980-02-04  ✓ 75 rows
[S] 1980-02-11  ✓ 75 rows
[S] 1980-02-18  ✓ 75 rows
[S] 1980-02-25  ✓ 75 rows
[S] 1980-03-03  ✓ 75 rows
[S] 1980-03-10  ✓ 75 rows
[S] 1980-03-17  ✓ 75 rows
[S] 1980-03-24  ✓ 75 rows
[S] 1980-03-31  ✓ 75 rows
[S] 1980-04-07  ✓ 75 rows
[S] 1980-04-14  ✓ 75 rows
[S] 1980-04-21  ✓ 75 rows
[S] 1980-04-28  ✓ 75 rows
[S] 1980-05-05  ✓ 75 rows
[S] 1980-05-12  ✓ 75 rows
[S] 1980-05-19  ✓ 75 rows
[S] 1980-05-26  ✓ 75 rows
[S] 1980-06-02  ✓ 75 rows
[S] 1980-06-09  ✓ 75 rows
[S] 1980-06-16  ✓ 75 rows
[S] 1980-06-23  ✓ 75 rows
[S] 1980-06-30  ✓ 75 rows
[S] 1980-07-07  ✓ 75 rows
[S] 1980-07-14  ✓ 75 rows
[S] 1980-07-21  ✓ 75 rows
[S] 1980-07-28  ✓ 75 rows
[S] 1980-08-04  ✓ 75 rows
[S] 1980-08-11  ✓ 75 rows
[S] 1980-08-18  ✓ 75 rows
[S] 1980-08-25  ✓ 75 rows
[S] 1980-09-01  ✓ 75 rows
[S] 1980-09-08  ✓ 75 rows
[S] 1980-09-15  ✓ 75 rows
[S] 1980-09-

--- Logging error ---
Traceback (most recent call last):
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 999, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 703, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 392, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
ValueError: unsupported format character ',' (0x2c) at index 34
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Us

ALBUMS (1045):   0%|          | 0/1045 [00:00<?, ?wk/s]

[A] 1979-12-31  ✓ 50 rows
[A] 1980-01-07  ✓ 65 rows
[A] 1980-01-14  ✓ 65 rows
[A] 1980-01-21  ✓ 65 rows
[A] 1980-01-28  ✓ 65 rows
[A] 1980-02-04  ✓ 65 rows
[A] 1980-02-11  ✓ 65 rows
[A] 1980-02-18  ✓ 65 rows
[A] 1980-02-25  ✓ 65 rows
[A] 1980-03-03  ✓ 65 rows
[A] 1980-03-10  ✓ 65 rows
[A] 1980-03-17  ✓ 65 rows
[A] 1980-03-24  ✓ 65 rows
[A] 1980-03-31  ✓ 65 rows
[A] 1980-04-07  ✓ 65 rows
[A] 1980-04-14  ✓ 65 rows
[A] 1980-04-21  ✓ 65 rows
[A] 1980-04-28  ✓ 65 rows
[A] 1980-05-05  ✓ 65 rows
[A] 1980-05-12  ✓ 65 rows
[A] 1980-05-19  ✓ 65 rows
[A] 1980-05-26  ✓ 65 rows
[A] 1980-06-02  ✓ 65 rows
[A] 1980-06-09  ✓ 65 rows
[A] 1980-06-16  ✓ 65 rows
[A] 1980-06-23  ✓ 65 rows
[A] 1980-06-30  ✓ 65 rows
[A] 1980-07-07  ✓ 65 rows
[A] 1980-07-14  ✓ 65 rows
[A] 1980-07-21  ✓ 65 rows
[A] 1980-07-28  ✓ 65 rows
[A] 1980-08-04  ✓ 65 rows
[A] 1980-08-11  ✓ 65 rows
[A] 1980-08-18  ✓ 65 rows
[A] 1980-08-25  ✓ 65 rows
[A] 1980-09-01  ✓ 65 rows
[A] 1980-09-08  ✓ 65 rows
[A] 1980-09-15  ✓ 65 rows
[A] 1980-09-

--- Logging error ---
Traceback (most recent call last):
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 999, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 703, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astaub1\AppData\Local\anaconda3\envs\music_data_chartmetric\Lib\logging\__init__.py", line 392, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
ValueError: unsupported format character ',' (0x2c) at index 34
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Us

# Running a 1 month trial

In [18]:
#-------------
# setup of trial
#--------------

# 1 month trial period for testing

TRIAL_START = date(1990, 5, 7)   # first chart in May-1990
TRIAL_END   = date(1990, 5, 28)  # last chart in May-1990
weeks       = list(weekly_dates(TRIAL_START, TRIAL_END))

In [19]:
# testing on singles charts

trial_csv = CSV_PATH.parent / "de_charts_1_month_trial.csv"

header = not trial_csv.exists()

for week in tqdm(weeks, desc="SINGLES"):
    df = scrape_chart_for_week("singles", week)
    if df is not None:
        df.to_csv(trial_csv, mode="a", index=False, header=header)
        header = False

SINGLES:   0%|          | 0/4 [00:00<?, ?it/s]