In [None]:
'''
author: Alexander Staub
Date: 2025-04-21
Description: Script for scraper of "hitparadeitalia.it", IT charts 

'''

In [None]:
"""
Scrape IT singles charts  1980-2000
========================================================
Outputs a single CSV with the columns

    chart_type        "singles" 
    chart_date        YYYY-MM-DD (Sunday shown in the URL)
    position          1-20 (or 1-50 if the page shows more)*
    last_week         integer or NaN if not available
    song_title
    artist_name

The script is **checkpoint-aware**: if you stop it midway you can rerun it
and it will skip the weeks already in the CSV.

Requirement summary
-------------------
2) Every available position each week (page displays up to 100)  
4) Produces **one big CSV** in `data/` (relative path)  
5) Works locally & remotely – all paths are relative  
6) Robust to interruptions, rate-limits, and missing weeks  
7) Random 2-3 s delay between *HTTP* requests

© 2025 – academic-use only.  ↝ MIT licence if you wish.
"""

In [1]:
# installing required packages
from __future__ import annotations

import csv
import random
import time
import re # for regex
from datetime import datetime
from pathlib import Path
from typing import Iterable, List
import logging
from tqdm.notebook import tqdm # for progress bar

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [29]:
# ──────────────────────────────────────────────────────────────
# CONSTANTS & CONFIG
# ──────────────────────────────────────────────────────────────
BASE_URL = "https://hitparadeitalia.it/hp_weeks/"

# Go three levels up from the current working directory
base_dir = Path.cwd().parents[2]
CSV_PATH = base_dir / "data" / "raw_data" / "country_chart_data" / "it_charts_1980_2000.csv"
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)


HEADERS = {
    "User-Agent": "chart-research/1.0 (+https://github.com/deskreject)",
    "Accept-Language": "en-GB,en;q=0.9",
}


#YEARS = list(range(1979, 2000 + 1)) # full range
Years = list(range(1979, 1985+1)) # limited to unscraped years after first run

REQUEST_DELAY_RANGE = (1, 3.0)  # polite crawling

# --- Add Logging Configuration ---
LOG_PATH = base_dir / "code" / "logs" / "it_charts_scraping.log" # Define log file path
LOG_PATH.parent.mkdir(parents=True, exist_ok=True) # Create logs directory

logging.basicConfig(
    level=logging.DEBUG, # Log INFO, WARNING, ERROR, CRITICAL levels
    format="%(asctime)s [%(levelname)s] %(message)s", # Include timestamp and level
    handlers=[
        logging.FileHandler(LOG_PATH, mode='a'), # Append logs to this file
        logging.StreamHandler() # Also print logs to the console
    ]
)

In [3]:
# ──────────────────────────────────────────────────────────────
# HTTP helpers
# ──────────────────────────────────────────────────────────────
session = requests.Session()
session.headers.update(HEADERS)


@retry(
    reraise=True,
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=20),
)
def fetch(url: str) -> str:
    """GET a URL with retries & exponential back‑off (tenacity)."""
    resp = session.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text

In [4]:
# ──────────────────────────────────────────────────────────────
# Chart link extraction and parsing logic
# ──────────────────────────────────────────────────────────────
def get_weekly_chart_links(year):
    """Return [(full_url, date)] for all weeks in given year."""
    index_url = f"{BASE_URL}hpw_{year}.htm"
    try:
        res = session.get(index_url, timeout=10)
        res.raise_for_status()
    except Exception as e:
        logging.error(f"Failed to fetch {index_url}: {e}")
        logging.info(f"{len(links)} weekly links found for year {year}")
        return []

    soup = BeautifulSoup(res.content, "html.parser")
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        match = re.match(r"^(\d{2})/hp(\d{6})\.htm$", href)
        logging.debug(f"Found href: {href}")
        if match:
            year_short, datecode = match.groups()
            full_url = f"{BASE_URL}{href}"
            # Parse date: e.g., hp900106 = 1990-01-06
            dt = datetime.strptime(f"{year_short}{datecode[2:]}", "%y%m%d")
            links.append((full_url, dt.strftime('%Y-%m-%d')))
        else:
            logging.debug(f"Did not match pattern: {href}")
    return links

In [5]:
# ──────────────────────────────────────────────────────────────
# Progress / resume helpers
# ──────────────────────────────────────────────────────────────

def already_scraped_weeks() -> set[tuple[str, str]]:
    """Read existing CSV (if any) and return {(chart_type, ISO week‑start)}."""
    if not CSV_PATH.exists():
        return set()
    df = pd.read_csv(CSV_PATH, usecols=["chart_type", "chart_week_start"])
    return {(row.chart_type, row.chart_week_start) for row in df.itertuples()}


def append_to_csv(df: pd.DataFrame) -> None:
    header = not CSV_PATH.exists()
    df.to_csv(CSV_PATH, mode="a", index=False, header=header, quoting=csv.QUOTE_MINIMAL)

In [30]:
#-────────────────────────────────────────────────────────────
# Chart Page Parsing - handling both logics of italian weekly charts
#-────────────────────────────────────────────────────────────

def parse_chart_page(url, chart_date):
    try:
        res = session.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return []

    tables = soup.find_all("table")
    rows = []

    # Try table format (modern)
    if len(tables) >= 2:
        chart_table = tables[1]
        trs = chart_table.find_all("tr")
        for tr in trs:
            tds = tr.find_all("td")
            if len(tds) == 4:
                pos_txt = tds[0].get_text(strip=True).replace('.', '').replace('\xa0', '')
                if not pos_txt or not pos_txt[0].isdigit():
                    continue
                try:
                    pos = int(pos_txt)
                except Exception:
                    continue
                last_txt = tds[1].get_text(strip=True)
                try:
                    last = int(last_txt)
                except Exception:
                    last = None
                song = tds[2].get_text(strip=True)
                artist = tds[3].get_text(strip=True)
                rows.append({
                    'chart_type': 'singles',
                    'chart_week_start': chart_date,
                    'position': pos,
                    'last_week': last,
                    'song_title': song,
                    'artist_name': artist
                })
        if rows:
            logging.info(f"Parsed {len(rows)} rows from table in {url} ({chart_date})")
            return rows

    # --- Fallback: raw <ol> parsing (robust to no </li> tags) ---
    ol = soup.find('ol')
    if ol:
        ol_html = str(ol)
        # Find all lines beginning with <li>, case insensitive, greedy to EOL or next tag
        entries = re.findall(r'<li[^>]*>\s*([^<\n\r]+)', ol_html, re.IGNORECASE)
        for pos, entry in enumerate(entries, 1):
            # Split on last dash for song/artist
            if '-' in entry:
                song, artist = [x.strip() for x in entry.rsplit('-', 1)]
            else:
                song, artist = entry.strip(), ""
            rows.append({
                'chart_type': 'singles',
                'chart_week_start': chart_date,
                'position': pos,
                'last_week': None,
                'song_title': song,
                'artist_name': artist
            })
        if rows:
            logging.info(f"Parsed {len(rows)} rows from <ol> via raw parsing in {url} ({chart_date})")
            return rows

    logging.warning(f"No data parsed for {url} ({chart_date})")
    return []

In [31]:
def main(full_range=True, trial_year=None, save_every=10):
    """Scrape all available singles charts (full_range or trial year)."""
    logging.info("="*20 + " Scraping Script Started " + "="*20)
    done = already_scraped_weeks()
    logging.info(f"Already scraped {len(done):,} week/chart combos.")

    years = YEARS if full_range else [trial_year]
    all_rows = []
    count = 0
    for year in years:
        for url, chart_date in get_weekly_chart_links(year):
            key = ("singles", chart_date)
            if key in done:
                continue
            rows = parse_chart_page(url, chart_date)
            if rows:
                all_rows.extend(rows)
                count += 1
                if count % save_every == 0:
                    pd.DataFrame(all_rows).to_csv(CSV_PATH, mode="a", index=False, header=not CSV_PATH.exists())
                    all_rows = []
            time.sleep(random.uniform(1, 2))
    # Final save
    if all_rows:
        pd.DataFrame(all_rows).to_csv(CSV_PATH, mode="a", index=False, header=not CSV_PATH.exists())
    logging.info(f"Done! Scraped and saved {count:,} weeks.")

# ---------------------------------------------------
# Run cell
# ---------------------------------------------------
main(full_range=True)        # <- set to False to re‑run a 1‑month trial

2025-05-28 22:21:28,957 [INFO] Already scraped 841 week/chart combos.
2025-05-28 22:21:28,960 [DEBUG] Resetting dropped connection: hitparadeitalia.it
2025-05-28 22:21:29,341 [DEBUG] https://hitparadeitalia.it:443 "GET /hp_weeks/hpw_1979.htm HTTP/1.1" 200 2991
2025-05-28 22:21:29,349 [DEBUG] Encoding detection: ascii is most likely the one.
2025-05-28 22:21:29,353 [DEBUG] Found href: 79/hp790106.htm
2025-05-28 22:21:29,355 [DEBUG] Found href: 79/hp790113.htm
2025-05-28 22:21:29,356 [DEBUG] Found href: 79/hp790120.htm
2025-05-28 22:21:29,357 [DEBUG] Found href: 79/hp790127.htm
2025-05-28 22:21:29,358 [DEBUG] Found href: 79/hp790203.htm
2025-05-28 22:21:29,358 [DEBUG] Found href: 79/hp790210.htm
2025-05-28 22:21:29,358 [DEBUG] Found href: 79/hp790217.htm
2025-05-28 22:21:29,358 [DEBUG] Found href: 79/hp790224.htm
2025-05-28 22:21:29,358 [DEBUG] Found href: 79/hp790303.htm
2025-05-28 22:21:29,365 [DEBUG] Found href: 79/hp790310.htm
2025-05-28 22:21:29,366 [DEBUG] Found href: 79/hp790317.h

# Running a 1 month / 1 year trial

In [None]:
#-------------
# setup of trial
#--------------

# Set this to your chosen year for trial
trial_year = 1979  # change as needed


In [28]:
# testing on singles charts

main(full_range=False, trial_year=trial_year, save_every=5)

2025-05-28 22:20:22,183 [INFO] Already scraped 841 week/chart combos.
2025-05-28 22:20:22,767 [DEBUG] https://hitparadeitalia.it:443 "GET /hp_weeks/hpw_%5B1979,%201980,%201981,%201982,%201983,%201984,%201985%5D.htm HTTP/1.1" 404 268
2025-05-28 22:20:22,767 [ERROR] Failed to fetch https://hitparadeitalia.it/hp_weeks/hpw_[1979, 1980, 1981, 1982, 1983, 1984, 1985].htm: 404 Client Error: Not Found for url: https://hitparadeitalia.it/hp_weeks/hpw_%5B1979,%201980,%201981,%201982,%201983,%201984,%201985%5D.htm


UnboundLocalError: cannot access local variable 'links' where it is not associated with a value

In [None]:
#________________________
# debugging steps
#_____________________

In [None]:
url = "https://hitparadeitalia.it/hp_weeks/90/hp900106.htm"
res = requests.get(url)
soup = BeautifulSoup(res.content, "html.parser")

# Print all tables with their index
for idx, table in enumerate(soup.find_all("table")):
    print(f"TABLE {idx}\n{'-'*40}")
    print(table.prettify()[:1500])  # print first 1500 chars for brevity

In [None]:
# Assume target table is e.g. tables[1], replace with correct index
target_table = soup.find_all("table")[1]  # try 2, 3, 4 if needed

rows = target_table.find_all("tr")
print(f"Found {len(rows)} rows in table {2}")
for idx, tr in enumerate(rows):
    tds = tr.find_all("td")
    print(f"Row {idx}: {len(tds)} columns, values: {[td.get_text(strip=True) for td in tds]}")

In [19]:
#-----------
# debugging the old chart format
#-----------

test_url = "https://hitparadeitalia.it/hp_weeks/82/hp820102.htm"
rows = parse_chart_page(test_url, "1/2/1982")
for row in rows:
    print(row)

2025-05-28 22:16:53,116 [DEBUG] Resetting dropped connection: hitparadeitalia.it
2025-05-28 22:16:53,411 [DEBUG] https://hitparadeitalia.it:443 "GET /hp_weeks/82/hp820102.htm HTTP/1.1" 200 2097
2025-05-28 22:16:53,413 [INFO] Parsed 10 rows from <ol> via raw parsing in https://hitparadeitalia.it/hp_weeks/82/hp820102.htm (1/2/1982)


{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 1, 'last_week': None, 'song_title': 'Cicale', 'artist_name': 'Heather Parisi'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 2, 'last_week': None, 'song_title': 'Reality', 'artist_name': 'Richard Sanderson'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 3, 'last_week': None, 'song_title': 'Sharazan', 'artist_name': 'Al Bano e Romina'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 4, 'last_week': None, 'song_title': "You can't stay the night", 'artist_name': 'Miguel Bosè'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 5, 'last_week': None, 'song_title': 'Bette Davis eyes', 'artist_name': 'Kim Carnes'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'position': 6, 'last_week': None, 'song_title': 'Every little thing she does..', 'artist_name': 'Police'}
{'chart_type': 'singles', 'chart_week_start': '1/2/1982', 'pos