In [None]:
'''
author: Alexander Staub
Date: 2025-04-21
Description: Scraper for "hitparadeitalia.it" (Italian weekly singles charts)

'''

In [None]:
"""
Scrape IT singles charts  1980-2000
========================================================
Outputs a single CSV with the columns

    chart_type        "singles" 
    chart_week_start  YYYY-MM-DD (date encoded in the page URL, e.g. hp900106 -> 1990-01-06)
    position          1-20 (or 1-50 if the page shows more; see requirement 2)
    last_week         integer or NaN if not available
    song_title
    artist_name

The script is **checkpoint-aware**: if you stop it midway you can rerun it
and it will skip the weeks already in the CSV.

Requirement summary
-------------------
2) Every available position each week (page displays up to 100)  
4) Produces **one big CSV** in `data/` (relative path)  
5) Works locally & remotely – all paths are relative  
6) Robust to interruptions, rate-limits, and missing weeks  
7) Random delay of roughly 1-3 s between *HTTP* requests (polite crawling)

© 2025 – academic-use only.  ↝ MIT licence if you wish.
"""

In [10]:
# installing required packages
from __future__ import annotations

import csv
import random
import time
import re # for regex
from datetime import datetime
from pathlib import Path
from typing import Iterable, List
import logging
from tqdm.notebook import tqdm # for progress bar

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [32]:
# ──────────────────────────────────────────────────────────────
# CONSTANTS & CONFIG
# ──────────────────────────────────────────────────────────────
# Root of the weekly-chart section of the site; the yearly index pages and the
# weekly chart pages are both addressed relative to this URL.
BASE_URL = "https://hitparadeitalia.it/hp_weeks/"

# Go three levels up from the current working directory.
# NOTE: this depends on where the kernel was started; `parents[2]` raises
# IndexError when the working directory has fewer than three ancestors.
base_dir = Path.cwd().parents[2]
# Single output CSV that every scraped week is appended to.
CSV_PATH = base_dir / "data" / "raw_data" / "country_chart_data" / "it_charts_1980_2000.csv"
CSV_PATH.parent.mkdir(parents=True, exist_ok=True)  # create the output folder if missing


# Default headers applied to every request made through the shared session.
HEADERS = {
    "User-Agent": "chart-research/1.0 (+https://github.com/deskreject)",
    "Accept-Language": "en-GB,en;q=0.9",
}

# Years whose index pages are crawled: 1979 through 2000 inclusive.
# NOTE(review): the CSV name and the notebook header say 1980-2000 — confirm
# that starting at 1979 is intended.
YEARS = list(range(1979, 2000 + 1))

REQUEST_DELAY_RANGE = (1, 3.0)  # (min, max) delay in seconds — polite crawling

# --- Logging configuration ---
LOG_PATH = base_dir / "code" / "logs" / "it_charts_scraping.log" # Log file path
LOG_PATH.parent.mkdir(parents=True, exist_ok=True) # Create logs directory

# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers, so re-running this cell in the same kernel does not reconfigure logging.
logging.basicConfig(
    level=logging.DEBUG, # Log everything from DEBUG upwards (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format="%(asctime)s [%(levelname)s] %(message)s", # Include timestamp and level
    handlers=[
        logging.FileHandler(LOG_PATH, mode='a'), # Append logs to this file
        logging.StreamHandler() # Also print logs to the console
    ]
)

In [33]:
# ──────────────────────────────────────────────────────────────
# HTTP helpers
# ──────────────────────────────────────────────────────────────
# One shared session so every request reuses connections and sends HEADERS.
session = requests.Session()
session.headers.update(HEADERS)


@retry(
    stop=stop_after_attempt(5),
    wait=wait_random_exponential(multiplier=1, max=20),
    reraise=True,
)
def fetch(url: str) -> str:
    """Download ``url`` through the shared session and return the decoded body.

    The call is wrapped by tenacity: up to 5 attempts with randomised
    exponential back-off (capped at 20 s between attempts). Because
    ``reraise=True``, the exception from the final attempt propagates to
    the caller. HTTP error statuses count as failures via
    ``raise_for_status()``.
    """
    response = session.get(url, timeout=30)
    response.raise_for_status()
    return response.text

In [34]:
# ──────────────────────────────────────────────────────────────
# Chart link extraction and parsing logic
# ──────────────────────────────────────────────────────────────
def get_weekly_chart_links(year):
    """Return ``[(full_url, date)]`` for every weekly chart linked from a year's index page.

    Parameters
    ----------
    year : int
        Four-digit year; the index page fetched is ``{BASE_URL}hpw_{year}.htm``.

    Returns
    -------
    list[tuple[str, str]]
        ``(absolute page URL, "YYYY-MM-DD")`` pairs in document order.
        An empty list is returned when the index page cannot be fetched.
        Links whose file name does not encode a valid date are skipped
        with a warning.
    """
    index_url = f"{BASE_URL}hpw_{year}.htm"
    try:
        res = session.get(index_url, timeout=10)
        res.raise_for_status()
    except Exception as e:
        # Best effort: an unreachable year must not abort the whole run.
        # (Nothing has been collected yet, so there is no link count to report.)
        logging.error(f"Failed to fetch {index_url}: {e}")
        return []

    soup = BeautifulSoup(res.content, "html.parser")
    links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        logging.debug(f"Found href: {href}")
        # Weekly pages look like "90/hp900106.htm": two-digit year folder,
        # then "hp" + YYMMDD.
        match = re.match(r"^(\d{2})/hp(\d{6})\.htm$", href)
        if not match:
            logging.debug(f"Did not match pattern: {href}")
            continue

        year_short, datecode = match.groups()
        # Parse date: e.g., hp900106 = 1990-01-06 (year from the folder,
        # month/day from the file name).
        try:
            dt = datetime.strptime(f"{year_short}{datecode[2:]}", "%y%m%d")
        except ValueError:
            # A malformed file name should cost one link, not the whole year.
            logging.warning(f"Skipping link with invalid date code: {href}")
            continue
        links.append((f"{BASE_URL}{href}", dt.strftime('%Y-%m-%d')))

    logging.info(f"{len(links)} weekly links found for year {year}")
    return links

In [35]:
# ──────────────────────────────────────────────────────────────
# Progress / resume helpers
# ──────────────────────────────────────────────────────────────

def already_scraped_weeks() -> set[tuple[str, str]]:
    """Collect the ``(chart_type, chart_week_start)`` pairs already present in the CSV.

    Returns an empty set when the output file does not exist yet; otherwise
    only the two key columns are loaded and de-duplicated into a set.
    """
    if CSV_PATH.exists():
        saved = pd.read_csv(CSV_PATH, usecols=["chart_type", "chart_week_start"])
        return set(zip(saved["chart_type"], saved["chart_week_start"]))
    return set()


def append_to_csv(df: pd.DataFrame) -> None:
    """Append ``df`` to the output CSV, writing the header row only if the file is new."""
    df.to_csv(
        CSV_PATH,
        mode="a",
        index=False,
        header=not CSV_PATH.exists(),  # evaluated before the file is opened/created
        quoting=csv.QUOTE_MINIMAL,
    )

In [44]:
#-────────────────────────────────────────────────────────────
# Chart Page Parsing - handling both logics of italian weekly charts
#-────────────────────────────────────────────────────────────

def parse_chart_page(url, chart_date):
    """Fetch one weekly chart page and return its entries as a list of row dicts.

    The second ``<table>`` on the page (``tables[1]``) is scanned first:
    every ``<tr>`` holding exactly four ``<td>`` cells whose first cell starts
    with a digit becomes one row (position, last week, title, artist). If that
    yields nothing, the first ``<ol>`` on the page is used instead, one
    ``<li>`` per position, split at the first "-" into title and artist.

    Each row dict has the keys ``chart_type``, ``chart_week_start``,
    ``position``, ``last_week``, ``song_title`` and ``artist_name``.
    An empty list is returned when the page cannot be fetched, when it has
    fewer than two tables, or when neither layout produced any row.
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.content, "html.parser")
    except Exception as exc:
        logging.error(f"Error fetching {url}: {exc}")
        return []

    all_tables = page.find_all("table")
    if len(all_tables) < 2:
        logging.warning(f"Less than 2 tables found in {url}")
        return []

    # --- Layout 1: four-column table (tables[1] is the main chart table) ---
    table_rows = []
    for row in all_tables[1].find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 4:
            continue  # wrapper rows / ad cells have a different cell count

        position_text = cells[0].get_text(strip=True).replace('.', '').replace('\xa0', '')
        if not position_text[:1].isdigit():
            continue  # header row or otherwise malformed position cell
        try:
            position = int(position_text)
        except Exception:
            continue

        try:
            previous = int(cells[1].get_text(strip=True))
        except Exception:
            previous = None  # e.g. new entries have no numeric last-week value

        title = cells[2].get_text(strip=True)
        performer = cells[3].get_text(strip=True)
        table_rows.append({
            'chart_type': 'singles',
            'chart_week_start': chart_date,
            'position': position,
            'last_week': previous,
            'song_title': title,
            'artist_name': performer
        })
        logging.debug(f"Parsed row: pos={position}, last={previous}, song={title!r}, artist={performer!r}")

    if table_rows:
        logging.info(f"Parsed {len(table_rows)} rows from {url} ({chart_date})")
        return table_rows

    # --- Layout 2 (fallback): old-style ordered list, "title - artist" per item ---
    ordered_list = page.find('ol')
    items = ordered_list.find_all('li') if ordered_list else []
    list_rows = []
    for rank, item in enumerate(items, start=1):
        # Split at the first hyphen only; without a hyphen the whole text is the title.
        title, _, performer = item.get_text(strip=True).partition('-')
        list_rows.append({
            'chart_type': 'singles',
            'chart_week_start': chart_date,
            'position': rank,
            'last_week': None,
            'song_title': title.strip(),
            'artist_name': performer.strip()
        })
    if list_rows:
        logging.info(f"Parsed {len(list_rows)} rows from <ol> at {url}")
        return list_rows

    logging.warning(f"No data parsed for {url} ({chart_date})")
    return []

In [48]:
def main(full_range=True, trial_year=None, save_every=10):
    """Scrape all available singles charts and append them to the output CSV.

    Parameters
    ----------
    full_range : bool
        When True, every year in ``YEARS`` is crawled; when False only
        ``trial_year`` is crawled.
    trial_year : int | None
        Year to crawl when ``full_range`` is False (required in that case).
    save_every : int
        Number of successfully parsed weeks to buffer before flushing to disk.

    Raises
    ------
    ValueError
        If ``save_every`` is smaller than 1, or if ``full_range`` is False
        and no ``trial_year`` was given.

    Notes
    -----
    Weeks already present in the CSV are skipped. Buffered rows are flushed
    in a ``finally`` block, so an interruption (e.g. ``KeyboardInterrupt``)
    or an unexpected error does not discard weeks that were already parsed.
    """
    if save_every < 1:
        raise ValueError("save_every must be at least 1")
    if not full_range and trial_year is None:
        raise ValueError("trial_year must be given when full_range=False")

    logging.info("="*20 + " Scraping Script Started " + "="*20)
    done = already_scraped_weeks()
    logging.info(f"Already scraped {len(done):,} week/chart combos.")

    years = YEARS if full_range else [trial_year]
    pending_rows = []  # rows parsed but not yet written to disk
    count = 0

    def flush():
        """Write buffered rows through the shared CSV helper and empty the buffer."""
        if pending_rows:
            append_to_csv(pd.DataFrame(pending_rows))
            pending_rows.clear()

    try:
        for year in years:
            for url, chart_date in get_weekly_chart_links(year):
                key = ("singles", chart_date)
                if key in done:
                    continue
                rows = parse_chart_page(url, chart_date)
                if rows:
                    pending_rows.extend(rows)
                    # Remember the week so a second link to it in this run
                    # is not scraped and saved twice.
                    done.add(key)
                    count += 1
                    if count % save_every == 0:
                        flush()
                # Polite crawling: use the configured delay window.
                time.sleep(random.uniform(*REQUEST_DELAY_RANGE))
    finally:
        # Final save — also reached on interruption or error.
        flush()
    logging.info(f"Done! Scraped and saved {count:,} weeks.")

# ---------------------------------------------------
# Run cell
# ---------------------------------------------------
main(full_range=True)        # <- full crawl of YEARS; pass full_range=False plus trial_year=<year> to scrape a single year

2025-05-23 21:44:39,589 [INFO] Already scraped 72 week/chart combos.
2025-05-23 21:51:26,422 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850105.htm (1985-01-05)
2025-05-23 21:51:28,158 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850112.htm (1985-01-12)
2025-05-23 21:51:29,708 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850119.htm (1985-01-19)
2025-05-23 21:51:31,591 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850126.htm (1985-01-26)
2025-05-23 21:51:33,535 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850202.htm (1985-02-02)
2025-05-23 21:51:34,876 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850209.htm (1985-02-09)
2025-05-23 21:51:36,650 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850216.htm (1985-02-16)
2025-05-23 21:51:37,716 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/85/hp850223.htm (1985-02-23)
202

# Running a single-year trial

In [46]:
#-------------
# setup of trial
#--------------

# Year scraped by the trial run below (passed to main() as `trial_year`)
trial_year = 1990  # change as needed


In [47]:
# Trial run on the singles charts: scrape only `trial_year`,
# flushing to the CSV after every 5 parsed weeks.

main(full_range=False, trial_year=trial_year, save_every=5)

2025-05-23 21:44:00,712 [INFO] Already scraped 52 week/chart combos.
2025-05-23 21:44:00,981 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900106.htm (1990-01-06)
2025-05-23 21:44:02,379 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900113.htm (1990-01-13)
2025-05-23 21:44:04,395 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900120.htm (1990-01-20)
2025-05-23 21:44:06,429 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900127.htm (1990-01-27)
2025-05-23 21:44:08,437 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900203.htm (1990-02-03)
2025-05-23 21:44:10,245 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900210.htm (1990-02-10)
2025-05-23 21:44:11,477 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900217.htm (1990-02-17)
2025-05-23 21:44:13,528 [INFO] Parsed 20 rows from https://hitparadeitalia.it/hp_weeks/90/hp900224.htm (1990-02-24)
202

KeyboardInterrupt: 

In [None]:
#________________________
# debugging steps
#_____________________

In [42]:
# Fetch one known weekly page to inspect its table layout.
url = "https://hitparadeitalia.it/hp_weeks/90/hp900106.htm"
res = requests.get(url, timeout=30)  # without a timeout this cell can hang indefinitely
res.raise_for_status()  # fail loudly instead of inspecting an error page
soup = BeautifulSoup(res.content, "html.parser")

# Print all tables with their index
for idx, table in enumerate(soup.find_all("table")):
    print(f"TABLE {idx}\n{'-'*40}")
    print(table.prettify()[:1500])  # print first 1500 chars for brevity

TABLE 0
----------------------------------------
<table border="0" cellpadding="10" cellspacing="0" width="100%">
 <tr>
  <td class="normale">
   <b>
    HitParadeItalia
  -   Top20 del 06 Gennaio 1990
   </b>
  </td>
 </tr>
</table>

TABLE 1
----------------------------------------
<table border="0" cellpadding="1" cellspacing="5" width="100%">
 <tr>
  <td bgcolor="#ffffff">
   <table border="0" cellpadding="5" cellspacing="1" class="piccolo" width="100%">
    <tr align="center" bgcolor="#b5ffda" style="color:black">
     <td>
      <b>
       Pos.
       <br/>
       Att.
      </b>
     </td>
     <td>
      <b>
       Pos.
       <br/>
       Prec.
      </b>
     </td>
     <td align="left">
      <b>
       Titolo
      </b>
     </td>
     <td align="left">
      <b>
       Interprete
      </b>
     </td>
    </tr>
   </table>
  </td>
  <td bgcolor="#ffffff" rowspan="21" width="122">
   <script type="text/javascript">
    <!--
google_ad_client = "pub-9635531430093553";
google_a

In [43]:
# Index of the table to inspect; 1 is the table parse_chart_page() reads.
# Keeping it in one variable makes the printed label match the table dumped.
table_idx = 1  # try 2, 3, 4 if needed
target_table = soup.find_all("table")[table_idx]

rows = target_table.find_all("tr")
print(f"Found {len(rows)} rows in table {table_idx}")
for idx, tr in enumerate(rows):
    tds = tr.find_all("td")
    print(f"Row {idx}: {len(tds)} columns, values: {[td.get_text(strip=True) for td in tds]}")

Found 22 rows in table 2
Row 0: 6 columns, values: ['Pos.Att.Pos.Prec.TitoloInterprete', 'Pos.Att.', 'Pos.Prec.', 'Titolo', 'Interprete', '']
Row 1: 4 columns, values: ['Pos.Att.', 'Pos.Prec.', 'Titolo', 'Interprete']
Row 2: 4 columns, values: ['1', '1', 'Lambada', 'Kaoma']
Row 3: 4 columns, values: ['2', '2', 'Another day in paradise', 'Phil Collins']
Row 4: 4 columns, values: ['3', '5', "Un'estate italiana", 'Gianna Nannini & Edoardo Bennato']
Row 5: 4 columns, values: ['4', '4', 'Varietà', 'Gianni Morandi']
Row 6: 4 columns, values: ['5', '3', 'Sowing the seeds of love', 'Tears For Fears']
Row 7: 4 columns, values: ['6', '9', 'Pump up the jam', 'Technotronic']
Row 8: 4 columns, values: ['7', '15', 'Burning the ground', 'Duran Duran']
Row 9: 4 columns, values: ['8', '7', 'Healing hands', 'Elton John']
Row 10: 4 columns, values: ['9', '6', 'Personal Jesus', 'Depeche Mode']
Row 11: 4 columns, values: ['10', '11', 'The best', 'Tina Turner']
Row 12: 4 columns, values: ['11', '8', 'Ti ric