In [3]:
# import libraries and packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import os
import glob

In [1]:
# get dates of weekly chart releases
def get_chart_dates_for_year(year):
    """Return every Saturday that falls in ``year`` as ``YYYY-MM-DD`` strings.

    Parameters
    ----------
    year : int
        Calendar year to enumerate.

    Returns
    -------
    list of str
        Saturdays of ``year`` in chronological order, formatted ``%Y-%m-%d``.
    """
    one_week = timedelta(days=7)
    jan_first = datetime(year, 1, 1)

    # weekday() numbers Monday as 0, so Saturday is 5; the modulo gives the
    # number of days from January 1st forward to the first Saturday (0 if it is one)
    days_until_saturday = (5 - jan_first.weekday()) % 7
    saturday = jan_first + timedelta(days=days_until_saturday)

    # step a week at a time until we leave the requested year
    saturdays = []
    while saturday.year == year:
        saturdays.append(saturday.strftime("%Y-%m-%d"))
        saturday += one_week
    return saturdays

# get weekly top 100 billboard chart
def get_weekly_hot100(date_str, timeout = 30):
    """Scrape the Billboard Hot 100 chart page for one chart date.

    Parameters
    ----------
    date_str : str
        Chart date inserted into the URL path (callers in this notebook pass
        ``YYYY-MM-DD`` strings).
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the scrape forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``artist`` and ``chart_date``; one row per chart
        list item that contains an ``<h3>``.

    Raises
    ------
    requests.HTTPError
        If the response status is an error (via ``raise_for_status``).
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    # billboard base URL
    url = f"https://www.billboard.com/charts/hot-100/{date_str}/"
    headers = {"User-Agent": "Mozilla/5.0"}

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    items = soup.select("li.o-chart-results-list__item")

    # initialize lists
    titles = []
    artists = []

    for item in items:
        # list items without an <h3> carry no song title, so skip them
        h3 = item.find("h3")
        if not h3:
            continue

        # get song title
        title = h3.get_text(strip = True)
        titles.append(title)

        # get song artist: the first <span> that follows the title in the document
        span = h3.find_next("span")
        artist = span.get_text(strip = True) if span else "UNKNOWN"
        artists.append(artist)

    # add columns to dataframe (the scalar chart_date is broadcast to every row)
    df = pd.DataFrame({
        "title": titles,
        "artist": artists,
        "chart_date": date_str
    })
    return df

# get all weekly top 100 billboard charts for a year and save to .csv
def get_all_hot100_for_year(year, sleep_time = 0.1, save_dir = "billboard_data"):
    """Scrape every weekly Hot 100 chart of ``year`` and save the unique songs to CSV.

    Parameters
    ----------
    year : int
        Calendar year to scrape.
    sleep_time : float, optional
        Seconds to pause after each weekly request.
    save_dir : str, optional
        Directory for the output file; created if it does not exist.

    Returns
    -------
    pandas.DataFrame or None
        Unique (title, artist, chart_year) rows, also written to
        ``<save_dir>/billboard_hot100_<year>.csv``. ``None`` (and no file)
        when no week produced any rows.

    Notes
    -----
    Scraping is best-effort: a week that raises, or that parses to zero rows,
    is reported and skipped. The CSV is still written when only some weeks
    failed, so check the warning line before trusting a year as complete.
    """
    print(f"\n=== Scraping year {year} ===")
    os.makedirs(save_dir, exist_ok = True)

    # initialize dates and weeks list
    dates = get_chart_dates_for_year(year)
    all_weeks = []
    failed_dates = []

    # get all weekly top 100 billboard charts for a year and add to list
    for d in dates:
        try:
            print(f"Fetching {d}...", end = "")
            df_week = get_weekly_hot100(d)
            # a page that loads but yields no rows is a failed scrape, not a success;
            # otherwise an all-empty year would be saved as a header-only CSV
            if df_week.empty:
                raise ValueError("no chart rows found on page")
            all_weeks.append(df_week)
            print(" OK")
        except Exception as e:
            failed_dates.append(d)
            print(f" ERROR: {e}")

        time.sleep(sleep_time)

    # make partial results visible instead of silently saving an incomplete year
    if failed_dates:
        print(f"WARNING: {len(failed_dates)} of {len(dates)} weeks failed for {year}: {', '.join(failed_dates)}")

    if not all_weeks:
        print(f"No data scraped for {year}")
        return None

    df_all = pd.concat(all_weeks, ignore_index = True)

    # clean title and artist
    df_all["title"] = df_all["title"].str.strip()
    df_all["artist"] = df_all["artist"].str.strip()

    # pull chart year from chart date
    df_all["chart_year"] = pd.to_datetime(df_all["chart_date"]).dt.year

    # drop the weekly date, then keep one row per song per year
    df_all = (
        df_all
        .drop(columns=["chart_date"])
        .drop_duplicates(subset=["title", "artist", "chart_year"])
    )

    # save to csv
    csv_path = os.path.join(save_dir, f"billboard_hot100_{year}.csv")
    df_all.to_csv(csv_path, index = False)
    print(f"Saved {year} data to {csv_path}")
    return df_all

# get weekly top 100 billboard charts for multiple years
def get_hot100_multi_year(start_year, end_year, save_dir = "billboard_data"):
    """Scrape each year from ``start_year`` through ``end_year`` (inclusive).

    A year is skipped when ``<save_dir>/billboard_hot100_<year>.csv`` already
    exists; otherwise it is handed to ``get_all_hot100_for_year``.
    Returns ``None``.
    """
    for year in range(start_year, end_year + 1):
        target = os.path.join(save_dir, f"billboard_hot100_{year}.csv")
        if not os.path.exists(target):
            get_all_hot100_for_year(year, save_dir=save_dir)
        else:
            # an existing CSV marks the year as already done
            print(f"{year} already scraped, skipping...")

# get a unique list of all songs from the saved csvs
def combine_all_csvs(save_dir = "billboard_data"):
    """Load every yearly Hot 100 CSV in ``save_dir`` into one de-duplicated frame.

    Parameters
    ----------
    save_dir : str, optional
        Directory searched for files named ``billboard_hot100_*.csv``.

    Returns
    -------
    pandas.DataFrame or None
        All rows from the matching files with duplicate
        (title, artist, chart_year) rows removed, or ``None`` when no file matches.
    """
    # glob returns matches in arbitrary order; sort so the row order of the
    # combined frame (and which duplicate is kept) is the same on every run
    all_files = sorted(glob.glob(os.path.join(save_dir, "billboard_hot100_*.csv")))
    if not all_files:
        print("No CSV files found.")
        return None

    frames = [pd.read_csv(f) for f in all_files]
    df = pd.concat(frames, ignore_index=True)
    return df.drop_duplicates(subset=["title", "artist", "chart_year"])

In [None]:
# get all weekly top 100 billboard charts from 1990 to 2025
# (a year whose CSV already exists in the default "billboard_data" folder is
#  skipped, so re-running this cell only scrapes the years that are missing)
get_hot100_multi_year(1990, 2025)

In [5]:
# get dataframe of all unique songs
save_dir = "billboard_data"
df = combine_all_csvs(save_dir)

# combine_all_csvs returns None when no yearly CSVs exist; stop with a clear
# message instead of failing later with an AttributeError on None
if df is None:
    raise FileNotFoundError(
        f"No billboard_hot100_*.csv files found in {save_dir!r}; run the scraping cell first."
    )

# save the combined table next to the yearly files
csv_path = os.path.join(save_dir, "all_songs.csv")
df.to_csv(csv_path, index = False)

In [6]:
# preview the first 10 rows of the combined song table
# NOTE(review): the saved output for this cell shows extra columns
# (Unnamed: 0, artist_clean, lyrics) that no visible cell creates — it looks
# stale; confirm by re-running after Restart Kernel -> Run All
df.head(10)

Unnamed: 0,title,artist,chart_year,artist_clean,lyrics
0,I'm Your Angel,R. Kelly&Celine Dion,1999,R KellyCeline Dion,
1,Nobody's Supposed To Be Here,Deborah Cox,1999,Deborah Cox,How did you get here?\nNobody's supposed to be...
2,Lately,Divine,1999,Divine,Lately (I've been watchin' you)\nBeen thinkin'...
3,...Baby One More Time,Britney Spears,1999,Britney Spears,"Oh, baby, baby\nOh, baby, baby\n\nOh, baby, ba..."
4,Because Of You,98 Degrees,1999,98 Degrees,You're my sunshine after the rain\nYou're the ...
5,From This Moment On,Shania Twain,1999,Shania Twain,I do swear that I'll always be there\nI'd give...
6,Doo Wop (That Thing),Lauryn Hill,1999,Lauryn Hill,"Yo, 'member back on the bouley when cats used ..."
7,Trippin',Total FeaturingMissy Elliott,1999,Total,"Bad Boy '98\nTotal, Kima, Keisha, Pam\nMissy, ..."
8,Have You Ever?,Brandy,1999,Brandy,Have you ever loved somebody so much it makes ...
9,Love Like This,Faith Evans,1999,Faith Evans,I never knew there was a love like this before...


In [44]:
# number of non-null values in each column
df.count()

title         18959
artist        18959
chart_year    18959
dtype: int64

In [45]:
# (rows, columns) of the combined song table
df.shape

(18959, 3)