In [26]:
# Billboard Hot 100 Scraper
# Copy this into your data_collection.ipynb

import requests
from bs4 import BeautifulSoup
import pandas as pd

# ============================================
# STEP 1: Scrape Billboard Hot 100
# ============================================

url = "https://www.billboard.com/charts/hot-100/"

# requests applies no timeout by default; bound the wait so a stalled
# connection cannot hang the kernel indefinitely.
response = requests.get(url, timeout=30)

# Stop on anything other than 200. Continuing would leave `songs`/`artists`
# undefined (NameError further down on a fresh kernel) or, worse, silently
# reuse stale lists left in the kernel by an earlier run of this cell.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve page. Status code: {response.status_code}")

soup = BeautifulSoup(response.text, 'html.parser')

# Current Billboard CSS selectors (as of late 2024)
# These may need tweaking if Billboard updates their site

# Song titles: every h3.c-title nested inside a chart list item.
songs = [element.get_text(strip=True) for element in
         soup.select('li.o-chart-results-list__item h3.c-title')]

# Artist names: label spans nested inside a chart list item.
# NOTE(review): 'a-no-trucate' (sic) is kept exactly as written -- this
# selector matched 100 artists in the recorded run, so the spelling appears
# to be the page's own. Check the live HTML before "correcting" it.
artists = [element.get_text(strip=True) for element in
           soup.select('li.o-chart-results-list__item span.c-label.a-no-trucate')]

print(f"Found {len(songs)} songs and {len(artists)} artists")

# ============================================
# STEP 2: Create DataFrame
# ============================================

# Make sure we have matching lengths.
# `songs` and `artists` come from two independent selectors and are paired
# purely by position, so a length mismatch means some titles may end up next
# to the wrong artist. Truncating keeps the DataFrame constructible, but the
# mismatch is reported instead of being hidden.
if len(songs) != len(artists):
    print(f"WARNING: found {len(songs)} songs but {len(artists)} artists; "
          "truncating to the shorter list, so song/artist pairs may be misaligned")

min_len = min(len(songs), len(artists))
songs = songs[:min_len]
artists = artists[:min_len]

# One row per chart entry, in the order the entries were scraped.
billboard_df = pd.DataFrame({
    'song_title': songs,
    'artist': artists
})

print(billboard_df.head(10))
print(f"\nTotal songs scraped: {len(billboard_df)}")

# ============================================
# STEP 3: Save to CSV
# ============================================

# NOTE(review): absolute path under one user's home directory -- adjust it for
# your own machine/project layout before running.
output_csv = '/Users/chandlershortlidge/Desktop/Ironhack/DA_FT_Extra_Week10/data/billboard_hot100.csv'

# index=False: write only the song_title/artist columns, not the row index.
billboard_df.to_csv(output_csv, index=False)

# Report the path that was actually written rather than a separate
# hard-coded string that can drift out of sync with it.
print(f"\nSaved to {output_csv}")


# ============================================
# DEBUGGING: If the above doesn't work
# ============================================
# 
# Billboard changes their HTML structure often. If you're not getting
# 100 songs, uncomment this block to inspect the page structure:
#
# # Look at all h3 tags
# all_h3 = soup.find_all('h3')
# print(f"Found {len(all_h3)} h3 tags")
# for i, h3 in enumerate(all_h3[:5]):
#     print(f"{i}: {h3.get_text(strip=True)[:50]}... | classes: {h3.get('class')}")
#
# # Look at the raw HTML around the first song
# chart_list = soup.select('li.o-chart-results-list__item')
# if chart_list:
#     print("\nFirst chart item HTML:")
#     print(chart_list[0].prettify()[:1000])

Found 100 songs and 100 artists
                                    song_title  \
0              All I Want For Christmas Is You   
1                               Last Christmas   
2            Rockin' Around The Christmas Tree   
3                             Jingle Bell Rock   
4                                       Golden   
5                          The Fate Of Ophelia   
6                                     Ordinary   
7                                Santa Tell Me   
8  The Christmas Song (Merry Christmas To You)   
9     It's The Most Wonderful Time Of The Year   

                                 artist  
0                          Mariah Carey  
1                                 Wham!  
2                            Brenda Lee  
3                           Bobby Helms  
4  HUNTR/X: EJAE, Audrey Nuna & REI AMI  
5                          Taylor Swift  
6                           Alex Warren  
7                         Ariana Grande  
8                       Nat "King" Cole