# Songs Scraping

The following code scrapes the title and artist of every song in the Billboard Year-End Hot 100 list for each year from 1960 to 2023. The pages are fetched from Wikipedia with `requests` and parsed with the Python library BeautifulSoup.

In [None]:
pip install beautifulsoup4 requests

In [53]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

#Function to scrape the songs for a given year
def scrape(url, year, timeout=30):
    """Fetch one chart page and extract its (year, title, artist) rows.

    The page at ``url`` is downloaded with a User-Agent picked at random from
    the module-level ``user_agents_list`` and parsed with BeautifulSoup. Rows
    are read from the first ``<table>`` whose class attribute is exactly
    ``"wikitable sortable"``; the first ``<tr>`` is treated as the header and
    skipped.

    Parameters
    ----------
    url : str
        Address of the page to download.
    year : int
        Value stored as the first element of every returned tuple and used in
        the "table not found" message.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without a
        timeout a stalled connection would block the whole run.

    Returns
    -------
    list[tuple]
        One ``(year, song_title, artist)`` tuple per usable row. When a row
        has a title cell but no artist cell, the artist is the placeholder
        string ``"above"``. Empty if the table is not found.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or times out.
    """
    page = requests.get(
        url,
        headers={'User-Agent': random.choice(user_agents_list)},
        timeout=timeout,
    )
    soup = BeautifulSoup(page.content, 'html.parser')

    # A multi-word class_ string only matches when the tag's class attribute
    # is exactly this string.
    table = soup.find("table", class_="wikitable sortable")

    songs_and_artists = []

    if table is None:
        print(f"Table not found for {year}")
        return songs_and_artists

    for row in table.find_all("tr")[1:]:  #Skip the header row
        cells = row.find_all("td")

        if len(cells) >= 3:
            # Full row: cell 1 is the title, cell 2 the artist
            # (cell 0 is presumably the chart position -- not used here).
            artist = cells[2].text.strip()
        elif len(cells) == 2:
            # No artist cell of its own: record the placeholder so a later
            # step can copy the artist from the preceding row.
            artist = "above"
        else:
            # Fewer than two <td> cells means there is no title to read.
            # Skip the row instead of raising and losing the whole page.
            continue

        song_title = cells[1].text.strip()
        songs_and_artists.append((year, song_title, artist))

    return songs_and_artists

# Pool of browser User-Agent strings; one is picked at random for each request.
user_agents_list = [
    'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
]

# Chart years to scrape: 1960 through 2023, both ends included.
FIRST_YEAR, LAST_YEAR = 1960, 2023
years = [*range(FIRST_YEAR, LAST_YEAR + 1)]

# Accumulates the (year, title, artist) tuples of every year into one flat list.
tot_songs = []

for year in years:
    URL = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'
    try:
        songs = scrape(URL, year)
        # In-place concatenation keeps the list flat (one tuple per song).
        tot_songs += songs
    except Exception as err:
        # A failing year is reported and skipped so the remaining years still run.
        print(f"Error processing {year}: {err}")

column_names = ['Year', 'Song Title', 'Artist']
df_songs = pd.DataFrame(data=tot_songs, columns=column_names)

# Bare last expression so the notebook renders the frame.
df_songs

Unnamed: 0,Year,Song Title,Artist
0,1960,"""Theme from A Summer Place""",Percy Faith
1,1960,"""He'll Have to Go""",Jim Reeves
2,1960,"""Cathy's Clown""",The Everly Brothers
3,1960,"""Running Bear""",Johnny Preston
4,1960,"""Teen Angel""",Mark Dinning
...,...,...,...
6396,2023,"""Bzrp Music Sessions, Vol. 53""",Bizarrap and Shakira
6397,2023,"""Meltdown""",Travis Scott featuring Drake
6398,2023,"""Put It on da Floor Again""",Latto featuring Cardi B
6399,2023,"""Bloody Mary""",Lady Gaga


The following loop scans the dataframe for rows where the artist name is the placeholder "above" and replaces it with the artist name from the preceding row. This step is needed when several consecutive songs are by the same artist: for those rows the scraper finds no artist cell and records "above" instead of a name.

In [60]:

# Resolve the "above" placeholder: each such row takes the artist of the row
# directly above it, so a run of consecutive placeholders ends up with the
# artist of the last non-placeholder row before the run.
artist_col = df_songs['Artist']

# Case-insensitive match on the placeholder. Non-string entries compare as
# False here instead of raising on .lower().
is_placeholder = artist_col.str.lower().eq("above").fillna(False).astype(bool)

# There is no row above the first row, so it is never rewritten.
is_placeholder.iloc[:1] = False

# Blank out the placeholders and forward-fill them from the preceding rows,
# then write the filled values back only where a placeholder was found.
# This is positional, so it does not depend on the index being 0..n-1.
carried_forward = artist_col.mask(is_placeholder).ffill()
df_songs['Artist'] = artist_col.where(~is_placeholder, carried_forward)


# With the default index=True the row index is written as an unnamed first column.
df_songs.to_csv("dataset_songs.csv")

Year                      1961
Song Title    "Spanish Harlem"
Artist             Ben E. King
Name: 163, dtype: object