In [1]:
import requests
import base64
import pandas as pd
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup as bs4

In [2]:
load_dotenv()
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")

auth_str = f"{CLIENT_ID}:{CLIENT_SECRET}"
b64_auth_str = base64.b64encode(auth_str.encode()).decode()

url = "https://accounts.spotify.com/api/token"
headers = {
    "Authorization": f"Basic {b64_auth_str}",
    "Content-Type": "application/x-www-form-urlencoded"
}
data = {"grant_type": "client_credentials"}

response = requests.post(url, headers=headers, data=data)
token = response.json().get("access_token")
print(token)

BQAbpmVhoZzYFiBEaGPjkMRNEqY4V2i2N1a6v4nJY3ixKqnce5740joa_3W0Zi3soBBWRp3d1F4BOsWvC6BhWFDzSiYFiUeKbkj664kYLHkD7qCMnXUVq8MaQAIm1eo7HWVEIu71-9E


In [3]:
ACCESS_TOKEN = token
playlist_id = '6UeSakyzhiEt4NB3UAd6NQ?si=GzuWZxKKSsagMcd6rjPOpw'

url = f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks"
headers = {"Authorization": f"Bearer {ACCESS_TOKEN}"}

response = requests.get(url, headers=headers)
data = response.json()

# Scrapping Billboard Hot 100 Playlist

In [10]:
track_list = []

# Define max total artists in each playlist data 
max_artists = max(len(item["track"]["artists"]) for item in data["tracks"]["items"])

for item in data["tracks"]["items"]:
    track = item["track"]
    album_name = track["album"]["name"]
    release_date = track["album"]["release_date"]
    track_name = track["name"]
    track_id = track['id']
    track_popularity = track['popularity']
    
    # List for artists name and id
    artist_names = [artist["name"] for artist in track["artists"]]
    artist_ids = [artist["id"] for artist in track["artists"]]

    # Filled artist column with "None" when the total is less than the max 
    while len(artist_names) < max_artists:
        artist_names.append(None)
        artist_ids.append(None)
    
    # Create dictionary for each track
    track_info = {
        "album": album_name,
        "release_date": release_date,
        "track": track_name,
        "track_id" : track_id,
        "track_popularity": track_popularity
    }
    
    # Added artists name and id to dictionary 
    for i in range(max_artists):
        track_info[f"artist_{i+1}"] = artist_names[i]
        track_info[f"id_artist_{i+1}"] = artist_ids[i]
    
    track_list.append(track_info)

In [11]:
df = pd.DataFrame(track_list)
df

Unnamed: 0,album,release_date,track,track_id,track_popularity,artist_1,id_artist_1,artist_2,id_artist_2,artist_3,id_artist_3
0,GNX,2024-11-22,luther (with sza),45J4avUb9Ni0bnETYaYFVJ,93,Kendrick Lamar,2YZyLoL8N0Wb9xBt1NhZWg,SZA,7tYKF4w9nC0nq9CsPZTHyP,,
1,MUSIC,2025-03-14,EVIL J0RDAN,6iycYUk3oB0NPMdaDUrN1w,92,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,,,,
2,Die With A Smile,2024-08-16,Die With A Smile,2plbrEY59IikOBgBGLjaoe,100,Lady Gaga,1HY2Jd0NmPuamShAr6KMms,Bruno Mars,0du5cEVh5yTK9QJze8zA0C,,
3,MUSIC,2025-03-14,RATHER LIE (with The Weeknd),68qeaZhtMZ6abrJCYt6nQn,92,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,,
4,The Giver,2025-03-13,The Giver,5xHgo5JN0wfsV41HnRaos5,89,Chappell Roan,7GlBOeep6PqTfFi59PTUUN,,,,
...,...,...,...,...,...,...,...,...,...,...,...
95,MUSIC,2025-03-14,WALK,5Qya13gFXqupr4sSmZMKDg,77,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,,,,
96,I Ain't Sayin',2024-07-26,I Ain't Sayin',7lCs3stpWCtAn5Y2lmnPhV,77,Jordan Davis,77kULmXAQ6vWer7IIHdGzI,,,,
97,Love Sick (Deluxe),2023-02-28,No Pole,0eaVIYo2zeOaGJeqZ5TwYz,89,Don Toliver,4Gso3d4CscCijv0lmajZWs,,,,
98,Am I Okay?,2024-07-12,Am I Okay?,36wAwEkSDpNQ4oQeRLYC3L,79,Megan Moroney,5Ppie0uPnbnvGBYRwYmlt0,,,,


In [12]:
def get_artist_genre(artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}"
    headers = {"Authorization": f"Bearer {ACCESS_TOKEN}"}
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        data = response.json()
        genres = ", ".join(data.get("genres", []))  # Merge genre as a string
        return genres if genres else None  # If empty, change to None
    return None  # Jika tidak ada genre


df["artist_1_genre"] = df["id_artist_1"].apply(get_artist_genre)

# Scrapping genre from Last.fm

In [13]:
# Function for scrapping genre from Last.fm website
def scrape_lastfm_genre(artist_name):
    formatted_name = artist_name.replace(" ", "+")  # URL Format for artist name
    url = f"https://www.last.fm/music/{formatted_name}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive"
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = bs4(response.text, "html.parser")
        
        # Find the first element within <ul class="tags-list tags-list--global">
        tag_element = soup.select_one("ul.tags-list.tags-list--global li.tag a")
        if tag_element:
            return tag_element.text.strip()  # Take the first genre text

    return None  # If empty

for index, row in df.iterrows():
    if row["artist_1_genre"] is None:  # If the genre None, find in Last.fm
        artist_name = row["artist_1"]
        genre = scrape_lastfm_genre(artist_name)
        df.at[index, "artist_1_genre"] = genre  # Update dataframe

In [14]:
df

Unnamed: 0,album,release_date,track,track_id,track_popularity,artist_1,id_artist_1,artist_2,id_artist_2,artist_3,id_artist_3,artist_1_genre
0,GNX,2024-11-22,luther (with sza),45J4avUb9Ni0bnETYaYFVJ,93,Kendrick Lamar,2YZyLoL8N0Wb9xBt1NhZWg,SZA,7tYKF4w9nC0nq9CsPZTHyP,,,"hip hop, west coast hip hop"
1,MUSIC,2025-03-14,EVIL J0RDAN,6iycYUk3oB0NPMdaDUrN1w,92,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,,,,,rage rap
2,Die With A Smile,2024-08-16,Die With A Smile,2plbrEY59IikOBgBGLjaoe,100,Lady Gaga,1HY2Jd0NmPuamShAr6KMms,Bruno Mars,0du5cEVh5yTK9QJze8zA0C,,,"art pop, pop"
3,MUSIC,2025-03-14,RATHER LIE (with The Weeknd),68qeaZhtMZ6abrJCYt6nQn,92,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,The Weeknd,1Xyo4u8uXC1ZmMpatF05PJ,,,rage rap
4,The Giver,2025-03-13,The Giver,5xHgo5JN0wfsV41HnRaos5,89,Chappell Roan,7GlBOeep6PqTfFi59PTUUN,,,,,pop
...,...,...,...,...,...,...,...,...,...,...,...,...
95,MUSIC,2025-03-14,WALK,5Qya13gFXqupr4sSmZMKDg,77,Playboi Carti,699OTQXzgjhIYAHMy9RyPD,,,,,rage rap
96,I Ain't Sayin',2024-07-26,I Ain't Sayin',7lCs3stpWCtAn5Y2lmnPhV,77,Jordan Davis,77kULmXAQ6vWer7IIHdGzI,,,,,country
97,Love Sick (Deluxe),2023-02-28,No Pole,0eaVIYo2zeOaGJeqZ5TwYz,89,Don Toliver,4Gso3d4CscCijv0lmajZWs,,,,,rap
98,Am I Okay?,2024-07-12,Am I Okay?,36wAwEkSDpNQ4oQeRLYC3L,79,Megan Moroney,5Ppie0uPnbnvGBYRwYmlt0,,,,,country


Data description
- album = Album name
- release_date = Date when album released
- track = Title track that made it to playlist
- track_id = id for track
- track_popularity = The popularity of the track. The value will be between 0 and 100, with 100 being the most popular.
- artist_1,artist_2, etc = artist name
- artist_id = id for artist
- artist_1_genre = The genre that refered to artist_1