# Load libraries

In [None]:
# Basic
import pandas as pd
import numpy as np
import time
import os
from tqdm import tqdm
import re
import requests
import pickle
# Scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common import exceptions as SeleniumExceptions
from selenium.webdriver.common.keys import Keys

# Load web driver

In [None]:
# Start a Chrome session; the driver executable path comes from ChromeDriverManager().install()
chromeOptions = webdriver.ChromeOptions()
# Open the window maximized so that all page elements are visible
chromeOptions.add_argument("--start-maximized")
driver = webdriver.Chrome(
    options=chromeOptions,
    service=Service(ChromeDriverManager().install()),
)

In [None]:
root = "https://open.spotify.com/collection/tracks" # URL of the liked-songs collection page that the driver opens

In [None]:
# Load the saved browser cookies from cookies.json (iterated below as a
# sequence of cookie dicts). The encoding is stated explicitly because JSON
# text is UTF-8 and the platform default encoding may differ.
import json
with open('cookies.json', encoding='utf-8') as f:
    cookies = json.load(f)

In [None]:
# Install every saved cookie into the browser through the DevTools protocol
# command Network.setCookie.
optional_cookie_fields = ('path', 'httpOnly', 'secure')
for cookie in tqdm(cookies):
    # domain, name and value are always sent
    cookie_params = {
        'domain': cookie['domain'],
        'name': cookie['name'],
        'value': cookie['value'],
    }
    # Optional attributes are forwarded only when the export contains them, so
    # a cookie lacking one of them does not abort the loop with a KeyError.
    cookie_params.update(
        {field: cookie[field] for field in optional_cookie_fields if field in cookie}
    )
    driver.execute_cdp_cmd('Network.setCookie', cookie_params)

In [None]:
# Open the liked-songs page in the browser session
driver.get(root)

# Start scraping

In [None]:
# Close the player side panel if it exists (it can interfere with selectors and general layout)
selector_aside_player = (By.CSS_SELECTOR, "aside[aria-label]")
selector_close_aside = (By.CSS_SELECTOR, "div[data-testid='PanelHeader_CloseButton']>button")
try:
    # The wait returns the located element, so it is reused instead of being looked up a second time
    aside_player = WebDriverWait(driver, 3).until(EC.presence_of_element_located(selector_aside_player))
    aside_player.find_element(*selector_close_aside).click()
except (SeleniumExceptions.TimeoutException, SeleniumExceptions.NoSuchElementException):
    # Best effort: no panel showed up within 3 seconds, or the panel has no
    # close button, so there is nothing to close.
    pass

In [None]:
# Empty table that receives one row per scraped song
df_songs = pd.DataFrame(
    columns=[
        'title',
        'artist',
        'artist_link',
        'artist_description',
    ],
)

## Scroll until all songs are loaded

In [None]:
# Locators for the track list and for the cells read from each row
selector_songs_parent = (By.CSS_SELECTOR, "div[data-testid='track-list']")
selector_songs_header = (By.CSS_SELECTOR, "div.IpXjqI9ouS_N5zi0WM88")
selector_songs_container = (By.CSS_SELECTOR, "div.JUa6JJNj7R_Y3i4P8YUX")
selector_song_title = (By.CSS_SELECTOR, "a[data-testid='internal-track-link']")
selector_song_artist = (By.CSS_SELECTOR, "a[data-testid='internal-track-link']~span a") # Just first artist is taken
# Rows are buffered in a list and appended to df_songs once, instead of
# concatenating a one-row DataFrame per song (which copies the table each time).
scraped_rows = []
try:
    # Get the number of songs from the track list's aria-rowcount attribute
    songs_parent = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(selector_songs_parent)
    )
    song_count = songs_parent.get_attribute("aria-rowcount")
    song_count = int(song_count) if song_count else 0
    # Song container to scroll to
    # NOTE(review): songs_container is not used below in this cell - confirm whether it is still needed
    songs_container = songs_parent.find_element(*selector_songs_container)
    body = driver.find_element(By.TAG_NAME, "body")
    for i in tqdm(range(2, song_count + 1)): # Row 1 is the header
        song = None
        for _ in range(10): # Scroll until the row is found, max 10 tries
            try:
                song = songs_parent.find_element(By.CSS_SELECTOR, f"div[role='row'][aria-rowindex='{i}']")
                break
            except SeleniumExceptions.NoSuchElementException:
                # Use Keys.PAGE_DOWN to scroll
                songs_parent.find_element(*selector_songs_header).click() # Safe zone to click and enter to focus
                body.send_keys(Keys.PAGE_DOWN)
        if song is None:
            raise Exception(f"Song {i} not found")
        song_title = song.find_element(*selector_song_title)
        song_artist = song.find_element(*selector_song_artist)
        song_artist_link = song_artist.get_attribute("href")

        scraped_rows.append({'title': song_title.text,
                             'artist': song_artist.text,
                             'artist_link': song_artist_link,
                             'artist_description': None})

except SeleniumExceptions.TimeoutException as err:
    raise Exception("Check if the page is loaded correctly and the CSS selector is correct") from err
finally:
    # Runs on success and on failure, so songs scraped before an error are kept
    if scraped_rows:
        df_songs = pd.concat([df_songs, pd.DataFrame(scraped_rows)], ignore_index=True)

## Load artists info

In [None]:
# Empty table for per-artist information
df_artists = pd.DataFrame(
    columns=[
        'name',
        'description',
        'is_famous',
    ],
)

In [None]:
def scrap_artists_info(artist_link: str, driver: webdriver.Chrome, listeners_threshold: int = 500_000) -> pd.DataFrame:
    """Scrape one artist page and return its data as a one-row DataFrame.

    The page at ``artist_link`` is opened in ``driver``. Name, listeners
    text and description are each awaited for up to 10 seconds. The
    listeners count is built from the digits found in the listeners text.

    Args:
        artist_link: URL of the artist page to open.
        driver: Selenium Chrome driver used for navigation and lookups.
        listeners_threshold: The artist is flagged as famous when the
            parsed listeners count is strictly greater than this value.

    Returns:
        A DataFrame with columns ``name``, ``description`` and
        ``is_famous``. ``description`` is ``None`` when the description
        element does not appear in time.

    Raises:
        Exception: If the name or listeners element times out, or if no
            digits can be parsed from the listeners text.
    """
    name_locator = (By.CSS_SELECTOR, "h1.encore-text-headline-large")
    listeners_locator = (By.CSS_SELECTOR, "button[aria-label] div.encore-text-body-medium-bold")
    description_locator = (By.CSS_SELECTOR, "button[aria-label] div.encore-text-body-medium")

    def wait_for_text(locator):
        # Text of the first element matching the locator, waiting at most 10 seconds
        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
        return element.text

    try:
        driver.get(artist_link)
        name = wait_for_text(name_locator)
        listeners_text = wait_for_text(listeners_locator)
        # Keep only the digit groups, dropping separators and the trailing label
        digit_groups = re.findall(r'\d+', listeners_text)
        listeners = int(''.join(digit_groups))
        # Some artists don't have a description
        try:
            description = wait_for_text(description_locator)
        except SeleniumExceptions.TimeoutException:
            description = None
        return pd.DataFrame({'name': [name],
                             'description': [description],
                             'is_famous': [listeners > listeners_threshold]})
    except SeleniumExceptions.TimeoutException:
        raise Exception("Check if the page is loaded correctly and the CSS selector is correct")
    except ValueError:
        raise Exception("Error parsing listeners count")

In [None]:
def api_artists_info(artists_ids: list[str], followers_threshold: int = 1_000_000) -> pd.DataFrame:
    """Fetch artist data from the Spotify Web API.

    An access token is requested with the client-credentials grant, using
    the ``SPOTIFY_CLIENT_ID`` and ``SPOTIFY_CLIENT_SECRET`` environment
    variables. The artists are then requested in batches.

    Args:
        artists_ids: Spotify artist ids to look up.
        followers_threshold: An artist is flagged as famous when its
            follower total is strictly greater than this value.

    Returns:
        A DataFrame with columns ``name``, ``description`` (the artist's
        ``genres`` value from the API), ``followers`` and ``is_famous``,
        one row per artist returned. Empty when ``artists_ids`` is empty.

    Raises:
        Exception: If the credentials are missing or an HTTP request does
            not return status 200.
    """
    columns = ['name', 'description', 'followers', 'is_famous']
    # Nothing to look up: skip the network round trips entirely
    if not artists_ids:
        return pd.DataFrame(columns=columns)

    client_id = os.getenv('SPOTIFY_CLIENT_ID')
    client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise Exception("Client ID and Client Secret are required for Spotify API")

    request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever
    batch_size = 50  # presumably the documented per-request id limit - verify against the API reference

    # Get access token
    token_response = requests.post(
        "https://accounts.spotify.com/api/token",
        headers={'Content-Type': 'application/x-www-form-urlencoded'},
        data={'grant_type': 'client_credentials'},
        auth=(client_id, client_secret),
        timeout=request_timeout,
    )
    if token_response.status_code != 200:
        raise Exception(f"Error: {token_response.status_code}")
    access_token = token_response.json().get('access_token')
    auth_headers = {'Authorization': f'Bearer {access_token}'}

    # Get artists info, one request per batch of ids
    rows = []
    for start in range(0, len(artists_ids), batch_size):
        query_ids = ','.join(artists_ids[start:start + batch_size])
        response = requests.get(
            f'https://api.spotify.com/v1/artists?ids={query_ids}',
            headers=auth_headers,
            timeout=request_timeout,
        )
        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code}")
        for artist_data in response.json()['artists']:
            # Skip null entries (presumably returned for ids the API cannot resolve)
            if artist_data is None:
                continue
            followers = artist_data['followers']['total']
            rows.append({'name': artist_data['name'],
                         'description': artist_data['genres'],
                         'followers': followers,
                         'is_famous': followers > followers_threshold})
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Derive the artist id from the last path segment of each artist link.
# Vectorized string methods leave a missing link as NaN instead of raising,
# and any query string, fragment or trailing slash is removed first.
artist_link_paths = df_songs['artist_link'].str.split(r'[?#]', n=1, regex=True).str[0]
df_songs['artist_id'] = artist_link_paths.str.rstrip('/').str.rsplit('/', n=1).str[-1]

In [None]:
# Query the API for the first five artist ids
api_artists_info(df_songs['artist_id'].head(5).tolist())