# Final Project for Data & Databases
### C.J. Robinson
### Fall 2024

-----
### Scrape main page

In [1]:
import asyncio
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

Surprisingly, after pulling the page with requests, it seems to rely on JS to render its content.

In [2]:
# Browser-like User-Agent header sent along with the plain `requests` fetch.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac macOS 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.114 Safari/537.36'
    ),
}

In [3]:
# Plain HTTP fetch of the "most popular of all time" listing, to see what the
# server returns before any JavaScript has run.
my_url = "https://www.ultimate-guitar.com/explore?order=hitstotal_desc"

# requests applies no timeout by default, so without one this cell can hang
# indefinitely if the server stalls.
raw_html = requests.get(my_url, headers=headers, timeout=30).content
soup_doc = BeautifulSoup(raw_html, "html.parser")

# Only the first 10,000 characters of the prettified markup are printed.
print(soup_doc.prettify()[:10000])

<!DOCTYPE html>
<html lang="en">
 <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Most popular of all time chords and tabs @ Ultimate-Guitar.Com
  </title>
  <meta content="Easily find your favorite songs! Most popular of all time chords and tabs for all levels, from beginners to pros. Start playing today!" name="description"/>
  <meta content="" name="keywords"/>
  <link as="script" href="https://www.ultimate-guitar.com/static/public/build/ug_react_es6/vendor.e764671c5b5579043d09c41e89c69d34.js" importance="high" rel="preload"/>
  <link as="script" href="https://www.ultimate-guitar.com/static/public/build/ug_react_es6/202412/4578.367711593e55ab030099747673c177dd.js" importance="high" rel="preload"/>
  <link as="script" href="https://www.ultimate-guitar.com/static/public/build/ug_react_es6/ug~runtime.ee5688b4c4adb49acd73276afd92cb97.js" importance="high"

In [4]:
# Look for an <article> element in the HTML fetched with requests;
# find() gives None when no such element exists in the parsed document.
print(soup_doc.find('article'))


None


### Shifting to Playwright

In [5]:
from playwright.async_api import async_playwright

In [6]:
# Start the Playwright driver, open a visible (headless=False) Chromium window
# with one tab, and navigate that tab to the listing page.
# The goto() call is the last expression, so its Response object is displayed.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()
await page.goto("https://www.ultimate-guitar.com/explore?order=hitstotal_desc")

<Response url='https://www.ultimate-guitar.com/explore?order=hitstotal_desc' request=<Request url='https://www.ultimate-guitar.com/explore?order=hitstotal_desc' method='GET'>>

In [7]:
# Read the page's HTML from the browser tab and parse it, replacing the
# soup_doc built earlier from the plain requests fetch.
html = await page.content()
soup_doc = BeautifulSoup(html, "html.parser")

In [8]:
# Use the second <article> on the page as the container holding the song rows.
# NOTE(review): index 1 is purely positional — this raises IndexError when the
# page has fewer than two <article> elements; confirm it still selects the
# song table if the site layout changes.
table = soup_doc.find_all('article')[1]

### Generalize one loop for the first page

In [9]:
all_songs = []
rank_counter = 1

# Every "LQUZJ" div after the first one is handled as a song row
# (the first is skipped — presumably the table's header row; TODO confirm).
song_rows = table.find_all("div", class_="LQUZJ")[1:]

for row in song_rows:
    # Cells and links are located by the site's generated class names,
    # which appear to be static for now.
    song_dict = {
        # the page carries no explicit rank, so impute it from row order
        'rank': rank_counter,
        'artist': row.find(class_="lIKMM lz4gy").text,
        'song': row.find(class_="lIKMM g7KZB").text,
        'ratings': row.find(class_="lIKMM eznJV").text,
        'hits': row.find(class_="lIKMM UpKH8").text.strip(),
        'type': row.find(class_="lIKMM PdXKy").text,
        'song_link': row.find("a", class_="aPPf7 HT3w5 lBssT")['href'],
        'artist_link': row.find("a", class_="aPPf7 jtEAE lBssT")['href'],
    }

    # Star rating: every full-star span counts 1.0 and every half-star span
    # counts 0.5; empty stars are not counted.
    full_stars = row.find_all("span", class_="kd3Q7 DSnE7")
    half_stars = row.find_all("span", class_="kd3Q7 RCXwf DSnE7")
    star_count = len(full_stars) * 1.0 + len(half_stars) * .5
    song_dict['star_count'] = star_count

    all_songs.append(song_dict)

    # next row gets the next rank
    rank_counter += 1

In [10]:
# Print the raw list of per-song dicts to eyeball the scraped values.
print(all_songs)

[{'rank': 1, 'artist': 'Ed Sheeran', 'song': 'Perfect', 'ratings': '48,237', 'hits': '41,204,641', 'type': 'chords', 'song_link': 'https://tabs.ultimate-guitar.com/tab/ed-sheeran/perfect-chords-1956589', 'artist_link': 'https://www.ultimate-guitar.com/artist/ed_sheeran_30232', 'star_count': 5.0}, {'rank': 2, 'artist': 'Jeff Buckley', 'song': 'Hallelujah (ver\xa02)', 'ratings': '54,482', 'hits': '39,806,615', 'type': 'chords', 'song_link': 'https://tabs.ultimate-guitar.com/tab/jeff-buckley/hallelujah-chords-198052', 'artist_link': 'https://www.ultimate-guitar.com/artist/jeff_buckley_9898', 'star_count': 5.0}, {'rank': 3, 'artist': 'Elvis Presley', 'song': 'Cant Help Falling In Love', 'ratings': '32,809', 'hits': '33,889,387', 'type': 'chords', 'song_link': 'https://tabs.ultimate-guitar.com/tab/elvis-presley/cant-help-falling-in-love-chords-1086983', 'artist_link': 'https://www.ultimate-guitar.com/artist/elvis_presley_11125', 'star_count': 5.0}, {'rank': 4, 'artist': 'Passenger', 'song':

In [11]:
# Turn the list of song dicts into a DataFrame (one row per dict) and display it.
pd.json_normalize(all_songs)

Unnamed: 0,rank,artist,song,ratings,hits,type,song_link,artist_link,star_count
0,1,Ed Sheeran,Perfect,48237,41204641,chords,https://tabs.ultimate-guitar.com/tab/ed-sheera...,https://www.ultimate-guitar.com/artist/ed_shee...,5.0
1,2,Jeff Buckley,Hallelujah (ver 2),54482,39806615,chords,https://tabs.ultimate-guitar.com/tab/jeff-buck...,https://www.ultimate-guitar.com/artist/jeff_bu...,5.0
2,3,Elvis Presley,Cant Help Falling In Love,32809,33889387,chords,https://tabs.ultimate-guitar.com/tab/elvis-pre...,https://www.ultimate-guitar.com/artist/elvis_p...,5.0
3,4,Passenger,Let Her Go,24248,31904487,chords,https://tabs.ultimate-guitar.com/tab/passenger...,https://www.ultimate-guitar.com/artist/passeng...,5.0
4,5,John Legend,All Of Me,26699,29790419,chords,https://tabs.ultimate-guitar.com/tab/john-lege...,https://www.ultimate-guitar.com/artist/john_le...,5.0
5,6,Jason Mraz,Im Yours (ver 11),16069,24611812,chords,https://tabs.ultimate-guitar.com/tab/jason-mra...,https://www.ultimate-guitar.com/artist/jason_m...,5.0
6,7,Led Zeppelin,Stairway To Heaven,14093,24568410,tab,https://tabs.ultimate-guitar.com/tab/led-zeppe...,https://www.ultimate-guitar.com/artist/led_zep...,5.0
7,8,Radiohead,Creep,32992,22828385,chords,https://tabs.ultimate-guitar.com/tab/radiohead...,https://www.ultimate-guitar.com/artist/radiohe...,5.0
8,9,Ed Sheeran,Thinking Out Loud,17356,22366606,chords,https://tabs.ultimate-guitar.com/tab/ed-sheera...,https://www.ultimate-guitar.com/artist/ed_shee...,5.0
9,10,Vance Joy,Riptide (ver 2),28313,21109813,chords,https://tabs.ultimate-guitar.com/tab/vance-joy...,https://www.ultimate-guitar.com/artist/vance_j...,5.0


### Loop through all the pages!

In [15]:
# start up the page!
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()


In [None]:
all_songs = []
rank_counter = 1

for page_num in range(1,5):
    url = f"https://www.ultimate-guitar.com/explore?order=hitstotal_desc&page={str(page_num)}"
    print(url)
    await page.goto(url)
    await page.get_by_text("Artist", exact=True).wait_for()
    html = await page.content()
    soup_doc = BeautifulSoup(html, "html.parser")
    table = soup_doc.find_all('article')[1]
    
    for row in table.find_all("div", class_ = "LQUZJ")[1:]:
        song_dict = {}

        # there's no rank, so imputing one here
        song_dict['rank'] = rank_counter
    
        # based off of class names which...seem static
        try:
            #get just raw artist text (feat, &, commas)
            artist_list = []
            song_dict['artist'] = row.find(class_ ="lIKMM lz4gy").text

            # get each individual artist in a list...just in case!
            for artist in row.find_all('a', class_ ="aPPf7 jtEAE lBssT"):
                artist_list.append(artist.text)
            
            song_dict['artist_list'] = artist_list
        except:
            print(f"No artist for {rank_counter} rank")
            
        try:    
            song_dict['song'] = row.find(class_ ="lIKMM g7KZB").text
        except:
            print(f"No song for {rank_counter} rank")

        try:
            song_dict['ratings'] = row.find(class_ ="lIKMM eznJV").text
        except:
            print(f"No ratings for {rank_counter} rank")
            
        try:   
            song_dict['hits'] = row.find(class_ ="lIKMM UpKH8").text.strip()
        except:
            print(f"No hits for {rank_counter} rank")
            
        try:    
            song_dict['type'] = row.find(class_ ="lIKMM PdXKy").text
        except:
            print(f"No type for {rank_counter} rank")
    
        song_dict['song_link'] = row.find("a", class_ = "aPPf7 HT3w5 lBssT")['href']
        song_dict['artist_link'] = row.find("a", class_ = "aPPf7 jtEAE lBssT")['href']
    
        #stars are a bit interesting
        # take all full stars
        star_count = len(row.find_all("span", class_ = "kd3Q7 DSnE7")) * 1.0
        # add half stars
        star_count += len(row.find_all("span", class_ = "kd3Q7 RCXwf DSnE7")) * .5
        # we wont care about empty stars - assign star
        try:
            song_dict['star_count'] = star_count
        except:
            print(f"No stars for {rank_counter} rank")
        
        #up the counter
        rank_counter += 1
    
        all_songs.append(song_dict)

    # wait a couple seconds
    time.sleep(5)

df = pd.json_normalize(all_songs)
df.to_csv("top_songs.csv")

https://www.ultimate-guitar.com/explore?order=hitstotal_desc&page=1


In [8]:
# Show the last five rows of the scraped table.
df.tail()

Unnamed: 0,rank,artist,song,ratings,hits,type,song_link,artist_link,star_count
195,196,Sia,Chandelier (ver 3),4656,4882356,chords,https://tabs.ultimate-guitar.com/tab/sia/chand...,https://www.ultimate-guitar.com/artist/sia_9670,5.0
196,197,Misc Christmas,Jingle Bells,3919,4879299,chords,https://tabs.ultimate-guitar.com/tab/misc-chri...,https://www.ultimate-guitar.com/artist/misc_ch...,5.0
197,198,America,A Horse With No Name,6979,4845166,chords,https://tabs.ultimate-guitar.com/tab/america/a...,https://www.ultimate-guitar.com/artist/america...,5.0
198,199,Red Hot Chili Peppers,Californication,611,4797670,tab,https://tabs.ultimate-guitar.com/tab/red-hot-c...,https://www.ultimate-guitar.com/artist/red_hot...,4.5
199,200,Bruno Mars,Count On Me (ver 3),6251,4781391,chords,https://tabs.ultimate-guitar.com/tab/bruno-mar...,https://www.ultimate-guitar.com/artist/bruno_m...,5.0
