Scraping song lyrics
===

## Setup

In [1]:
import requests
from bs4 import BeautifulSoup#
import os
import re
import pandas as pd


GENIUS_API_TOKEN = open("GENIUS_API_TOKEN.txt", "r").readlines()[0]

## Functions

In [12]:
def request_artist_info(artist_name, page):
    """
    Get artist object from Genius API
    
    """
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    search_url = base_url + '/search?per_page=10&page=' + str(page)
    data = {'q': artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response# Get Genius.com song url's from artist object

   
    
def request_song_url(artist_name, title):
    """
    Get url for song
    
    """
    
    page = 0
    url = None
    
    while True:
        response = request_artist_info(artist_name, page)
        json = response.json()        # Collect up to song_cap song objects from artist
        song_info = []
        for hit in json['response']['hits']:
            if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
                song_info.append(hit)
    
        # Collect song URL's from song objects
        for song in song_info:
            if (song['result']['title'].lower() == title.lower()):
                url = song['result']['url']
        
        page += 1
        if url != None:
            break
        if len(json["response"]["hits"]) == 0:
            url = ""
            break
            
    return url
    

def scrape_song_lyrics(url):
    """
    Scrape lyrics from a Genius.com song URL
    
    """
    if url != "":
        page = requests.get(url)
        html = BeautifulSoup(page.text, 'html.parser')
        html_text = html.select('div[class^="Lyrics__Container"], .song_body-lyrics p')
        lyrics = []
        for line in html_text:
            line = str(line)
            to_clean = re.compile('<.*?>')
            remove_chorus = re.compile('\[.*?\]')

            line = re.sub('<br/>', '\n', line)
            line = re.sub(to_clean, '', line)
            line = re.sub(remove_chorus, '', line)
            lyrics.append(line)
        lyrics = "\n".join(lyrics)
    else:
        lyrics = ""
    
    return lyrics

## Scrape

In [6]:
artist_name = 'nine inch nails'
title = 'hurt'

lyrics = scrape_song_lyrics(request_song_url(artist_name, title))

print(lyrics)

got hits for nine inch nails: 10
got url for hurt

I hurt myself today
To see if I still feel
I focus on the pain
The only thing that's real
The needle tears a hole
The old familiar sting
Try to kill it all away
But I remember everything


What have I become?
My sweetest friend
Everyone I know
Goes away in the end

You could have it all
My empire of dirt
I will let you down
I will make you hurt


I wear this crown of shit
Upon my liar's chair
Full of broken thoughts
I cannot repair
Beneath the stains of time
The feelings disappear
You are someone else
I am still right here


What have I become?
My sweetest friend
Everyone I know
Goes away in the end


And you could have it all
My empire of dirt
I will let you down
I will make you hurt


If I could start again
A million miles away
I would keep myself
I would find a way


## Scrape billboard charts

In [7]:
end_date = pd.Timestamp("2018-08-04")
end_date

Timestamp('2018-08-04 00:00:00')

In [8]:
date_list = []

n_timestamps = 70

for i in range(n_timestamps):
    date_list.append(end_date - pd.Timedelta(f"{i} y"))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [120]:
df = pd.DataFrame(columns=["date", "rank", "author", "title"])

for billboard_date in date_list:

    url = f"https://www.billboard.com/charts/hot-100/{billboard_date.strftime('%Y-%m-%d')}/"
    res = requests.get(url)
    soup = BeautifulSoup(res.text)

    data = []

    for e in soup.find_all(attrs={'class':'o-chart-results-list-row-container'}):
        data.append({
            'title':e.h3.get_text(strip=True),
            'author':e.h3.find_next('span').get_text(strip=True)
        })

    df_append = pd.DataFrame(data).reset_index().rename(columns={"index":"rank"})
    df_append["rank"]+=1
    df_append["date"] = billboard_date

    df = pd.concat([df, df_append], sort=False)
  

df

Unnamed: 0,date,rank,author,title
0,2018-08-04 00:00:00,1,Drake,In My Feelings
1,2018-08-04 00:00:00,2,"Cardi B, Bad Bunny & J Balvin",I Like It
2,2018-08-04 00:00:00,3,Maroon 5 Featuring Cardi B,Girls Like You
3,2018-08-04 00:00:00,4,6ix9ine Featuring Nicki Minaj & Murda Beatz,FEFE
4,2018-08-04 00:00:00,5,Post Malone,Better Now
...,...,...,...,...
95,1949-08-04 06:25:12,96,Thurston Harris,Over And Over
96,1949-08-04 06:25:12,97,Robert & Johnny,I Believe In You
97,1949-08-04 06:25:12,98,The Ames Brothers,Little Serenade
98,1949-08-04 06:25:12,99,Billy Williams,I'll Get By (As Long As I Have You)


Save to file

In [122]:
csv_filename = f"billboard_{min(date_list).strftime('%Y-%m-%d')}-{max(date_list).strftime('%Y-%m-%d')}.csv"

idx = 0

while os.path.exists(csv_filename):
    idx += 1
    csv_filename = f"billboard_{min(date_list).strftime('%Y-%m-%d')}-{max(date_list).strftime('%Y-%m-%d')}_{str(idx).zfill(2)}.csv"

df.to_csv(csv_filename, index=False)

Load from file

In [9]:
df = pd.read_csv("billboard_1949-08-04-2018-08-04.csv")

## Get lyrics of songs from billboard charts

In [10]:
songs = df[["author", "title"]].copy()

songs = songs.drop_duplicates(keep="first")

In [17]:
for song in songs.itertuples():
    i = song[0]
    artist_name = song[1]
    title = song[2]
    
    lyrics = scrape_song_lyrics(request_song_url(artist_name, title))
    
    songs.loc[i, "lyrics"] = lyrics

In [18]:
songs.head(25)

Unnamed: 0,author,title,lyrics
0,Drake,In My Feelings,"\nTrap, TrapMoneyBenny\nThis shit got me in my..."
1,"Cardi B, Bad Bunny & J Balvin",I Like It,"\nYeah, baby, I like it like that\nYou gotta b..."
2,Maroon 5 Featuring Cardi B,Girls Like You,
3,6ix9ine Featuring Nicki Minaj & Murda Beatz,FEFE,
4,Post Malone,Better Now,"\nYou prolly think that you are better now, be..."
5,Drake,Nice For What,\nI wanna know who mothafuckin' representin' i...
6,Ella Mai,Boo'd Up,
7,Juice WRLD,Lucid Dreams,"\nEnviyon on the mix\nNo, no, no, no\nNo-no, n..."
8,Post Malone Featuring Ty Dolla $ign,Psycho,
9,Tyga Featuring Offset,Taste,


Save to file

In [20]:
csv_filename = f"billboard_songs_{min(date_list).strftime('%Y-%m-%d')}-{max(date_list).strftime('%Y-%m-%d')}.csv"

idx = 0

while os.path.exists(csv_filename):
    idx += 1
    csv_filename = f"billboard_songs_{min(date_list).strftime('%Y-%m-%d')}-{max(date_list).strftime('%Y-%m-%d')}_{str(idx).zfill(2)}.csv"

songs.to_csv(csv_filename, index=False)

Load from file

In [25]:
songs = pd.read_csv("billboard_songs_1949-08-04-2018-08-04.csv")

In [28]:
sum(pd.notna(songs.lyrics))

576

In [31]:
sum(pd.isna(songs.lyrics))

5507

Found lyrics only for less than 10% of the songs!!!