Scraping song lyrics
===

## Setup

In [101]:
# Standard library: HTML entity decoding, I/O, string search and manipulation
import html
import os
import re

# Third-party: tabular data, HTTP requests, HTML scraping
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Read the Genius API token from a local file so it is not stored in the
# notebook. The context manager closes the file handle, and strip() removes
# the trailing newline that would otherwise be sent inside the
# Authorization header.
with open("GENIUS_API_TOKEN.txt", "r") as token_file:
    GENIUS_API_TOKEN = token_file.readline().strip()

# Fail early with a clear message instead of sending an empty bearer token.
if not GENIUS_API_TOKEN:
    raise ValueError("GENIUS_API_TOKEN.txt does not contain a token on its first line")

## Functions

In [95]:
def request_artist_info(artist_name, page):
    """
    Search the Genius API for an artist and return one page of results.

    Parameters
    ----------
    artist_name : str
        Search term, sent as the ``q`` query parameter.
    page : int
        Results page to fetch (10 hits are requested per page; callers in
        this notebook start at page 1).

    Returns
    -------
    requests.Response
        The raw, unparsed response of the ``/search`` endpoint.
    """
    search_url = 'https://api.genius.com/search'
    headers = {'Authorization': 'Bearer ' + GENIUS_API_TOKEN}
    # Send every argument as a URL query parameter. ``data=`` would put the
    # search term into the body of the GET request, which servers and proxies
    # are free to ignore; ``params=`` also URL-encodes the artist name.
    params = {'per_page': 10, 'page': page, 'q': artist_name}
    return requests.get(search_url, params=params, headers=headers)


def request_song_url(artist_name, song_cap):
    """
    Collect Genius.com song URLs for an artist.

    Pages through the search results returned by ``request_artist_info`` and
    keeps the hits whose primary artist name contains ``artist_name``
    (case-insensitive).

    Parameters
    ----------
    artist_name : str
        Artist to search for.
    song_cap : int
        Maximum number of song URLs to return.

    Returns
    -------
    list of str
        At most ``song_cap`` song URLs, in search-result order. Fewer are
        returned when the search results run out first.
    """
    artist_key = artist_name.lower()
    songs = []
    page = 1

    while len(songs) < song_cap:
        response = request_artist_info(artist_name, page)
        hits = response.json()['response']['hits']

        # An empty page means the search results are exhausted. Without this
        # check the loop would request further pages forever whenever the
        # artist has fewer than song_cap matching songs.
        if not hits:
            break

        for hit in hits:
            result = hit['result']
            if artist_key in result['primary_artist']['name'].lower():
                songs.append(result['url'])
                if len(songs) >= song_cap:
                    break

        page += 1

    print('Found {} songs by {}'.format(len(songs), artist_name))

    return songs
    

def scrape_song_lyrics(url):
    """
    Scrape lyrics from a Genius.com song URL.

    Parameters
    ----------
    url : str
        Address of a Genius.com song page.

    Returns
    -------
    str
        The text of every matched lyrics container, joined by newlines, with
        HTML tags and bracketed annotations (e.g. ``[Chorus]``) removed and
        HTML entities decoded.
    """
    # Compile once, outside the loop. Raw strings keep ``\[`` from being
    # parsed as an (invalid) string escape sequence.
    tag_pattern = re.compile(r'<.*?>')
    annotation_pattern = re.compile(r'\[.*?\]')

    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    containers = soup.select('div[class^="Lyrics__Container"], .song_body-lyrics p')

    lyrics = []
    for container in containers:
        # Turn line breaks into newlines before the remaining tags are dropped.
        text = str(container).replace('<br/>', '\n')
        text = tag_pattern.sub('', text)
        text = annotation_pattern.sub('', text)
        # Serialising the element escapes characters such as ``&``; decode
        # the entities so the lyrics read as plain text.
        lyrics.append(html.unescape(text))

    return "\n".join(lyrics)

## Scrape

In [96]:
# Look up the URLs of up to five songs by the artist
song_urls = request_song_url(artist_name='ho99o9', song_cap=5)

song_urls

Found 5 songs by ho99o9


['https://genius.com/Ho99o9-bone-collector-lyrics',
 'https://genius.com/Ho99o9-war-is-hell-lyrics',
 'https://genius.com/Ho99o9-and-ghostemane-twist-of-fate-cobra-lyrics',
 'https://genius.com/Ho99o9-bite-my-face-lyrics',
 'https://genius.com/Ho99o9-street-power-lyrics']

In [97]:
# Scrape the second song found above and show its lyrics as plain text
lyrics = scrape_song_lyrics(url=song_urls[1])

print(lyrics)


Anarchy and chaos
Bombs come and shake the ground
Horror world collide
Might die with the thunder sound
I'll break it down with fire rounds
Rolling through to lay you down
Pound for pound
Get up in your ass
Slash
Buck and a half
Reap what you sow
When the guns start blazing
Full time motherfuckin' maniac
Hell-raisin'
Givin' birth to my seeds
Meet the children of the corn
Act up
Wrong move
Caught slippin'
Now you're gone

You pray to God I pray for hell (pray for hell)
'Cause niggas like me go to hell (go to hell)
If I could rewind time
They'd probably lock me in the cage
For fuckin' white bitches out in Columbine
Since this nigga's off the leash (I'm off the leash)
It's dinner time
Homies on this side
That's homicide on any side
We livin' on the edge
Of your government
Supreme schemes
I've beat and killed your pastor in wet dreams
This evil shit


World power
Rest In Peace
Fuck the pigs
Love your enemies
Eat the rich
'Til you make 'em bleed
Then you kill 'em all
Then repeat


War is h

## Scrape Billboard charts

In [104]:
# Most recent chart date to scrape; earlier chart dates are derived from it
end_date_iso = "2018-08-04"
end_date = pd.Timestamp(end_date_iso)
end_date

Timestamp('2018-08-04 00:00:00')

In [128]:
# Number of yearly chart snapshots to collect, counting back from end_date
n_timestamps = 70

# Step back in whole calendar years. A Timedelta year unit ("y") is ambiguous:
# recent pandas versions reject it, and where it was accepted it stood for a
# fixed average-length year, so the dates drifted away from the same calendar
# day. DateOffset(years=i) keeps the month and day of end_date.
date_list = [end_date - pd.DateOffset(years=i) for i in range(n_timestamps)]

In [144]:
# Collect one record per chart entry and build the DataFrame once at the end;
# concatenating inside the loop copies all previous rows on every iteration.
chart_rows = []

for billboard_date in date_list:

    url = f"https://www.billboard.com/charts/billboard-200/{billboard_date.strftime('%Y-%m-%d')}/"
    res = requests.get(url)
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (same parser as the lyrics scraper).
    soup = BeautifulSoup(res.text, 'html.parser')

    entries = soup.find_all(attrs={'class': 'o-chart-results-list-row-container'})

    # Entries are ranked by their position on the page, starting at 1.
    for rank, e in enumerate(entries, start=1):
        chart_rows.append({
            'date': billboard_date,
            'rank': rank,
            'author': e.h3.find_next('span').get_text(strip=True),
            'title': e.h3.get_text(strip=True),
        })

df = pd.DataFrame(chart_rows, columns=["date", "rank", "author", "title"])

df

In [147]:
# Name the CSV after the earliest and latest chart dates that were scraped
first_date = min(date_list).strftime('%Y-%m-%d')
last_date = max(date_list).strftime('%Y-%m-%d')
df.to_csv(f"billboard_{first_date}-{last_date}.csv", index=False)