In [1]:
import sys
import requests
import re
import pandas
import string
from bs4 import BeautifulSoup

# Helper Functions

The following functions are intended to facilitate the data collection (via web scraping) process:

In [28]:
"""
Accepts a Pandas DataFrame column, album_info, which contains
all basic album information manually scraped from Metacritic.
Starting from row 0, every five rows represents an album, with
each set of five rows corresponding to the following information:
    Row 0: Title of album
    Row 1: Average critic score that album received on Metacritic
    Row 2: Name of album artist
    Row 3: Average user score that album received on Metacritic, or "tbd" if unknown
    Row 4: Day on which album was released in the format d-mmm-yy, e.g. 6-Mar-20
-----
Returns: albums, a list of dictionaries where each dictionary is arranged 
in the following format:
    {"artist": (name of album artist),
    "metascore": (average critic score that album received on Metacritic),
    "release_date": (day on which album was released in the format d-mmm-yy, e.g. 6-Mar-20),
    "title": (title of album),
    "user_score": (average user score that album received on Metacritic)}
"""
def organize_basic_album_info(album_info):
    albums = []
    
    # for each set of 5 rows in album_info, maps row number to field name
    field_mapping = {0: "title", 1: "metascore", 2: "artist", 3: "user_score", 4: "release_date"}
    
    for i in range(len(album_info)):
        current_info = album_info[i]
        field_index = i % 5 # used to tell what kind of information is stored in album_info[i]
        album_index = i // 5 # tells us at which index in albums we should store current_info in
        if field_index == 0:
            albums.append(dict())
        # cleans data to obtain user score
        if field_mapping[field_index] == "user_score":
            current_info = current_info[6:]
        albums[album_index][field_mapping[field_index]] = current_info
    return albums

In [17]:
"""
Given BeautifulSoup object, this function constructs a list of 
URLs found on web page in question, each of which leads to page 
containing song lyrics
-----
Returns: urls, a list of URLs as described above
"""
def find_lyric_urls(soup):
    urls = []
    for elem in soup.select(".chart_row-content a"):
        urls.append(elem["href"])
    return urls

In [24]:
"""
Given the URL of a Genius page containing song lyrics,
this function returns all of the song lyrics as a single string.

Example input: "https://genius.com/Childish-gambino-time-lyrics"
-----
Returns: lyrics, a string as described above
"""
def retrieve_lyrics(url):
    lyrics = ""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
    lyric_div = soup.select(".lyrics")
    if len(lyric_div) > 0:
        for elem in lyric_div:
            # filter out lyric annotations
            lyrics += re.sub("\[.*\]", "", elem.text)
    return lyrics

In [30]:
"""
Accepts a list of albums, organized as a list of dictionaries
such that each dictionary features the following structure:
    {"artist": (name of album artist),
    "metascore": (average critic score that album received on Metacritic),
    "release_date": (day on which album was released in the format d-mmm-yy, e.g. 6-Mar-20),
    "title": (title of album),
    "user_score": (average user score that album received on Metacritic)}

For each album in albums, this function finds all available album lyrics
and stores them as a single string in albums.
"""
def add_album_lyrics(albums):
    for album in albums:
        # replacing all punctuation in album title and album artist with spaces
        artist_no_punc = re.sub("[" + string.punctuation + "]", " ", album["artist"])
        title_no_punc = re.sub("[" + string.punctuation + "]", " ", album["title"])

        # arranging album title and artist with '-' separating each space-separted word, 
        # and just the first word capitalized
        # ex: "After Hours" becomes "After-hours"; The Weeknd" becomes "The-weeknd"
        reformatted_artist_name = "-".join(artist_no_punc.lower().split(" ")).capitalize()
        reformatted_album_name = "-".join(title_no_punc.lower().split(" ")).capitalize()

        # construct URL we expect to correspond to Genius overview page on album
        genius_url = "http://genius.com/albums/" + reformatted_artist_name + "/" + reformatted_album_name

        album["lyrics"] = []
        try:
            response = requests.get(genius_url)
            soup = BeautifulSoup(response.text, "lxml")

            # retrieve URLs corresponding to available lyric pages for each album track
            urls = find_lyric_urls(soup)

            # builds out "lyrics" field for each album by retrieving all available album lyrics
            for url in urls:
                lyrics = retrieve_lyrics(url)
                album["lyrics"].append(lyrics)
        except:
            # encountered error in above code due to invalid URL
            continue

# Web Scraping

Here, we scrape data from Metacritic reviews, Genius album tracklists, 
and Genius lyrics pages to ultimately build a list of dictionaries,
such that each dictionary features the following structure:

```
{"artist": (name of album artist),
 "metascore": (average critic score that album received on Metacritic),
 "release_date": (day on which album was released in the format d-mmm-yy, e.g. 6-Mar-20),
 "title": (title of album),
 "user_score": (average user score that album received on Metacritic),
 "lyrics": (all available album lyrics, as a list of strings with one string per lyrics page found)}
```

This list of dictionaries will then be converted into a Pandas DataFrame, which will serve as the primary dataset throughout this final project.

In [29]:
# read from CSV file, which was manually constructed by copying
# and pasting data from Metacritic lists of top 50 highest ranked albums
# for each year from 2010-2020. (Rankings for 2020 as of May 5, 2020.)
# Albums here are ranked by average critic score, rather than average 
# Metacritic user score.
# NOTE(review): the per-year slices taken later in this notebook span about
# 100 albums each, which does not match "top 50" -- confirm against the CSV.
# header=0: the first line of the file supplies the column names.
metacritic_data = pandas.read_csv("data/metacritic_data.csv", header=0)

# initialize albums list using information from CSV file
# ("Information" is the single column holding every scraped value, five
# consecutive rows per album, as organize_basic_album_info expects)
album_info = metacritic_data["Information"]
albums = organize_basic_album_info(album_info)

Now that the basic album information has been loaded, we retrieve all available lyrics for each album. We proceed year by year for organizational purposes:

In [31]:
# finding lyrics for Top 50 albums in 2020, based off Metacritic score
# NOTE(review): the slice below spans the first 100 entries of `albums`, which
# does not match "Top 50" -- confirm which of the two is intended.
# The slice is a new list holding the same dictionaries, so the "lyrics" field
# that add_album_lyrics sets on each one is also visible through `albums`.
albums_2020 = albums[:100]
add_album_lyrics(albums_2020)
# displays every album in the slice, including the full text of its lyrics
albums_2020

[{'artist': 'Fiona Apple',
  'lyrics': ["\n\n\nI've waited many years\nEvery print I left upon the track\nHas led me here\nAnd next year, it'll be clear\nThis was only leading me to that\nAnd by that time, I hope that\n\n\nYou love me\nYou love me\n\n\nI move with the trees in the breeze\nI know that time is elastic\nAnd I know when I go\nAll my particles disband and disperse\nAnd I'll be back in the pulse\nAnd I know none of this will matter in the long run\nBut I know a sound is still a sound around no one\nAnd while I'm in this body\nI want somebody to want\nAnd I want what I want and I want\n\n\nYou to love me\nYou\n\n\nAnd I know that you do\nIn the dark, I know that you do\nAnd I know that you know that you got\nThe potential to pick me up\nAnd I want you to use it, blast the music\nBang it, bite it, bruise it\nWhenever you want to begin, begin\nWe don't have to go back to where we've been\nI am the woman who wants you to win\nAnd I've been waiting, waiting for\n\n\nYou to love m

In [34]:
# finding lyrics for Top 50 albums in 2019, based off Metacritic score
# NOTE(review): the slice below spans 100 entries (positions 100-199), which
# does not match "Top 50" -- confirm which of the two is intended.
# The sliced list shares its dictionaries with `albums`, so the lyrics are
# stored there as well.
albums_2019 = albums[100:200]
add_album_lyrics(albums_2019)

In [36]:
# finding lyrics for Top 50 albums in 2018, based off Metacritic score
# NOTE(review): the slice below spans 100 entries (positions 200-299), which
# does not match "Top 50" -- confirm which of the two is intended.
# The sliced list shares its dictionaries with `albums`, so the lyrics are
# stored there as well.
albums_2018 = albums[200:300]
add_album_lyrics(albums_2018)

In [38]:
# finding lyrics for Top 50 albums in 2017, based off Metacritic score
# NOTE(review): the slice below spans 100 entries (positions 300-399), which
# does not match "Top 50" -- confirm which of the two is intended.
# The sliced list shares its dictionaries with `albums`, so the lyrics are
# stored there as well.
albums_2017 = albums[300:400]
add_album_lyrics(albums_2017)

In [42]:
# finding lyrics for Top 99 albums in 2016, based off Metacritic score
# (positions 400-498 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2016 = albums[400:499]
add_album_lyrics(albums_2016)

In [48]:
# finding lyrics for Top 99 albums in 2015, based off Metacritic score
# (positions 499-597 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2015 = albums[499:598]
add_album_lyrics(albums_2015)

In [68]:
# finding lyrics for Top 95 albums in 2014, based off Metacritic score
# (positions 598-692 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2014 = albums[598:693]
add_album_lyrics(albums_2014)

In [125]:
# finding lyrics for Top 98 albums in 2013, based off Metacritic score
# (positions 693-790 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2013 = albums[693:791]
add_album_lyrics(albums_2013)

In [126]:
# finding lyrics for Top 99 albums in 2012, based off Metacritic score
# (positions 791-889 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2012 = albums[791:890]
add_album_lyrics(albums_2012)

In [127]:
# finding lyrics for Top 96 albums in 2011, based off Metacritic score
# (positions 890-985 of `albums`; the sliced list shares its dictionaries with
# `albums`, so the lyrics are stored there as well)
albums_2011 = albums[890:986]
add_album_lyrics(albums_2011)

In [128]:
# finding lyrics for Top 98 albums in 2010, based off Metacritic score
# The open-ended slice takes every album from position 986 to the end of
# `albums` (98 albums assumes `albums` has 1084 entries -- TODO confirm).
# The sliced list shares its dictionaries with `albums`, so the lyrics are
# stored there as well.
albums_2010 = albums[986:]
add_album_lyrics(albums_2010)

Having retrieved all available lyrics data, we now can convert our list of dictionaries into a Pandas DataFrame, a sample of which is displayed below:

In [135]:
# creating final DataFrame object: one row per album dictionary and one column
# per dictionary key, including the "lyrics" lists filled in by the per-year cells
albums_df = pandas.DataFrame(albums)

In [137]:
# preview the first rows of the assembled dataset
albums_df.head()

Unnamed: 0,artist,lyrics,metascore,release_date,title,user_score
0,Fiona Apple,[\n\n\nI've waited many years\nEvery print I l...,100,17-Apr-20,Fetch the Bolt Cutters,8.8
1,Rina Sawayama,[\n\n\nI'm losing myself\nIn the darkness of t...,90,17-Apr-20,Sawayama,9.1
2,Laura Marling,[\n\n\nWhat became of Alexandra\nDid she make ...,90,10-Apr-20,Song for Our Daughter,8.7
3,BC Camplight,[\n\n\nThis afternoon I thought about Buckfast...,89,24-Apr-20,Shortly After Takeoff,8.4
4,Dua Lipa,[\n\n\nFuture\n(Future nostalgia)\n(Future nos...,89,27-Mar-20,Future Nostalgia,9.0


The above DataFrame `albums_df` can now be used to perform the data analysis required for this project. We will export this DataFrame as a CSV file below, so as to be able to access it outside of this Jupyter Notebook:

In [141]:
# Export next to the input data, using the same relative "data/" folder that the
# rest of this notebook reads from, so the cell runs on any machine and the
# read-back check below looks at the file written here.
# The DataFrame index is written too (to_csv default); it comes back as an
# "Unnamed: 0" column when the file is read again.
albums_df.to_csv("data/albums.csv")

Here, we briefly check that the export completed successfully:

In [142]:
# read the exported file back into a second DataFrame and preview it;
# the "lyrics" lists come back as their text representation, and the saved
# index appears as an extra "Unnamed: 0" column
albums_df_duplicate = pandas.read_csv("data/albums.csv")
albums_df_duplicate.head()

Unnamed: 0.1,Unnamed: 0,artist,lyrics,metascore,release_date,title,user_score
0,0,Fiona Apple,"[""\n\n\nI've waited many years\nEvery print I ...",100,17-Apr-20,Fetch the Bolt Cutters,8.8
1,1,Rina Sawayama,"[""\n\n\nI'm losing myself\nIn the darkness of ...",90,17-Apr-20,Sawayama,9.1
2,2,Laura Marling,"[""\n\n\nWhat became of Alexandra\nDid she make...",90,10-Apr-20,Song for Our Daughter,8.7
3,3,BC Camplight,['\n\n\nThis afternoon I thought about Buckfas...,89,24-Apr-20,Shortly After Takeoff,8.4
4,4,Dua Lipa,"[""\n\n\nFuture\n(Future nostalgia)\n(Future no...",89,27-Mar-20,Future Nostalgia,9.0


In [143]:
# The export is considered successful when the file read back from disk has
# exactly as many rows as the in-memory DataFrame.
if len(albums_df) == len(albums_df_duplicate):
    print("It appears that the albums_df DataFrame was exported successfully!")
else:
    # report the mismatch explicitly so a failed export cannot go unnoticed
    print(
        "WARNING: the exported CSV has " + str(len(albums_df_duplicate))
        + " rows, but albums_df has " + str(len(albums_df)) + " rows."
    )

It appears that the albums_df DataFrame was exported successfully!
