In [2]:
import requests
import urllib
import html2text as ht
import re
import pandas as pd
import numpy as np
import json
import string
import traceback
from bs4 import BeautifulSoup
from collections import Counter

This file was created by Charlie ___ . It will be used again to collect data for the Frost 2019 project, with only the paths and the number of weeks changed. 

## Scrape Billboard for song titles

In [3]:
class DateTracker:
    
    days = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 
            7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
    
    def __init__(self, year = None, month = None, day = None):
        self.year = year
        self.month = month
        self.day = day
        
    def previous_week(self):
        
        if (self.year == 1958) and (self.month == 8) and (0 < self.day <= 7):
            return
        
        # if move to previous month
        if self.day - 7 <= 0:
            
            if self.month != 1:
                
                self.month -= 1
                self.day += self.days[self.month] - 7
            
            # if move to previous year
            else:
                
                self.year -= 1
                self.month = 12
                self.day += self.days[self.month] - 7
                
        else:
            
            self.day -= 7
            
    def as_date(self):
        
        return "%04d-%02d-%02d" % (self.year, self.month, self.day)
        
        
class Song:
    
    def __init__(self, artist, name, rank, peakpos, weeksonchart, date):
        self.artist = artist
        self.name = name
        self.rank = rank
        self.peakpos = peakpos
        self.weeksonchart = weeksonchart
        self.date = date
        
    def get_artist(self):
        return self.artist
    
    def get_name(self):
        return self.name
    
    def get_rank(self):
        return self.rank
    
    def get_peak_pos(self):
        return self.peakpos
    
    def get_weeks_on_chart(self):
        return self.weeksonchart
    
    def get_date(self):
        return self.date
    
    def list_form(self):
        return [self.artist, self.name, self.rank, self.peakpos,
                self.weeksonchart, self.date]
    
    def summary(self):
        print("\n".join([str(x) for x in [self.artist, self.name,
                                          self.rank, self.peakpos,
                                          self.weeksonchart, self.date]]))

In [4]:
def clean_text(text):
    bad = {"\\\\n": "\n", 
           "\n+": "\n", 
           "\n": " - ",
           "\[\]\(.{0,}?\)": "",
           "\[Play\]\(.{0,}?\)": "",
           "\[\s{0,5}-\s{0,5}Song\s{0,5}-\s{0,5}Lyrics\s{0,5}-\s{0,5}\]\(.{0,}?\)": ""}

    for x in bad:
        text = re.sub(x, bad[x], text, flags = re.I | re.S)
        
    return text

def clean_line(text):
    repl = {"weeks at no. 1": "",
            "(-\s{1,4}!)+": "",
            "(?<![A-z])! -": "",
            "_": "",
            "\[\s-": "", 
            "-\s\]": "",
            "\[.{0,}?\]|\(.{0,}?\)": "",
            "\s+": " ",
            "(-\s{1,5})+": "- ",}    
    
    for x in repl:
        text = re.sub(x, repl[x], text, flags = re.I | re.S)
    return text

In [8]:
def get_songs_by_week(weeks = 100):
    
    #filler = [" and ", " featuring ", " & ", " x ", " / "]
    namesub = {"f\*\*k": "fuck", "s\*\*t": "shit"}
    path = "https://www.billboard.com/charts/hot-100"
    curdate = DateTracker(year = 2019, month = 7, day = 6)
    songs = []

    for j in range(weeks):
        
        page = requests.get(path + "/" + curdate.as_date())
        if j % 20 == 0:
            print("We are on week {}, it is {}".format(j, curdate.as_date()))

        data = ht.html2text(str(page.content))
        data = clean_text(data)
        data = re.split("date search |in performance - |on chart - ", data, flags = re.I)[1:]

        #first one
        #Small adjust
        try:
            weeksoc = int(re.search("[0-9]{0,1}\s-\sweeks", 
                                    re.sub("weeks at no. 1", "", data[0], flags = re.I | re.S),
                                    flags = re.I | re.S).group()[:2])
        except:
            weeksoc = ""
        tmp = clean_line(data[1]).strip().split("-")
        name = tmp[0].strip()
        for k in namesub:
            name = re.sub(k, namesub[k], name, flags = re.I | re.S)
        artist = tmp[1].strip()
        # for k in filler:
        #     artist = re.sub(k, ", ", artist, flags = re.I | re.S)

        songs.append(Song(artist = artist, name = name, rank = 1,
                          peakpos = 1, weeksonchart = weeksoc, 
                          date = curdate.as_date()))


        #everything else (2 - 100)
        for i in data[2:]:
            #we know we have a rank
            if re.match("^[0-9]{0,3}", i.strip()).group() != "":
                i = clean_line(i)
                tmp = i.split("-")
                rank = tmp[0].strip()
                name = tmp[1].strip()
                for k in namesub:
                    name = re.sub(k, namesub[k], name, flags = re.I | re.S)
                artist = tmp[2].strip()
                # for k in filler:
                #     artist = re.sub(k, ", ", artist, flags = re.I | re.S)
                    
                #if these are empty they're debuts!
                try:
                    peakpos = int(re.search("[0-9]{0,3}\s-\speak position", i, flags = re.I | re.S).group().split(" ")[0])
                except AttributeError:
                    peakpos = ""
                try:
                    weeksoc = int(re.search("[0-9]{0,3}\s-\sweeks", i, flags = re.I | re.S).group().split(" ")[0])
                except AttributeError:
                    weeksoc = ""

                songs.append(Song(artist = artist, name = name, rank = rank,
                                  peakpos = peakpos, weeksonchart = weeksoc,
                                  date = curdate.as_date()))

        curdate.previous_week()
        
    return songs, curdate

In [9]:
songs, curdate = get_songs_by_week(weeks = 1043)

We are on week 0, it is 2019-07-06
We are on week 20, it is 2019-02-16
We are on week 40, it is 2018-09-29
We are on week 60, it is 2018-05-12
We are on week 80, it is 2017-12-23
We are on week 100, it is 2017-08-05
We are on week 120, it is 2017-03-18
We are on week 140, it is 2016-10-29
We are on week 160, it is 2016-06-11
We are on week 180, it is 2016-01-22
We are on week 200, it is 2015-09-04
We are on week 220, it is 2015-04-17
We are on week 240, it is 2014-11-28
We are on week 260, it is 2014-07-11
We are on week 280, it is 2014-02-21
We are on week 300, it is 2013-10-04
We are on week 320, it is 2013-05-17
We are on week 340, it is 2012-12-28
We are on week 360, it is 2012-08-10
We are on week 380, it is 2012-03-23
We are on week 400, it is 2011-11-03
We are on week 420, it is 2011-06-16
We are on week 440, it is 2011-01-27
We are on week 460, it is 2010-09-09
We are on week 480, it is 2010-04-22
We are on week 500, it is 2009-12-03
We are on week 520, it is 2009-07-16
We are 

In [10]:
prep = np.array([x.list_form() for x in songs])
df = pd.DataFrame({
    "Artists": prep[:, 0],
    "Name": prep[:, 1],
    "Weekly rank": prep[:, 2],
    "Peak position": prep[:, 3],
    "Weeks on chart": prep[:, 4],
    "Week": prep[:, 5]
})

In [1]:
df.to_csv("FrostData/Weekly_ranks.csv", header = True, index = False)

NameError: name 'df' is not defined

In [5]:
df = pd.read_csv("FrostData/Weekly_ranks.csv", encoding = "utf-8")

## Scrape Genius for lyrics, genre, etc.


Song name

Artist/Group

Weekly rank 

Peak rank

Year published

Genre

Writing Credits

Lyrics



In [6]:
re.search("A\$AP", "HaNDGUN by YG (Ft. a$aP Rocky)", flags = re.I | re.S)

<re.Match object; span=(19, 23), match='a$aP'>

In [7]:
#get_lyric_link(df.iloc[97]["Artists"], df.iloc[97]["Name"], debug = True)

In [8]:
def get_lyric_link(artists, name, both = False, debug = False):
    
    """
    Returns the link of the song name given artists + name of the song
    """
    
    client_token = "BrGsH3KoiMzSyCUClF4-TyzjrNfQfrr2-Q9bfK6Bhum1fquRgVf0rn-Pq6mr9Uyc"
    headers = {"Authorization": "Bearer " + client_token}
    link = "https://api.genius.com/search?q="
    space = "%20"
    
    #this could prove disastrous, double check it...
    # update: yes it did, just replace accents.
    name_repl = {"Beyonce": "Beyoncé", 
                 "Amine": "Aminé",
                 "D.R.A.M.": "DRAM",
                 "$ign": "\$ign"}
    repl = {
        "a|á|ạ|à|ả|ã|ă|ắ|ặ|ằ|ẳ|ẵ|â|ấ|ậ|ầ|ẩ|ẫ": "a",
        "é|ẹ|è|ẻ|ẽ|ê|ế|ệ|ề|ể|ễ|ë": "e",
        "í|ị|ì|ỉ|ĩ": "i",
        "ñ":"n",
        "ó|ọ|ò|ỏ|õ|ô|ố|ộ|ồ|ổ|ỗ|ơ|ớ|ợ|ờ|ỡ": "o",
        "ú|ụ|ù|ủ|ũ|ư|ứ|ự|ừ|ử|ữ": "u",
        "ý|ỵ|ỳ|ỷ|ỹ": "y",
        "\’":"\'"
    }
    artistregex = {
        "\$": "\\\\$"
    }
    filler = [" and ", " featuring ", " & ", " x ", " / "]

    artiststmp = re.sub(",", "", artists)
    name = re.sub(",", "", name)
    if both:
        page = requests.get(link + re.sub(" ", space, name) +
                            space + re.sub(" ", space, artiststmp), headers = headers)
    else:
        page = requests.get(link + re.sub(" ", space, name), headers = headers)
        
    # now that we searched, remove filler words that may not appear
    # in actual song title
    page = json.loads(page.content)["response"]["hits"]
    for i in filler:
        artiststmp = re.sub(i, " ", artiststmp, flags = re.I | re.S)
    check = [re.sub(",", "", x) for x in artiststmp.split(" ") + name.split(" ") if x not in string.punctuation]


    # fix artist tokens to be used in re.search
    for i in artistregex:
        for j in range(len(check)):
            check[j] = re.sub(i, artistregex[i], check[j], flags = re.I | re.S)
            
    if debug:
        print(check)    
        
    top = []

    
    
    if len(page) == 1:
        return page[0]["result"]["path"]
    else:
        for i in range(len(page)):
            c = 0
            
            # remove accents from title
            title = page[i]["result"]["full_title"]
            for j in repl:
                title = re.sub(j, repl[j], title, flags = re.I | re.S)

            if debug:
                print(title)
                
            # check if every artist + name token in the full title
            for j in check:
                if re.search(j, title, flags = re.I | re.S) != None:
                    c += 1
            if c == len(check):
                try:
                    if page[i]["result"]["stats"]["pageviews"] > 0:
                        return page[i]["result"]["path"]
                except KeyError:
                    continue

        
    #if we are here, search failed. now we include the artists as well
    if both == False:
        return get_lyric_link(artiststmp, name, True)
    
    top = []
    
    #if here, find most popular song
    #print()
    #print("hopefully we never see this message.")
    #print(artiststmp, name)
    for j in range(len(page)):
        try:
            top.append(page[i]["result"]["stats"]["pageviews"])
        except Exception:
            top.append(-1)
    #print(page[np.argmax(top)]["result"]["path"])
    #print()
    return page[np.argmax(top)]["result"]["path"]





def scrape_lyrics(row, snip):
    
    # fix artist names w/ these
    repl = {
        "á|ạ|à|ả|ã|ă|ắ|ặ|ằ|ẳ|ẵ|â|ấ|ậ|ầ|ẩ|ẫ": "a",
        "é|ẹ|è|ẻ|ẽ|ê|ế|ệ|ề|ể|ễ|ë": "e",
        "í|ị|ì|ỉ|ĩ": "i",
        "ñ":"n",
        "ó|ọ|ò|ỏ|õ|ô|ố|ộ|ồ|ổ|ỗ|ơ|ớ|ợ|ờ|ỡ": "o",
        "ú|ụ|ù|ủ|ũ|ư|ứ|ự|ừ|ử|ữ": "u",
        "ý|ỵ|ỳ|ỷ|ỹ": "y",
        "\’":"\'",
        "\$": "\\\\$"
    }
    
    base_link = "https://genius.com"
    content = requests.get(base_link + snip).content
    
    
    #############################################
    
    # get genre
    
    
    genre_sub = {"\"": " ", 
                 "genius": "", 
                 "\s+,": ",",
                 "&quot;": "",
                 "&amp": "&",
                 "&:": "&"}
    genre_regex = "genres&quot;:\[.+?\]"
        
    try:
        genre = re.search("\[.+?\]", re.search(genre_regex, str(content), flags = re.I | re.S).group()).group()[1:-1]
        for k in genre_sub:
            genre = re.sub(k, genre_sub[k], genre, flags = re.I)
    except AttributeError:
        genre = ""
        
    data = BeautifulSoup(content, "lxml").get_text()
    
    ##########################################
    
    # get all writers (expensive search)
    
    meta_data = content.decode("utf8")
    
    start_meta_regex = "verified_lyrics_by.*?writer_artists&quot"
    stop_meta_regex = "itemprop=\"page_data\""
    write_regex = "https://genius.com/artists/[^&]*"

    start = re.search(start_meta_regex, meta_data, flags = re.I | re.S).end()
    stop = re.search(stop_meta_regex, meta_data, flags = re.I | re.S).end()
    meta_data = meta_data[start:stop]

    write_links = re.findall(write_regex, meta_data)
    write = [re.sub("https://genius.com/artists/", "", x) for x in write_links]
    write = ", ".join([re.sub("-", " ", x) for x in write])
    
    
    ##########################################
    
    #subset data to search for lyrics
    
    genre_sub = {"\"": " ", "genius": "", "\s+,": ","}
    lyric_regex = row["Name"] + ".{0,20}?Lyrics.+?More on Genius"
    genre_regex = "genres\":\[.+?\]"
    track_info_regex = "\"{}".format(row["Name"]) + ".{0,20}?\".{0,20}?track info.+?remixed by"
    track_info_regex2 = "\"{}".format(row["Name"]) + ".{0,20}?\".{0,20}?track info.+?cover by"
    date_regex = "release date.{0,15}?20[0-9][0-9]"
    write_regex = "Written By\n+.+?\n"
    
    
    # get lyrics
    #print(data)
    data = data[re.search(base_link + snip, data).end():]
    for i in repl:
        data = re.sub(i, repl[i], data, flags = re.I | re.S)
    
    lyrics = re.search(lyric_regex, data, flags = re.I | re.S).group()[:-15]
    
    #get date
    try:
        date = re.split("\n+", re.search(date_regex, data, flags = re.I | re.S).group())[-1]
    except AttributeError:
        date = ""
        
    return date, genre, write, lyrics

In [9]:
dates = []
genre = []
write = []
lyrics = []
checker = {}

for i in range(0, len(df)):
    
    art = df.iloc[i]["Artists"]
    name = df.iloc[i]["Name"]
    
    # haven't scraped the song lyrics yet
    
    if art + name not in checker:
        try:
            
            temp = get_lyric_link(art, name)
            checker.update({art + name: i})
            d, g, w, l = scrape_lyrics(df.iloc[i], temp)
            
        except Exception:
            
            #rough hack to just skip for now
            #that way we can view all the errors in one go!!
            #i need to sleep...
            #print(i)
            #traceback.print_exc()
            dates.append("")
            genre.append("")
            write.append("")
            lyrics.append("")
            continue
            
        dates.append(d)
        genre.append(g)
        write.append(w)
        lyrics.append(l)
    
    # already scraped song lyrics
    
    else:
        
        num = checker[art + name]
        d, g, w, l = dates[num], genre[num], write[num], lyrics[num]
        
        dates.append(d)
        genre.append(g)
        write.append(w)
        lyrics.append(l)        
    
    if i % 500 == 0:
        print("We are {}% done".format(round(i / len(df) * 100, 4)))

We are 0.0% done
We are 0.4811% done
We are 0.9623% done
We are 1.4434% done
We are 1.9246% done
We are 2.4057% done
We are 2.8869% done
We are 3.368% done
We are 3.8492% done
We are 4.3303% done
We are 4.8115% done
We are 5.2926% done
We are 5.7738% done
We are 6.2549% done
We are 6.7361% done
We are 7.2172% done
We are 7.6984% done
We are 8.1795% done
We are 8.6607% done
We are 9.1418% done
We are 9.623% done
We are 10.1041% done
We are 10.5853% done
We are 11.0664% done
We are 11.5476% done
We are 12.0287% done
We are 12.5099% done
We are 12.991% done
We are 13.4722% done
We are 13.9533% done
We are 14.4345% done
We are 14.9156% done
We are 15.3968% done
We are 15.8779% done
We are 16.3591% done
We are 16.8402% done
We are 17.3213% done
We are 17.8025% done
We are 18.2836% done
We are 18.7648% done
We are 19.2459% done
We are 19.7271% done
We are 20.2082% done
We are 20.6894% done
We are 21.1705% done
We are 21.6517% done
We are 22.1328% done
We are 22.614% done
We are 23.0951% done

In [10]:
sum(True for x in lyrics if x == "")/len(lyrics)

0.06440655131930946

In [11]:
df["Date"] = dates
df["Genre"] = genre
df["Writing Credits"] = write
df["Lyrics"] = lyrics
df.to_csv("FrostData/Weekly_data.csv", header = True, index = False)

In [12]:
df.iloc[108]

Artists                                                       DaBaby
Name                                                            Suge
Weekly rank                                                        9
Peak position                                                      8
Weeks on chart                                                    12
Week                                                      2019-06-29
Date                                                   March 1, 2019
Genre                                           Trap,East Coast,Rap 
Writing Credits                       Jetsonmade, Pooh beatz, Dababy
Lyrics             Suge Lyrics\n\n\n\n[Intro]\nPooh, you a fool f...
Name: 108, dtype: object

Debug log:
 - Te Bote: accent error
 - Tiesto: accent error
 - Logic, Ryan Tedder: just checked hot links

In [13]:
base_link = "https://genius.com"
content = requests.get(base_link + "/Nio-garcia-casper-magico-and-bad-bunny-te-bote-remix-lyrics").content
data = BeautifulSoup(content, "lxml").get_text()

In [14]:
df.iloc[97]

Artists                         Megan Thee Stallion Featuring DaBaby
Name                                                       Cash Shit
Weekly rank                                                       98
Peak position                                                    NaN
Weeks on chart                                                   NaN
Week                                                      2019-07-06
Date                                                    May 17, 2019
Genre                                                East Coast,Rap 
Writing Credits                          Dababy, Megan thee stallion
Lyrics             Cash Shit Lyrics\n\n\n\n[Intro: Megan Thee Sta...
Name: 97, dtype: object