# Data Scraping Notebook

## Sample data-scraping code for the Genius dataset.

In [1]:
import multiprocessing
import os
import pickle
import re
import urllib.request
from collections import Counter
from collections import defaultdict
from time import sleep

import lyricsgenius
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from Levenshtein import ratio

In [2]:
# Load the chart data and collect every distinct (Performer, Song) combination.
BB = pd.read_csv("../data/Hot Stuff.csv")
# Grouping by both columns yields one index entry (a tuple) per unique pair.
BB_artist_song_pair = BB.groupby(["Performer", "Song"]).count().index.tolist()
print("Total Artist-Song Pair:  {}".format(len(BB_artist_song_pair)))

Total Artist-Song Pair:  27949


### Removing "featuring" credits and parentheses for more precise scraping

In [3]:
def reformat_aritst_song(artist, song):
    """Reduce an artist credit to its lead artist for a more precise search.

    Everything from the first collaboration marker onwards is dropped. The
    markers are the words "Featuring", "featuring", "With", "with", the
    abbreviation "Feat.", an ampersand, or an opening parenthesis.

    The word markers are matched as whole words only, so a name that merely
    contains one of them (e.g. "Bill Withers") is left untouched instead of
    being cut in the middle.

    Parameters
    ----------
    artist : str
        Artist credit as it appears in the chart data.
    song : str
        Song title; returned unchanged.

    Returns
    -------
    list
        ``[artist, song]`` with the artist truncated (and stripped) when a
        marker was found.
    """
    # A single search finds the earliest marker; the prefix before it is the
    # lead artist.
    marker = re.search(r"\b(?:Featuring|featuring|With|with)\b|\bFeat\.|&|\(", artist)
    if marker:
        artist = artist[:marker.start()].strip()
    return [artist, song]

### Using the Genius API to scrape the lyrics given a song name and an artist name

In [4]:
# Here we are using the lyricsgenius package, a Genius API client from GitHub.
# The API token is read from the environment so that no credential is stored
# in the notebook. Any token that was previously committed here should be
# considered leaked and revoked.
GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN")
if not GENIUS_ACCESS_TOKEN:
    raise RuntimeError("Set the GENIUS_ACCESS_TOKEN environment variable before running this notebook")
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN, verbose=False)

In [5]:
def Genius_scrape(ArtSongPair, max_retries=5, retry_delay=60):
    """Look up one (artist, song) pair on Genius.

    The artist is first simplified with ``reformat_aritst_song`` and the
    search is run through the module-level ``genius`` client.

    Parameters
    ----------
    ArtSongPair : sequence
        ``(artist, song)`` as found in the chart data.
    max_retries : int, optional
        Maximum number of search attempts before giving up.
    retry_delay : int or float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    list
        ``[song_obj, artist, song]`` when a song was found, otherwise
        ``[artist, song, None]``. Callers tell the two shapes apart by
        checking whether the first element is a string.

    Notes
    -----
    Failed attempts are retried in a bounded loop rather than by recursion,
    so a persistent error cannot retry forever. When every attempt fails the
    pair is reported as missing.
    """
    a, s = ArtSongPair
    a_reformatted, s_reformatted = reformat_aritst_song(a, s)
    for attempt in range(1, max_retries + 1):
        try:
            song = genius.search_song(s_reformatted, a_reformatted)
        except Exception as e:
            print(e)
            print("Oof chill. Too many requests")
            # Back off before the next attempt; no point sleeping after the last one.
            if attempt < max_retries:
                sleep(retry_delay)
            continue
        if song is None:
            return [a, s, None]
        return [song, a, s]
    # Every attempt raised: treat the pair as not found.
    return [a, s, None]

In [6]:
# Quick sanity check on a single well-known song.
val = Genius_scrape(("Adele", "Hello"))
hit = val[0]
# A string in the first slot means the lookup came back empty.
if type(hit) is not str:
    print("Artist: ", hit.artist)
    print("Song: ", hit.title)
    print()
    first_five_lines = hit.lyrics.split("\n")[:5]
    print("\n".join(first_five_lines))

Artist:  Adele
Song:  Hello

[Verse 1]
Hello, it's me
I was wondering if after all these years you'd like to meet
To go over everything
They say that time's supposed to heal ya, but I ain't done much healing


### Using multiprocessing to speed up scraping 

In [8]:
# Scrape in parallel with a multiprocessing pool to speed things up.
# Only the first 100 artist-song pairs are scraped here.
# Multiprocessing can be unstable; if this cell fails, try re-running it.
art_song_lyrics = defaultdict(dict)
missing_art_song_lyrics = defaultdict(set)
with multiprocessing.Pool(processes=16) as pool:
    try:
        count = 0
        found = 0
        missing = 0
        results = pool.imap_unordered(Genius_scrape, BB_artist_song_pair[:100])
        for first, second, third in results:
            if type(first) is str:
                # Miss: the worker returned [artist, song, None].
                missing_art_song_lyrics[first].add(second)
                missing += 1
            else:
                # Hit: the worker returned [song_obj, artist, song].
                art_song_lyrics[second][third] = {"lyrics": first.lyrics, "url": first.url, "obj": first}
                found += 1
            count += 1
            if count % 1000 == 0:
                print("Till {}:    Found: {}  Missing: {}".format(count, found, missing))
    except Exception as e:
        print(e)
    pool.terminate()

In [10]:
# Inspect one scraped entry: its lyrics text, its Genius URL and the song object.
art_song_lyrics['"Groove" Holmes']["Misty"]

{'lyrics': '(Instrumental)',
 'url': 'https://genius.com/Richard-groove-holmes-misty-lyrics',
 'obj': ('Misty', 'Richard “Groove” Holmes')}

In [11]:
# Flatten both nested mappings into [artist, song] pairs.
original_found_art_song_pair = [
    [artist, title]
    for artist, songs in art_song_lyrics.items()
    for title in songs
]
original_missing_art_song_pair = [
    [artist, title]
    for artist, titles in missing_art_song_lyrics.items()
    for title in titles
]
n_found = len(original_found_art_song_pair)
n_missing = len(original_missing_art_song_pair)
print("Total:  {}    No.Found:  {},  No.Missing:  {}".format(100, n_found, n_missing))

Total:  100    No.Found:  100,  No.Missing:  0


### Verifying the scraped data

In [12]:
# An example where the artist name does not match the scraped data artist name

In [13]:
# Show the chart artist next to the artist attached to the scraped result.
example_artist = '"Groove" Holmes'
print("Original Artist Name: ", example_artist)
scraped_song = art_song_lyrics[example_artist]["What Now My Love"]["obj"]
print("Scraped Artist Name: ", scraped_song.artist)

Original Artist Name:  "Groove" Holmes
Scraped Artist Name:  Marcel Proust


In [14]:
# Use the Levenshtein ratio score to compare the scraped artist name / song name with the originals. Keep the pairs where both scores are > 0.85

In [15]:
# Split the scraped entries into verified / unverified by comparing the
# scraped artist and title (lower-cased) with the names we searched for.
genius_verify_art_song_found = defaultdict(dict)
genius_verify_art_song_missing = defaultdict(dict)
for artist, songs in art_song_lyrics.items():
    for title, entry in songs.items():
        scraped = entry["obj"]
        # The title is only compared when the artist already matches.
        is_match = (
            ratio(scraped.artist.lower(), artist.lower()) > 0.85
            and ratio(scraped.title.lower(), title.lower()) > 0.85
        )
        bucket = genius_verify_art_song_found if is_match else genius_verify_art_song_missing
        # Store a shallow copy so the buckets do not alias the scrape results.
        bucket[artist][title] = dict(entry)

In [16]:
# Flatten the verified / unverified mappings into (artist, song) tuples.
verified_art_song_pair = [
    (artist, title)
    for artist, songs in genius_verify_art_song_found.items()
    for title in songs
]
not_verified_art_song_pair = [
    (artist, title)
    for artist, songs in genius_verify_art_song_missing.items()
    for title in songs
]
# Anything never found plus anything that failed verification counts as missing.
total_missing_art_song_pair = original_missing_art_song_pair + not_verified_art_song_pair
n_verified = len(verified_art_song_pair)
n_total_missing = len(total_missing_art_song_pair)
print("Total:  {}    No.Found/Verified:  {},  No.Missing/Not Verified:  {}".format(100, n_verified, n_total_missing))

Total:  100    No.Found/Verified:  48,  No.Missing/Not Verified:  52


### Then saving all of the verified artist-song pairs into a CSV file

In [17]:
# Lemmatizer to standardize the words
from pattern.en import lemma
def Tokenizer(word):
    """Normalize a single word by stripping apostrophe endings and lemmatizing.

    The word is cleaned and passed through ``lemma`` repeatedly until another
    pass no longer changes it; that stable form is returned.
    """
    def get_rid_of_apostrophe(s):
        # A trailing "in'" is expanded to "ing".
        if s.endswith("in'"):
            s = s[:-3] + "ing"
        # Drop a trailing "'s" or a doubled apostrophe.
        if s.endswith(("'s", "''")):
            s = s[:-2]
        # Drop a single trailing apostrophe.
        if s.endswith("'"):
            s = s[:-1]
        return s

    current = lemma(get_rid_of_apostrophe(word))
    while True:
        following = lemma(get_rid_of_apostrophe(current))
        if following == current:
            return current
        current = following

In [18]:
def list_of_words(ly):
    """Split raw lyrics into lower-cased word tokens.

    The text is lower-cased and then split on HTML entities (``&amp;``,
    ``&#39;`` ...), ``<...>`` tags, ``[...]`` section markers, whitespace, and
    any character that is not a word character, apostrophe, backslash or
    hyphen. Empty pieces are discarded.

    The entity pattern only matches a real entity (``&`` + optional ``#`` +
    word characters + ``;``). A bare ampersand is an ordinary separator, so
    it no longer swallows all text up to the next semicolon.

    Parameters
    ----------
    ly : str
        Raw lyrics text.

    Returns
    -------
    list of str
        Tokens in order of appearance.
    """
    pieces = re.split(r"&#?\w+;|<[\s\S]*?>|\[[\s\S]*?\]|[^\w'\\-]|\s", ly.lower())
    # Joining and re-splitting on whitespace drops the empty pieces left by
    # adjacent separators.
    return " ".join(pieces).split()

In [19]:
# Assemble the Genius table, one row per verified (artist, song) pair.
df = pd.DataFrame()
df["artist"] = [artist for artist, _ in verified_art_song_pair]
df["song"] = [title for _, title in verified_art_song_pair]
df["genius_raw"] = [
    genius_verify_art_song_found[artist][title]["lyrics"]
    for artist, title in verified_art_song_pair
]
# Word frequencies over the normalized tokens of each lyric.
df["genius_counter"] = [
    Counter(Tokenizer(token) for token in list_of_words(lyrics))
    for lyrics in df["genius_raw"]
]
# Distinct normalized words, in order of first appearance.
df["genius_bag"] = [list(word_counts) for word_counts in df["genius_counter"]]

In [20]:
# Preview the first rows of the Genius lyrics table built above.
df.head()

Unnamed: 0,artist,song,genius_raw,genius_counter,genius_bag
0,"""Weird Al"" Yankovic",Like A Surgeon,I finally made it through med school\nSomehow ...,"{'i': 7, 'finally': 1, 'make': 3, 'it': 3, 'th...","[i, finally, make, it, through, med, school, s..."
1,"""Weird Al"" Yankovic",Ricky,"[Ricky]\nHey Lucy, I'm home\n\n[Lucy]\nOh Rick...","{'hey': 10, 'lucy': 17, 'i'm': 5, 'home': 1, '...","[hey, lucy, i'm, home, oh, ricky, you're, so, ..."
2,"""Weird Al"" Yankovic",Smells Like Nirvana,[Verse 1]\nWhat is this song all about\nCan't ...,"{'what': 7, 'be': 4, 'thi': 2, 'song': 1, 'all...","[what, be, thi, song, all, about, can, figure,..."
3,"""Weird Al"" Yankovic",Fat,"Your butt is wide, well mine is too\nJust watc...","{'your': 3, 'butt': 1, 'be': 3, 'wide': 1, 'we...","[your, butt, be, wide, well, mine, too, just, ..."
4,"""Weird Al"" Yankovic",King Of Suede,[Verse 1:]\nThere's a sale on our gabardine su...,"{'there': 15, 'a': 9, 'sale': 5, 'on': 7, 'our...","[there, a, sale, on, our, gabardine, suit, tod..."


### After joining in the AZLyrics & LyricsAZ data, we obtain the following dataset

In [21]:
# Load the pre-built master table. The file is opened in a context manager so
# the handle is closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open("../data/lyrics_master.pickle", "rb") as pickle_file:
    lyrics_master = pickle.load(pickle_file)
lyrics_master.head()

Unnamed: 0,song,artist,genius_raw,genius_bag,az_lyrics_raw,az_lyrics_bag,lyrics_az_raw,lyrics_az_bag,GvAZL,GvLAZ,AZLvLAZ,genius_counter,az_lyrics_counter,lyrics_az_counter,word_intersect,word_intersect_counter
0,sauce it up,lil uzi vert,"[Intro]\nSauce it up, ayy, sauce it up, ayy\nS...","[sauce, it, up, ayy, sauce, it, up, ayy, sauce...","Sauce it up, ayy, sauce it up, ayy<br>\nSauce ...","[sauce, it, up, ayy, sauce, it, up, ayy, sauce...",[Intro]<br />\nSauce it up (aye)<br />\nSauce ...,"[sauce, it, up, aye, sauce, it, up, aye, sauce...",0.981573,0.852666,0.864487,"{'sauce': 18, 'it': 32, 'up': 36, 'ayy': 7, 'g...","{'sauce': 18, 'it': 32, 'up': 36, 'ayy': 7, 'i...","{'sauce': 12, 'it': 28, 'up': 24, 'aye': 6, 'g...","[let, okay, say, on, harley, brrr, woo, cardi,...","{'let': 2.0, 'okay': 1.0, 'say': 2.0, 'on': 4...."
1,backroad song,granger smith,Barbed wire fence carving out a hillside\nCutt...,"[barb, wire, fence, carve, out, a, hillside, c...","Barbed wire fence carving out a hillside, cutt...","[barb, wire, fence, carve, out, a, hillside, c...","Barbed wire fence carving out a hillside, cutt...","[barb, wire, fence, carve, out, a, hillside, c...",0.996372,0.997579,0.998789,"{'barb': 1, 'wire': 1, 'fence': 1, 'carve': 1,...","{'barb': 1, 'wire': 1, 'fence': 1, 'carve': 1,...","{'barb': 1, 'wire': 1, 'fence': 1, 'carve': 1,...","[let, feel, we're, today, grey, on, song, hear...","{'let': 1.0, 'feel': 11.0, 'we're': 1.0, 'toda..."
2,feelin' satisfied,boston,[Verse 1]\nWell come on\nAll you people\nThe t...,"[well, come, on, all, you, people, the, time, ...",Well come on<br>\nAll you people<br>\nThe time...,"[well, come, on, all, you, people, the, time, ...",Well come on<br />\nAll you people<br />\nThe ...,"[well, come, on, all, you, people, the, time, ...",0.949709,0.946203,0.996477,"{'well': 2, 'come': 9, 'on': 9, 'all': 1, 'you...","{'well': 2, 'come': 10, 'on': 11, 'all': 1, 'y...","{'well': 2, 'come': 10, 'on': 11, 'all': 1, 'y...","[people, let, can, alright, satisfy, feel, giv...","{'people': 1.0, 'let': 8.0, 'can': 2.0, 'alrig..."
3,saturday morning confusion,bobby russell,"Here they come, warming up\nI hear the pitter ...","[here, they, come, warm, up, i, hear, the, pit...",,,"Here they come, warming up<br />\nI hear the p...","[here, they, come, warm, up, i, hear, the, pit...",0.0,0.999033,0.0,"{'here': 3, 'they': 3, 'come': 3, 'warm': 2, '...",{},"{'here': 3, 'they': 3, 'come': 3, 'warm': 2, '...","[let, feel, dog, today, how, hear, on, come, r...","{'let': 2.0, 'feel': 1.0, 'dog': 4.0, 'today':..."
4,country man,luke bryan,"[Verse 1]\nYou need hands, rough not soft\nTo ...","[you, need, hand, rough, not, soft, to, come, ...","You need hands, rough not soft<br>\nTo come an...","[you, need, hand, rough, not, soft, to, come, ...","You need hands, rough not soft<br />\nTo come ...","[you, need, hand, rough, not, soft, to, come, ...",0.971121,0.967205,0.99122,"{'you': 7, 'need': 1, 'hand': 2, 'rough': 1, '...","{'you': 7, 'need': 1, 'hand': 2, 'rough': 1, '...","{'you': 7, 'need': 1, 'hand': 2, 'rough': 1, '...","[let, gator, tenni, move, come, way, muscle, t...","{'let': 1.0, 'gator': 1.0, 'tenni': 1.0, 'move..."


#### Since the AZLyrics and LyricsAZ scrapers are customized for each website, only the Genius scraping is presented here, for simplicity and readability. (The final dataframes for LyricsAZ and AZLyrics have the same structure as the Genius dataframe shown above.)