In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
import re
from nltk.stem import SnowballStemmer
from autocorrect import Speller
from collections import Counter 
import spacy
from datetime import datetime
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import unicodedata

In [2]:
df = pd.DataFrame(pd.read_excel("../data/Hot 100 Audio Features.xlsx"))
df_hotstuff = pd.DataFrame(pd.read_csv("../data/Hot Stuff.csv"))

# drop songs without genres
df.dropna(subset=['spotify_genre'], inplace=True)

# get songs with rap genre
df_rap = pd.DataFrame()
for index, row in df.iterrows():
    genres = df.spotify_genre.squeeze()[index]
    if 'rap' in genres:
        df_rap = df_rap.append(row)

# drop duplicate songs (songs with same songID)
df_rap = df_rap.drop_duplicates(subset=['SongID'], keep='first')

# merge df_rap and df_hotstuff to get weekID
df_merge = pd.merge(df_rap, df_hotstuff, how='left')

# drop songs with no weekID
df_merge.dropna(subset=['WeekID'], inplace=True)

# drop duplicate songs (songs with same songID)
df_merge = df_merge.drop_duplicates(subset=['SongID'],keep='first')

# get years
years = []
for index, row in df_merge.iterrows():
    weekID = df_merge.WeekID[index]
    year = datetime.strptime(weekID, "%m/%d/%Y").year
    years.append(year)
df_merge['Year'] = years

# set up final dataframe with year, performer, and song
df_final = df_merge[['Year', 'Performer', 'Song']]
df_final = df_final.reset_index(drop=True)

df_final

Unnamed: 0,Year,Performer,Song
0,2019,Post Malone Featuring DaBaby,Enemies
1,2019,"Yella Beezy, Gucci Mane & Quavo",Bacc At It Again
2,2019,DaBaby,VIBEZ
3,2019,NF,When I Grow Up
4,2019,Post Malone,Hollywood's Bleeding
...,...,...,...
3589,2016,Chris Brown,Zero
3590,2018,Kodak Black Featuring Travis Scott & Offset,ZEZE
3591,2017,Future,Zoom
3592,2006,Lil' Boosie Featuring Yung Joc,Zoom


In [163]:
#df_final[df_final['Song'].str.contains(r".*[*].*[*]")]
df_final[df_final['Song'].str.contains("\$")]

Unnamed: 0,Year,Performer,Song
92,2019,Lil Tecca,Ran$om
935,2011,Big Sean Featuring Nicki Minaj,Dance (A$$)
1264,2016,Fergie,M.I.L.F. $
1327,1995,Bone Thugs-N-Harmony Featuring Eazy-E,Foe Tha Love Of $
1436,2015,A$AP Rocky,L$D
1832,2018,A$AP Rocky Featuring Moby,A$AP Forever
1900,2015,Lil Dicky Featuring Fetty Wap & Rich Homie Quan,$ave Dat Money
2433,2013,A$AP Rocky,Long Live A$AP
3063,2015,Future,Rich $ex


In [60]:
# strip accents from text
# ex. beyoncé --> beyonce
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

In [309]:
def get_url(song_title, artist_name):
    # print("Searching for: ", song_title, "-", artist_name)
    
    # get song title and artist
    # convert to lowercase, remove non-alphanumeric characters
    
    title = re.sub(r'[^a-zA-Z0-9-$() ]', '', song_title.lower())
    #print("Title:", title)
    
    title_simple = re.sub(r'[^a-zA-Z0-9- ]', '', title)
    #print(title_simple)
    
    # title with no parentheses
    title_noparen = re.sub(r'\([^)]*\)', '', title)
    #print("Title no paren:", title_noparen)
    
    # replace dollar signs with s's
    title_nodollar = title.replace("$", "s")
    #print(title_nodollar)
    
    artist = re.sub(r'[^a-zA-Z0-9-$() ]', '', artist_name.lower())
    #print("Artist:", artist)
    
    artist_simple = re.sub(r'[^a-zA-Z0-9- ]', '', artist)
    #print(artist_simple)
    
    artist_nodollar = artist.replace("$", "s")
    #print(artist_nodollar)
    
    artist_split = artist.split()
    #print("Artist Split:", artist_split)
    
    # main artist
    if 'featuring' in artist:
        artist_nofeat = artist.split('featuring')[0]
    elif ',' in artist:
        artist_nofeat = artist.split(',')[0]
    else:
        artist_nofeat = artist 
    #print("Artist No Feat:", artist_nofeat)
    
    # set up request
    headers = {'Authorization': 'Bearer ' + 'zZ6YtjOlYsm1o5Me_vIO6MczexIf6k5PGlgiMHi4aO6bnZmsyVdG7J7YQ0VXIOHE'}
    data = {'q': title_noparen + ' ' + artist_nofeat}
    base_url = 'https://api.genius.com'
    search_url = base_url + '/search'
    
    current_page = 1 # page number of results
    next_page = True
    
    while next_page:
        params = {'page': current_page} # set page number
        response = requests.get(search_url, data=data, headers=headers, params=params)
        d = response.json()
        page_hits = d['response']['hits']
        
        # if there are hits on the page
        if page_hits:
            # go through all hits
            for hit in page_hits:
                res = hit['result']
                
                # name of primary artist
                name = res['primary_artist']['name']
                name = strip_accents(name)
                name = re.sub(r'[^a-zA-Z0-9- ]', '', name.lower())
                #print("Name:",name)
                
                full_title = res['full_title']
                full_title = strip_accents(full_title)
                # convert full_title to lowercase and remove non-alphanumeric characters
                full_title = re.sub(r'[^a-zA-Z0-9- ]', '', full_title.lower())
                #print("Full Title:", full_title)
                
                if (
                    # 'lyrics' substring is in url
                    'lyrics' in res['url'] and
                     # song title (w/ or w/o parentheses) is in full title
                    (title in full_title or 
                     title_noparen in full_title or
                     title_nodollar in full_title or
                     title_simple in full_title
                    ) and
                    # 1st or 2nd word in artist is in full title or 
                    # main artist (no features) is in full title or name
                    (artist_nofeat in full_title or 
                     artist_nofeat in name or
                     artist_split[0] in full_title or
                     (len(artist_split) > 1 and artist_split[1] in full_title) or
                     artist_nodollar in full_title or
                     artist_nodollar in name or
                     artist_simple in full_title or
                     artist_simple in name
                    ) and
                    # 1st or 2nd word in artist is in name from response
                    (artist_split[0] in name or
                     (len(artist_split) > 1 and artist_split[1] in name)
                    ) and
                    # song is not a translation
                    'espanol' not in full_title and
                    'nederlandse' not in full_title and
                    'polskie' not in full_title and
                    'portugues' not in full_title and
                    'francaise' not in full_title and
                    'deutsche' not in full_title and
                    'oversttelse' not in full_title and
                    'traduzione' not in full_title and
                    'ceviri' not in full_title and
                    'translation' not in full_title and
                    # song is not a review by rap critic
                    'rap critic' not in full_title and
                    # song is not instrumental
                    'instrumental' not in full_title and
                    # song is not a parody
                    'parody' not in full_title
                ):
                    url = res['url']
                    # print("URL found: ", url)
                    return url
                    
            # increment current_page value for next loop
            current_page += 1
            # print("Finished scraping page {}".format(current_page))
            
            # if lyrics not on first 10 pages, stop
            if (current_page == 10):
                next_page = False
        else:
            # if page_hits is empty, stop
            next_page = False
        
    return 0

In [310]:
#get_url("I Gotta Feeling", "The Black Eyed Peas")
#get_url("Sunflower (Spider-Man: Into The Spider-Verse)", "Post Malone & Swae Lee")
#get_url('C U When U Get There (From "Nothing To Lose")', 'Coolio Featuring 40 Thevz')
#get_url("Loyal", "Chris Brown")
#get_url('Freaks', 'Play-N-Skillz Featuring Krayzie Bone & Adina Howard')
#get_url('1st Of Tha Month', 'Bone Thugs-N-Harmony')
#get_url('D.O.A. (Death of Auto-Tune)', 'Jay-Z')
#get_url('Sucker For Pain', 'Lil Wayne, Wiz Khalifa & Imagine Dragons With Logic & Ty Dolla $ign Feat. X Ambassadors')
#get_url('She\'s Mine Pt.1', 'J. Cole')
#get_url('Hot Girl Summer', 'Megan Thee Stallion, Nicki Minaj & Ty Dolla $ign')
#get_url('Pills And Automobiles', 'Chris Brown Featuring Yo Gotti, A Boogie Wit da Hoodie & Kodak Black')
#get_url('Chill Bill', 'Rob $tone')
#get_url('Ran$om', 'Lil Tecca')
#get_url('M.I.L.F. $', 'Fergie')
#get_url('Money In The Ghetto', 'Too $hort')
#get_url('Mood 4 Eva', 'Beyonce, JAY-Z & Childish Gambino Featuring Oumou Sangare')
#get_url('Go Loko', 'YG, Tyga & Jon Z')
#get_url('Boyz-N-Tha Hood', 'Eazy-E')
#get_url('It\'s All The Way Live (Now) (From "Eddie")', 'Coolio')
#get_url('Don\'t Wanna Be A Player (From "Booty Call")', 'Joe')
#get_url('T.H.E (The Hardest Ever)', 'will.i.am Featuring Mick Jagger & Jennifer Lopez')
#get_url('I Don\'t Get Tired (#IDGT)', 'Kevin Gates Featuring August Alsina')
#get_url('Sittin\' Up In My Room (From "Waiting To Exhale")', 'Brandy')
#get_url('Saint-Tropez', 'Post Malone')
get_url('L$D', 'A$AP Rocky')

'https://genius.com/A-ap-rocky-l-d-lyrics'

In [311]:
def get_lyrics(song_title, artist_name):
    url = get_url(song_title, artist_name)
    if url == 0:
        print("Lyrics not found for", song_title, "-", artist_name)
        return np.NaN
    else:
        page = requests.get(url)
        html = BeautifulSoup(page.text, 'html.parser')
        lyrics = html.find('div', class_='lyrics').get_text()
        return lyrics

In [312]:
lyrics_list = []

for i, row in df_final.iterrows():
    if (i % 10 == 0): 
        print (str(round(i/len(df_final) * 100, 2)) + '% done')
    artist = row['Performer']
    song = row['Song']
    lyrics = get_lyrics(song, artist)
    # print(lyrics)
    lyrics_list.append(lyrics)

0.0% done
0.28% done
0.56% done
0.83% done
1.11% done
1.39% done
1.67% done
1.95% done
2.23% done
2.5% done
Lyrics not found for 223's - YNW Melly & 9lokknine
2.78% done
3.06% done
3.34% done
3.62% done
3.9% done
4.17% done
4.45% done
4.73% done
5.01% done
5.29% done
Lyrics not found for How About Now - Drake
5.56% done
5.84% done
6.12% done
6.4% done
Lyrics not found for Apes**t - The Carters
6.68% done
6.96% done
Lyrics not found for Dear Mama/Old School - 2Pac
7.23% done
7.51% done
Lyrics not found for Move B***h - Ludacris Featuring Mystikal & Infamous 2.0
7.79% done
Lyrics not found for From Her Mama (Mama Got A**) - Juvenile
Lyrics not found for Envy/Firewater - Fat Joe
8.07% done
8.35% done
8.63% done
8.9% done
9.18% done
9.46% done
Lyrics not found for Sexy Lady - Yung Berg Featuring Junior
9.74% done
10.02% done
10.29% done
Lyrics not found for She's Mine Pt.1 - J. Cole
10.57% done
10.85% done
11.13% done
11.41% done
Lyrics not found for 3am - Eminem
11.69% done
11.96% done
12

65.94% done
66.22% done
66.5% done
66.78% done
67.06% done
67.33% done
Lyrics not found for I Am The Champion - B.o.B
67.61% done
67.89% done
68.17% done
68.45% done
68.73% done
69.0% done
Lyrics not found for Now & Later - Sage The Gemini
69.28% done
Lyrics not found for My Choppa Hate N****s - 21 Savage & Metro Boomin
69.56% done
69.84% done
Lyrics not found for Ice Tray - Quavo & Lil Yachty
70.12% done
70.4% done
70.67% done
70.95% done
Lyrics not found for Start This S**t Off Right - Lil Wayne Featuring Ashanti & Mack Maine
71.23% done
Lyrics not found for Request Line - The Black Eyed Peas Featuring Macy Gray
71.51% done
71.79% done
72.06% done
Lyrics not found for B-Please - Snoop Dogg Featuring Xzibit & Nate Dogg
72.34% done
72.62% done
Lyrics not found for All Through The Night - Tone-Loc
72.9% done
73.18% done
73.46% done
73.73% done
Lyrics not found for Real Wild Child - Ivan
Lyrics not found for The 81 - Candy & The Kisses
74.01% done
74.29% done
Lyrics not found for Ryde Or

In [313]:
df_final['Lyrics'] = lyrics_list

In [317]:
df_final[df_final['Lyrics'].isnull()]

df_final.to_csv("Hot100DataWithNanLyrics.csv")

In [318]:
df_final.shape

(3594, 4)

In [319]:
# drop rows with no lyrics
df_final.dropna(subset=['Lyrics'], inplace=True)

# export to csv
df_final.to_csv("Hot100Data.csv")

In [320]:
df_final.shape

(3447, 4)

In [321]:
print(3594-3447)

147
