In [1]:
#2018 Notebook

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from   sklearn.feature_extraction.text import TfidfVectorizer
from   sklearn.feature_selection import SelectKBest, mutual_info_classif
from   sklearn.linear_model import LogisticRegression, LinearRegression
from   sklearn.model_selection import cross_val_score
from   sklearn.preprocessing import StandardScaler

In [3]:
def song_data(date=''):
    '''
    The Billboard Hot 100 chart represents the Hot 100 songs for that week.
    
    date: a string, in the form "YYYY-MM-DD". For example, "2022-05-16" represents May 16, 2022. If no date specified, function
          will select the present chart
    returns: a pandas dataframe containing metadata for Billboard Hot 100 songs of the week of the specified date.
    columns: rank: rank of the week (1-100)
             date: a pandas datetime object. date of the chart as stated on the Billboard website, 
             which uses the Saturday to identify the week (so it is the same week as the user input, but the Saturday
             of that week),
             title: title of the song,
             artist1: main artist,
             artist2: a list of the rest of the artists. np.nan if there are none.
             peak_pos: peak position of the song,
             wks_chart: # of weeks the song has been on the chart
             b_url: url to the billboard chart
    '''
    lsongs=[]
    lartists=[]
    artist1=[]
    artist2=[]
    lpeak_pos=[]
    lwks_chart=[]
    
    URL='https://www.billboard.com/charts/hot-100/'+date

    page=requests.get(URL)
    soup=BeautifulSoup(page.content, 'lxml')
  
    ### get the first song, bc it's in a different div container
    song1 = soup.find("h3",id='title-of-a-story', class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
    lsongs.append(song1.text.strip())
    
    ### get the first artist, bc it's in a different div container
    artistf=soup.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet")
    lartists.append(artistf.text.strip())
    
    ### get the first peak position, bc it's in a different div container
    nums=soup.find_all('span', class_="c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet")
    nums1=[]
    for x in nums:
        nums1.append(x.text.strip())
        
    lpeak_pos.append(nums1[1])
    ### get the first weeks on chart, bc it's in a different div container
    lwks_chart.append(nums1[2])
    
    ### get last 99 songs
    songs = soup.find_all("h3", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only", id="title-of-a-story")
    for song in songs:
        lsongs.append(song.text.strip())
    
    ### get last 99 artists
    artists = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only")
    for artist in artists:
        lartists.append(artist.text.strip())
        
    ### get last 99 peak position
    all_num=[]
    peak_pos = soup.find_all("span", class_="c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max")
    for num in peak_pos:
        all_num.append(num.text.strip())
    
    x=1
    for peak in all_num:
        if x <= len(all_num)-5:
            lpeak_pos.append(all_num[x])
            x=x+6

    ### get last 99 weeks on chart
    y=2
    for wk in all_num:
        if y <= len(all_num)-4:
            lwks_chart.append(all_num[y])
            y=y+6            
    
    ### get date as listed on the chart, aka the Saturday of the week of the user input
    date=soup.find('h2', id="section-heading")
    cdate=pd.to_datetime(date.text.strip().replace("Week of ",''))
    
    
    ### separate artists into artist1 and artist2
    for a in lartists:
        if ("X &" not in a) and ("X Featuring" not in a) and ("X /" not in a):
            a=a.replace(" X ",",")
        a=a.replace("Featuring",",")
        a=a.replace("&",",")   
        a=a.replace(" / ",",")
        List=a.split(",")
        artists = [i.strip() for i in List]
        artist1.append(artists[0])
        if len(artists)==1:
            artist2.append(np.nan)
        else:
            artist2.append(artists[1:])
    
    metadata=pd.DataFrame()
    metadata['rank']=(range(1,101)) ### get rank position
    metadata['date']=cdate
    metadata['title']=lsongs
    metadata['artist1']=artist1
    metadata['artist2']=artist2
    metadata['peak_pos']=lpeak_pos
    metadata['wks_chart']=lwks_chart
    metadata['b_url']=URL
    
    metadata=append_lyrics(metadata)
    metadata.reset_index(inplace=True, drop=True)

    return metadata

In [4]:
def append_lyrics(metadata):
    '''
    a helper function for song_data(). gets the song lyrics for a given song. appends the
    song lyrics for a song from Genius.com to a "lyrics" column.
    
    If the function cannot find the song on the Genius lyrics website, it will drop the entire observation from the dataset.
    
    metadata: a pandas dataframe, created from song_data(). at the least contains
              the title column and the artist1 column.
    returns: a pandas dataframe of the original dataframe with a lyrics column and URL to the
             Genius website the lyrics were taken from.
    '''
    all_lyrics=[]
    all_URL=[]
    title=metadata.title.values
    artist1=metadata.artist1.values
    for x in range(len(title)):
        t=title[x]
        a=artist1[x]
  
        t=re.sub(r'[^\w\s]', '', t)
        a=re.sub(r'[!$/]', '-', a)
        a=re.sub(r'["\\#%&;\()*\[\]+,.:;<=>?@^_`{|}~]', '', a) #[\\]
        URL= "https://www.genius.com/"+a.replace(' ','-')+'-'+t.replace(' ','-')+'-lyrics'
        URL=URL.replace('--','-')
        
        page=requests.get(URL)
        soup=BeautifulSoup(page.content, 'lxml')
        if 'Oops! Page not found' not in soup.text.strip():
            lyrics=soup.find_all('div', class_='Lyrics__Container-sc-1ynbvzw-6 jYfhrf')
            Lyrics = [re.sub(r"\[.*?\]",'',i.text.strip()) for i in lyrics]
            LYRICS=" ".join(Lyrics)
            all_lyrics.append(LYRICS)
            all_URL.append(URL)
        
        else: 
            #print(URL)
            metadata.drop([x], inplace=True)
    
    metadata['lyrics']=all_lyrics
    metadata['g_url']=all_URL
    return metadata

In [9]:
def valid_dates(datelist):
    '''
    a preprocessing function to check if a chart exists for a given date.
    e.g. https://www.billboard.com/charts/hot-100/2011-08-06/ does not have a chart, but https://www.billboard.com/charts/hot-100/2011-08-13/ does.
    removes broken links from a list of dates.
    i.e. if a Billboard chart does not exist for a date specified in a list, remove the date from the list.
    
    datelist: a list of strings, each dates in YYYY-MM-DD format.
    returns: a list of valid dates from the datelist.
    '''
    valid=[]
    for date in datelist:
        URL='https://www.billboard.com/charts/hot-100/'+date
        page=requests.get(URL)
        soup=BeautifulSoup(page.content, 'lxml')
        song1 = soup.find("h3",id='title-of-a-story', class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
        if song1 != None:
            valid.append(date)
    return valid

In [10]:
DATES=valid_dates(Dates)

In [12]:
len(Dates)

52

In [11]:
len(DATES)

52

In [14]:
dates=pd.date_range(start='2018-05-12',end='2018-12-31',freq='W-SAT')
print(len(dates))
print(dates)

34
DatetimeIndex(['2018-05-12', '2018-05-19', '2018-05-26', '2018-06-02',
               '2018-06-09', '2018-06-16', '2018-06-23', '2018-06-30',
               '2018-07-07', '2018-07-14', '2018-07-21', '2018-07-28',
               '2018-08-04', '2018-08-11', '2018-08-18', '2018-08-25',
               '2018-09-01', '2018-09-08', '2018-09-15', '2018-09-22',
               '2018-09-29', '2018-10-06', '2018-10-13', '2018-10-20',
               '2018-10-27', '2018-11-03', '2018-11-10', '2018-11-17',
               '2018-11-24', '2018-12-01', '2018-12-08', '2018-12-15',
               '2018-12-22', '2018-12-29'],
              dtype='datetime64[ns]', freq='W-SAT')


In [15]:
Dates=[date.strftime('%Y-%m-%d') for date in dates]

In [8]:
songs=[]
for date in Dates:
    songs.append(song_data(date))
    print(date)

2018-01-06
2018-01-13
2018-01-20
2018-01-27
2018-02-03
2018-02-10
2018-02-17
2018-02-24
2018-03-03
2018-03-10
2018-03-17
2018-03-24
2018-03-31
2018-04-07
2018-04-14
2018-04-21
2018-04-28
2018-05-05


AttributeError: 'NoneType' object has no attribute 'text'

In [20]:
df=pd.concat(songs)
df.reset_index(inplace=True,drop=True)
df.to_csv('songs2018.csv')

In [13]:
songs[-1]

Unnamed: 0,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,1,2018-05-05,Nice For What,Drake,,1,3,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Drake-Nice-For-What-lyrics
1,2,2018-05-05,God's Plan,Drake,,1,14,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Drake-Gods-Plan-lyrics
2,3,2018-05-05,No Tears Left To Cry,Ariana Grande,,3,1,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Ariana-Grande-No-Tears-...
3,4,2018-05-05,Meant To Be,Bebe Rexha,[Florida Georgia Line],2,27,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Bebe-Rexha-Meant-To-Be-...
4,5,2018-05-05,Psycho,Post Malone,[Ty Dolla $ign],2,9,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Post-Malone-Psycho-lyrics
...,...,...,...,...,...,...,...,...,...,...
95,96,2018-05-05,Cry Pretty,Carrie Underwood,,48,2,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Carrie-Underwood-Cry-Pr...
96,97,2018-05-05,When We,Tank,,78,17,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Tank-When-We-lyrics
97,98,2018-05-05,Close,Rae Sremmurd,[Travis Scott],98,1,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Rae-Sremmurd-Close-lyrics
98,99,2018-05-05,Thru Your Phone,Cardi B,,50,3,https://www.billboard.com/charts/hot-100/2018-...,,https://www.genius.com/Cardi-B-Thru-Your-Phone...


In [16]:
for date in Dates:
    songs.append(song_data(date))
    print(date)

2018-05-12
2018-05-19
2018-05-26
2018-06-02
2018-06-09
2018-06-16
2018-06-23
2018-06-30
2018-07-07
2018-07-14
2018-07-21
2018-07-28
2018-08-04
2018-08-11
2018-08-18
2018-08-25
2018-09-01
2018-09-08
2018-09-15
2018-09-22
2018-09-29
2018-10-06
2018-10-13
2018-10-20
2018-10-27
2018-11-03
2018-11-10
2018-11-17
2018-11-24
2018-12-01
2018-12-08
2018-12-15
2018-12-22
2018-12-29


In [17]:
len(songs)

52

In [19]:
df

Unnamed: 0,index,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,0,1,2018-01-06,Perfect,Ed Sheeran,,1,18,https://www.billboard.com/charts/hot-100/2018-...,"I found a love for meOh, darling, just dive ri...",https://www.genius.com/Ed-Sheeran-Perfect-lyrics
1,1,2,2018-01-06,Rockstar,Post Malone,[21 Savage],1,15,https://www.billboard.com/charts/hot-100/2018-...,"HahahahahaTank GodAyy, ayyI've been fuckin' ho...",https://www.genius.com/Post-Malone-Rockstar-ly...
2,2,3,2018-01-06,Havana,Camila Cabello,[Young Thug],2,20,https://www.billboard.com/charts/hot-100/2018-...,"HeyHavana, ooh na-na (Ayy)Half of my heart is ...",https://www.genius.com/Camila-Cabello-Havana-l...
3,3,4,2018-01-06,No Limit,G-Eazy,"[A$AP Rocky, Cardi B]",4,16,https://www.billboard.com/charts/hot-100/2018-...,"If I hit it one time, I'ma pipe herIf I hit it...",https://www.genius.com/G-Eazy-No-Limit-lyrics
4,4,5,2018-01-06,Thunder,Imagine Dragons,,4,35,https://www.billboard.com/charts/hot-100/2018-...,Just a young gun with a quick fuseI was uptigh...,https://www.genius.com/Imagine-Dragons-Thunder...
...,...,...,...,...,...,...,...,...,...,...,...
4912,83,95,2018-12-29,Millionaire,Chris Stapleton,,95,3,https://www.billboard.com/charts/hot-100/2018-...,"They say ""Love is more precious than gold""Can'...",https://www.genius.com/Chris-Stapleton-Million...
4913,84,96,2018-12-29,Look Back At It,A Boogie Wit da Hoodie,,95,2,https://www.billboard.com/charts/hot-100/2018-...,"Look back at itShe ain't never do this before,...",https://www.genius.com/A-Boogie-Wit-da-Hoodie-...
4914,85,98,2018-12-29,Uptown Vibes,Meek Mill,"[Fabolous, Anuel AA]",39,3,https://www.billboard.com/charts/hot-100/2018-...,Papamitrou BoiSpanish bitch from uptownI bough...,https://www.genius.com/Meek-Mill-Uptown-Vibes-...
4915,86,99,2018-12-29,Nuketown,Ski Mask The Slump God,[Juice WRLD],63,3,https://www.billboard.com/charts/hot-100/2018-...,"Round one, fightYeah, yeahThe PharmacyUh-uhLik...",https://www.genius.com/Ski-Mask-The-Slump-God-...
