In [1]:
#2005 Notebook

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from   sklearn.feature_extraction.text import TfidfVectorizer
from   sklearn.feature_selection import SelectKBest, mutual_info_classif
from   sklearn.linear_model import LogisticRegression, LinearRegression
from   sklearn.model_selection import cross_val_score
from   sklearn.preprocessing import StandardScaler

In [3]:
def song_data(date=''):
    '''
    The Billboard Hot 100 chart represents the Hot 100 songs for that week.
    
    date: a string, in the form "YYYY-MM-DD". For example, "2022-05-16" represents May 16, 2022. If no date specified, function
          will select the present chart
    returns: a pandas dataframe containing metadata for Billboard Hot 100 songs of the week of the specified date.
    columns: rank: rank of the week (1-100)
             date: a pandas datetime object. date of the chart as stated on the Billboard website, 
             which uses the Saturday to identify the week (so it is the same week as the user input, but the Saturday
             of that week),
             title: title of the song,
             artist1: main artist,
             artist2: a list of the rest of the artists. np.nan if there are none.
             peak_pos: peak position of the song,
             wks_chart: # of weeks the song has been on the chart
             b_url: url to the billboard chart
    '''
    lsongs=[]
    lartists=[]
    artist1=[]
    artist2=[]
    lpeak_pos=[]
    lwks_chart=[]
    
    URL='https://www.billboard.com/charts/hot-100/'+date

    page=requests.get(URL)
    soup=BeautifulSoup(page.content, 'lxml')
  
    ### get the first song, bc it's in a different div container
    song1 = soup.find("h3",id='title-of-a-story', class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
    lsongs.append(song1.text.strip())
    
    ### get the first artist, bc it's in a different div container
    artistf=soup.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet")
    lartists.append(artistf.text.strip())
    
    ### get the first peak position, bc it's in a different div container
    nums=soup.find_all('span', class_="c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet")
    nums1=[]
    for x in nums:
        nums1.append(x.text.strip())
        
    lpeak_pos.append(nums1[1])
    ### get the first weeks on chart, bc it's in a different div container
    lwks_chart.append(nums1[2])
    
    ### get last 99 songs
    songs = soup.find_all("h3", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only", id="title-of-a-story")
    for song in songs:
        lsongs.append(song.text.strip())
    
    ### get last 99 artists
    artists = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only")
    for artist in artists:
        lartists.append(artist.text.strip())
        
    ### get last 99 peak position
    all_num=[]
    peak_pos = soup.find_all("span", class_="c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max")
    for num in peak_pos:
        all_num.append(num.text.strip())
    
    x=1
    for peak in all_num:
        if x <= len(all_num)-5:
            lpeak_pos.append(all_num[x])
            x=x+6

    ### get last 99 weeks on chart
    y=2
    for wk in all_num:
        if y <= len(all_num)-4:
            lwks_chart.append(all_num[y])
            y=y+6            
    
    ### get date as listed on the chart, aka the Saturday of the week of the user input
    date=soup.find('h2', id="section-heading")
    cdate=pd.to_datetime(date.text.strip().replace("Week of ",''))
    
    
    ### separate artists into artist1 and artist2
    for a in lartists:
        if ("X &" not in a) and ("X Featuring" not in a) and ("X /" not in a):
            a=a.replace(" X ",",")
        a=a.replace("Featuring",",")
        a=a.replace("&",",")   
        a=a.replace(" / ",",")
        List=a.split(",")
        artists = [i.strip() for i in List]
        artist1.append(artists[0])
        if len(artists)==1:
            artist2.append(np.nan)
        else:
            artist2.append(artists[1:])
    
    metadata=pd.DataFrame()
    metadata['rank']=(range(1,101)) ### get rank position
    metadata['date']=cdate
    metadata['title']=lsongs
    metadata['artist1']=artist1
    metadata['artist2']=artist2
    metadata['peak_pos']=lpeak_pos
    metadata['wks_chart']=lwks_chart
    metadata['b_url']=URL
    
    metadata=append_lyrics(metadata)
    metadata.reset_index(inplace=True, drop=True)

    return metadata

In [4]:
def append_lyrics(metadata):
    '''
    a helper function for song_data(). gets the song lyrics for a given song. appends the
    song lyrics for a song from Genius.com to a "lyrics" column.
    
    If the function cannot find the song on the Genius lyrics website, it will drop the entire observation from the dataset.
    
    metadata: a pandas dataframe, created from song_data(). at the least contains
              the title column and the artist1 column.
    returns: a pandas dataframe of the original dataframe with a lyrics column and URL to the
             Genius website the lyrics were taken from.
    '''
    all_lyrics=[]
    all_URL=[]
    title=metadata.title.values
    artist1=metadata.artist1.values
    for x in range(len(title)):
        t=title[x]
        a=artist1[x]
  
        t=re.sub(r'[^\w\s]', '', t)
        a=re.sub(r'[!$/]', '-', a)
        a=re.sub(r'["\\#%&;\()*\[\]+,.:;<=>?@^_`{|}~]', '', a) #[\\]
        URL= "https://www.genius.com/"+a.replace(' ','-')+'-'+t.replace(' ','-')+'-lyrics'
        URL=URL.replace('--','-')
        
        page=requests.get(URL)
        soup=BeautifulSoup(page.content, 'lxml')
        if 'Oops! Page not found' not in soup.text.strip():
            lyrics=soup.find_all('div', class_='Lyrics__Container-sc-1ynbvzw-6 jYfhrf')
            Lyrics = [re.sub(r"\[.*?\]",'',i.text.strip()) for i in lyrics]
            LYRICS=" ".join(Lyrics)
            all_lyrics.append(LYRICS)
            all_URL.append(URL)
        
        else: 
            #print(URL)
            metadata.drop([x], inplace=True)
    
    metadata['lyrics']=all_lyrics
    metadata['g_url']=all_URL
    return metadata

In [11]:
dates=pd.date_range(start='2005-08-20',end='2005-12-31',freq='W-SAT')
print(len(dates))
print(dates)

20
DatetimeIndex(['2005-08-20', '2005-08-27', '2005-09-03', '2005-09-10',
               '2005-09-17', '2005-09-24', '2005-10-01', '2005-10-08',
               '2005-10-15', '2005-10-22', '2005-10-29', '2005-11-05',
               '2005-11-12', '2005-11-19', '2005-11-26', '2005-12-03',
               '2005-12-10', '2005-12-17', '2005-12-24', '2005-12-31'],
              dtype='datetime64[ns]', freq='W-SAT')


In [12]:
Dates=[date.strftime('%Y-%m-%d') for date in dates]

In [7]:
songs=[]
for date in Dates:
    songs.append(song_data(date))
    print(date)

2005-01-01
2005-01-08
2005-01-15
2005-01-22
2005-01-29
2005-02-05
2005-02-12
2005-02-19
2005-02-26
2005-03-05
2005-03-12
2005-03-19
2005-03-26
2005-04-02
2005-04-09
2005-04-16
2005-04-23
2005-04-30
2005-05-07
2005-05-14
2005-05-21
2005-05-28
2005-06-04
2005-06-11
2005-06-18
2005-06-25
2005-07-02
2005-07-09
2005-07-16
2005-07-23
2005-07-30
2005-08-06
2005-08-13


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [15]:
df=pd.concat(songs)
df.reset_index(inplace=True,drop=True)
df.to_csv('songs2005.csv')

In [10]:
songs[-1]

Unnamed: 0,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,1,2005-08-13,We Belong Together,Mariah Carey,,1,18,https://www.billboard.com/charts/hot-100/2005-...,"Sweet love, yeahI didn't mean it when I said I...",https://www.genius.com/Mariah-Carey-We-Belong-...
1,2,2005-08-13,Pon de Replay,Rihanna,,2,10,https://www.billboard.com/charts/hot-100/2005-...,"Come, Mr. DJ, song pon de replayCome, Mr. DJ, ...",https://www.genius.com/Rihanna-Pon-de-Replay-l...
2,3,2005-08-13,Don't Cha,The Pussycat Dolls,[Busta Rhymes],3,15,https://www.billboard.com/charts/hot-100/2005-...,"Okay (Ahh)Yeah (Ahh)Oh, we about to get it jus...",https://www.genius.com/The-Pussycat-Dolls-Dont...
3,4,2005-08-13,Let Me Hold You,Bow Wow,[Omarion],4,12,https://www.billboard.com/charts/hot-100/2005-...,Y'all know what this isThis what you need to d...,https://www.genius.com/Bow-Wow-Let-Me-Hold-You...
4,5,2005-08-13,Lose Control,Missy Elliott,"[Ciara, Fat Man Scoop]",4,13,https://www.billboard.com/charts/hot-100/2005-...,"Music make you lose control, music make you lo...",https://www.genius.com/Missy-Elliott-Lose-Cont...
...,...,...,...,...,...,...,...,...,...,...
83,93,2005-08-13,Go!,Common,,79,6,https://www.billboard.com/charts/hot-100/2005-...,"(Go) go, go, go, go, go and on the count of th...",https://www.genius.com/Common-Go-lyrics
84,94,2005-08-13,Wake Me Up When September Ends,Green Day,,94,2,https://www.billboard.com/charts/hot-100/2005-...,Summer has come and passedThe innocent can nev...,https://www.genius.com/Green-Day-Wake-Me-Up-Wh...
85,96,2005-08-13,Home,Michael Buble,,96,6,https://www.billboard.com/charts/hot-100/2005-...,Another summer day has come and gone awayIn Pa...,https://www.genius.com/Michael-Buble-Home-lyrics
86,99,2005-08-13,Stay With Me (Brass Bed),Josh Gracin,,99,1,https://www.billboard.com/charts/hot-100/2005-...,"Baby, the clock on the wall is lyingIt's not r...",https://www.genius.com/Josh-Gracin-Stay-With-M...


In [13]:
for date in Dates:
    songs.append(song_data(date))
    print(date)

2005-08-20
2005-08-27
2005-09-03
2005-09-10
2005-09-17
2005-09-24
2005-10-01
2005-10-08
2005-10-15
2005-10-22
2005-10-29
2005-11-05
2005-11-12
2005-11-19
2005-11-26
2005-12-03
2005-12-10
2005-12-17
2005-12-24
2005-12-31


In [14]:
len(songs)

53

In [16]:
df

Unnamed: 0,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,1,2005-01-01,Let Me Love You,Mario,,1,11,https://www.billboard.com/charts/hot-100/2005-...,,https://www.genius.com/Mario-Let-Me-Love-You-l...
1,2,2005-01-01,Drop It Like It's Hot,Snoop Dogg,[Pharrell],1,14,https://www.billboard.com/charts/hot-100/2005-...,,https://www.genius.com/Snoop-Dogg-Drop-It-Like...
2,3,2005-01-01,"1, 2 Step",Ciara,[Missy Elliott],3,10,https://www.billboard.com/charts/hot-100/2005-...,,https://www.genius.com/Ciara-1-2-Step-lyrics
3,4,2005-01-01,Soldier,Destiny's Child,"[T.I., Lil Wayne]",4,6,https://www.billboard.com/charts/hot-100/2005-...,,https://www.genius.com/Destiny's-Child-Soldier...
4,5,2005-01-01,Lovers And Friends,Lil Jon,"[The East Side Boyz, Usher, Ludacris]",5,6,https://www.billboard.com/charts/hot-100/2005-...,,https://www.genius.com/Lil-Jon-Lovers-And-Frie...
...,...,...,...,...,...,...,...,...,...,...
4950,94,2005-12-31,Remedy,Seether,,94,3,https://www.billboard.com/charts/hot-100/2005-...,Throw your dollar bills and leave your thrills...,https://www.genius.com/Seether-Remedy-lyrics
4951,96,2005-12-31,Hit The Floor,Twista,[Pitbull],95,3,https://www.billboard.com/charts/hot-100/2005-...,"Yea, we gon' take it from the bottom to the Wi...",https://www.genius.com/Twista-Hit-The-Floor-ly...
4952,97,2005-12-31,Confessions Of A Broken Heart (Daughter To Fat...,Lindsay Lohan,,57,6,https://www.billboard.com/charts/hot-100/2005-...,I wait for the postmanTo bring me a letterAnd ...,https://www.genius.com/Lindsay-Lohan-Confessio...
4953,98,2005-12-31,She Don't Tell Me To,Montgomery Gentry,,98,1,https://www.billboard.com/charts/hot-100/2005-...,"Every now an' then, on my homeI stop at a spot...",https://www.genius.com/Montgomery-Gentry-She-D...
