In [1]:
#2020 Notebook

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from   sklearn.feature_extraction.text import TfidfVectorizer
from   sklearn.feature_selection import SelectKBest, mutual_info_classif
from   sklearn.linear_model import LogisticRegression, LinearRegression
from   sklearn.model_selection import cross_val_score
from   sklearn.preprocessing import StandardScaler

In [3]:
def song_data(date=''):
    '''
    The Billboard Hot 100 chart represents the Hot 100 songs for that week.
    
    date: a string, in the form "YYYY-MM-DD". For example, "2022-05-16" represents May 16, 2022. If no date specified, function
          will select the present chart
    returns: a pandas dataframe containing metadata for Billboard Hot 100 songs of the week of the specified date.
    columns: rank: rank of the week (1-100)
             date: a pandas datetime object. date of the chart as stated on the Billboard website, 
             which uses the Saturday to identify the week (so it is the same week as the user input, but the Saturday
             of that week),
             title: title of the song,
             artist1: main artist,
             artist2: a list of the rest of the artists. np.nan if there are none.
             peak_pos: peak position of the song,
             wks_chart: # of weeks the song has been on the chart
             b_url: url to the billboard chart
    '''
    lsongs=[]
    lartists=[]
    artist1=[]
    artist2=[]
    lpeak_pos=[]
    lwks_chart=[]
    
    URL='https://www.billboard.com/charts/hot-100/'+date

    page=requests.get(URL)
    soup=BeautifulSoup(page.content, 'lxml')
  
    ### get the first song, bc it's in a different div container
    song1 = soup.find("h3",id='title-of-a-story', class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
    lsongs.append(song1.text.strip())
    
    ### get the first artist, bc it's in a different div container
    artistf=soup.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet")
    lartists.append(artistf.text.strip())
    
    ### get the first peak position, bc it's in a different div container
    nums=soup.find_all('span', class_="c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet")
    nums1=[]
    for x in nums:
        nums1.append(x.text.strip())
        
    lpeak_pos.append(nums1[1])
    ### get the first weeks on chart, bc it's in a different div container
    lwks_chart.append(nums1[2])
    
    ### get last 99 songs
    songs = soup.find_all("h3", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only", id="title-of-a-story")
    for song in songs:
        lsongs.append(song.text.strip())
    
    ### get last 99 artists
    artists = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only")
    for artist in artists:
        lartists.append(artist.text.strip())
        
    ### get last 99 peak position
    all_num=[]
    peak_pos = soup.find_all("span", class_="c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max")
    for num in peak_pos:
        all_num.append(num.text.strip())
    
    x=1
    for peak in all_num:
        if x <= len(all_num)-5:
            lpeak_pos.append(all_num[x])
            x=x+6

    ### get last 99 weeks on chart
    y=2
    for wk in all_num:
        if y <= len(all_num)-4:
            lwks_chart.append(all_num[y])
            y=y+6            
    
    ### get date as listed on the chart, aka the Saturday of the week of the user input
    date=soup.find('h2', id="section-heading")
    cdate=pd.to_datetime(date.text.strip().replace("Week of ",''))
    
    
    ### separate artists into artist1 and artist2
    for a in lartists:
        if ("X &" not in a) and ("X Featuring" not in a) and ("X /" not in a):
            a=a.replace(" X ",",")
        a=a.replace("Featuring",",")
        a=a.replace("&",",")   
        a=a.replace(" / ",",")
        List=a.split(",")
        artists = [i.strip() for i in List]
        artist1.append(artists[0])
        if len(artists)==1:
            artist2.append(np.nan)
        else:
            artist2.append(artists[1:])
    
    metadata=pd.DataFrame()
    metadata['rank']=(range(1,101)) ### get rank position
    metadata['date']=cdate
    metadata['title']=lsongs
    metadata['artist1']=artist1
    metadata['artist2']=artist2
    metadata['peak_pos']=lpeak_pos
    metadata['wks_chart']=lwks_chart
    metadata['b_url']=URL
    
    metadata=append_lyrics(metadata)
    metadata.reset_index(inplace=True, drop=True)

    return metadata

In [4]:
def append_lyrics(metadata):
    '''
    a helper function for song_data(). gets the song lyrics for a given song. appends the
    song lyrics for a song from Genius.com to a "lyrics" column.
    
    If the function cannot find the song on the Genius lyrics website, it will drop the entire observation from the dataset.
    
    metadata: a pandas dataframe, created from song_data(). at the least contains
              the title column and the artist1 column.
    returns: a pandas dataframe of the original dataframe with a lyrics column and URL to the
             Genius website the lyrics were taken from.
    '''
    all_lyrics=[]
    all_URL=[]
    title=metadata.title.values
    artist1=metadata.artist1.values
    for x in range(len(title)):
        t=title[x]
        a=artist1[x]
  
        t=re.sub(r'[^\w\s]', '', t)
        a=re.sub(r'[!$/]', '-', a)
        a=re.sub(r'["\\#%&;\()*\[\]+,.:;<=>?@^_`{|}~]', '', a) #[\\]
        URL= "https://www.genius.com/"+a.replace(' ','-')+'-'+t.replace(' ','-')+'-lyrics'
        URL=URL.replace('--','-')
        
        page=requests.get(URL)
        soup=BeautifulSoup(page.content, 'lxml')
        if 'Oops! Page not found' not in soup.text.strip():
            lyrics=soup.find_all('div', class_='Lyrics__Container-sc-1ynbvzw-6 jYfhrf')
            Lyrics = [re.sub(r"\[.*?\]",'',i.text.strip()) for i in lyrics]
            LYRICS=" ".join(Lyrics)
            all_lyrics.append(LYRICS)
            all_URL.append(URL)
        
        else: 
            #print(URL)
            metadata.drop([x], inplace=True)
    
    metadata['lyrics']=all_lyrics
    metadata['g_url']=all_URL
    return metadata

In [5]:
dates=pd.date_range(start='2020-01-01',end='2020-12-31',freq='W-SAT')
print(len(dates))
print(dates)

52
DatetimeIndex(['2020-01-04', '2020-01-11', '2020-01-18', '2020-01-25',
               '2020-02-01', '2020-02-08', '2020-02-15', '2020-02-22',
               '2020-02-29', '2020-03-07', '2020-03-14', '2020-03-21',
               '2020-03-28', '2020-04-04', '2020-04-11', '2020-04-18',
               '2020-04-25', '2020-05-02', '2020-05-09', '2020-05-16',
               '2020-05-23', '2020-05-30', '2020-06-06', '2020-06-13',
               '2020-06-20', '2020-06-27', '2020-07-04', '2020-07-11',
               '2020-07-18', '2020-07-25', '2020-08-01', '2020-08-08',
               '2020-08-15', '2020-08-22', '2020-08-29', '2020-09-05',
               '2020-09-12', '2020-09-19', '2020-09-26', '2020-10-03',
               '2020-10-10', '2020-10-17', '2020-10-24', '2020-10-31',
               '2020-11-07', '2020-11-14', '2020-11-21', '2020-11-28',
               '2020-12-05', '2020-12-12', '2020-12-19', '2020-12-26'],
              dtype='datetime64[ns]', freq='W-SAT')


In [6]:
Dates=[date.strftime('%Y-%m-%d') for date in dates]

In [7]:
songs=[]
for date in Dates:
    songs.append(song_data(date))
    print(date)

2020-01-04
2020-01-11
2020-01-18
2020-01-25
2020-02-01
2020-02-08
2020-02-15
2020-02-22
2020-02-29
2020-03-07
2020-03-14
2020-03-21
2020-03-28
2020-04-04
2020-04-11
2020-04-18
2020-04-25
2020-05-02
2020-05-09
2020-05-16
2020-05-23
2020-05-30
2020-06-06
2020-06-13
2020-06-20
2020-06-27
2020-07-04
2020-07-11
2020-07-18
2020-07-25
2020-08-01
2020-08-08
2020-08-15
2020-08-22
2020-08-29
2020-09-05
2020-09-12
2020-09-19
2020-09-26
2020-10-03
2020-10-10
2020-10-17
2020-10-24
2020-10-31
2020-11-07
2020-11-14
2020-11-21
2020-11-28
2020-12-05
2020-12-12
2020-12-19
2020-12-26


In [9]:
df=pd.concat(songs)
df.reset_index(inplace=True,drop=True)
df.to_csv('songs2020.csv')

In [10]:
df

Unnamed: 0,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,1,2020-01-04,All I Want For Christmas Is You,Mariah Carey,,1,37,https://www.billboard.com/charts/hot-100/2020-...,I don't want a lot for ChristmasThere is just ...,https://www.genius.com/Mariah-Carey-All-I-Want...
1,2,2020-01-04,Rockin' Around The Christmas Tree,Brenda Lee,,2,32,https://www.billboard.com/charts/hot-100/2020-...,Rockin' around the Christmas treeAt the Christ...,https://www.genius.com/Brenda-Lee-Rockin-Aroun...
2,3,2020-01-04,Jingle Bell Rock,Bobby Helms,,3,30,https://www.billboard.com/charts/hot-100/2020-...,"Jingle bell, jingle bell, jingle bell rockJing...",https://www.genius.com/Bobby-Helms-Jingle-Bell...
3,4,2020-01-04,A Holly Jolly Christmas,Burl Ives,,4,15,https://www.billboard.com/charts/hot-100/2020-...,(Ding-dong-ding)(Ding-dong-ding)(Ding-dong-din...,https://www.genius.com/Burl-Ives-A-Holly-Jolly...
4,5,2020-01-04,Circles,Post Malone,,1,17,https://www.billboard.com/charts/hot-100/2020-...,"Oh, oh, ohOh, oh, ohOh, oh, oh, oh, ohWe cou...",https://www.genius.com/Post-Malone-Circles-lyrics
...,...,...,...,...,...,...,...,...,...,...
5183,96,2020-12-26,Good Time,Niko Moon,,71,11,https://www.billboard.com/charts/hot-100/2020-...,,https://www.genius.com/Niko-Moon-Good-Time-lyrics
5184,97,2020-12-26,Throat Baby (Go Baby),BRS Kash,,69,9,https://www.billboard.com/charts/hot-100/2020-...,,https://www.genius.com/BRS-Kash-Throat-Baby-Go...
5185,98,2020-12-26,Errbody,Lil Baby,,41,2,https://www.billboard.com/charts/hot-100/2020-...,,https://www.genius.com/Lil-Baby-Errbody-lyrics
5186,99,2020-12-26,Favorite Time Of Year,Carrie Underwood,,80,3,https://www.billboard.com/charts/hot-100/2020-...,,https://www.genius.com/Carrie-Underwood-Favori...
