In [1]:
#1972-1975 NOTEBOOK

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from   sklearn.feature_extraction.text import TfidfVectorizer
from   sklearn.feature_selection import SelectKBest, mutual_info_classif
from   sklearn.linear_model import LogisticRegression, LinearRegression
from   sklearn.model_selection import cross_val_score
from   sklearn.preprocessing import StandardScaler

In [2]:
def _split_artist_credit(credit):
    '''
    a helper function for metadata().

    Splits one artist credit from the chart into the main artist and the remaining artists.

    credit: a string, e.g. "Artist A Featuring Artist B & Artist C".
    returns: a tuple (artist1, artist2). artist1 is the first name in the credit. artist2 is a list of the
             remaining names, or np.nan if the credit holds a single name.
    '''
    # " X " separates collaborators ("A X B"), except when it is directly followed by another separator.
    if ("X &" not in credit) and ("X Featuring" not in credit) and ("X /" not in credit):
        credit = credit.replace(" X ", ",")
    # One case-insensitive pass that tries "featuring" before "feat", so the longer word is never cut
    # into ",uring". The look-arounds keep names that merely contain "feat" (e.g. "Defeater") intact.
    credit = re.sub(r'(?<!\w)(?:featuring|feat\.?)(?!\w)', ',', credit, flags=re.IGNORECASE)
    credit = credit.replace("&", ",").replace(" / ", ",")
    names = [name.strip() for name in credit.split(",")]
    if len(names) == 1:
        return names[0], np.nan
    return names[0], names[1:]


def metadata(date=''):
    '''
    a helper function for song_data.

    Creates metadata for the The Billboard Hot 100 chart for a specified week.

    date: a string, in the form "YYYY-MM-DD". For example, "2022-05-16" represents May 16, 2022. If no date
          specified, the URL without a date is requested (the present chart).
    returns: a pandas dataframe containing metadata for Billboard Hot 100 songs of the week of the specified date.
    columns: rank: rank of the week (1-100)
             date: a pandas datetime object, parsed from the "Week of ..." heading on the chart page
                   (per the Billboard website this is the Saturday of the requested week),
             title: title of the song,
             artist1: main artist,
             artist2: a list of the rest of the artists. np.nan if there are none.
             peak_pos: peak position of the song, as the text scraped from the page,
             wks_chart: # of weeks the song has been on the chart, as the text scraped from the page,
             b_url: url to the billboard chart
    raises: ValueError if the page does not contain a chart that can be parsed into 100 complete rows.
    '''
    CHART_SIZE = 100

    URL = 'https://www.billboard.com/charts/hot-100/' + date

    # a timeout keeps one stalled connection from hanging a multi-week scrape forever
    page = requests.get(URL, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')

    ### the first song, artist and stats sit in differently styled containers than the other 99
    song1 = soup.find("h3", id='title-of-a-story', class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet")
    artistf = soup.find("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only u-font-size-20@tablet")
    nums = soup.find_all('span', class_="c-label a-font-primary-bold-l a-font-primary-m@mobile-max u-font-weight-normal@mobile-max lrv-u-padding-tb-050@mobile-max u-font-size-32@tablet")
    nums1 = [x.text.strip() for x in nums]

    ### heading that carries the chart date, e.g. "Week of January 1, 1972"
    heading = soup.find('h2', id="section-heading")

    if song1 is None or artistf is None or heading is None or len(nums1) < 3:
        raise ValueError('Could not parse a Billboard Hot 100 chart from ' + URL)

    lsongs = [song1.text.strip()]
    lartists = [artistf.text.strip()]
    # for the first song, positions 1 and 2 of its stat labels are peak position and weeks on chart
    lpeak_pos = [nums1[1]]
    lwks_chart = [nums1[2]]

    ### get last 99 songs
    songs = soup.find_all("h3", class_="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only", id="title-of-a-story")
    lsongs.extend(song.text.strip() for song in songs)

    ### get last 99 artists
    artists = soup.find_all("span", class_="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only")
    lartists.extend(artist.text.strip() for artist in artists)

    ### get last 99 peak positions and weeks on chart
    # the stat labels are read in groups of six per song: offset 1 is taken as the peak position and
    # offset 2 as the weeks on chart
    peak_pos = soup.find_all("span", class_="c-label a-font-primary-m lrv-u-padding-tb-050@mobile-max")
    all_num = [num.text.strip() for num in peak_pos]
    lpeak_pos.extend(all_num[i] for i in range(1, len(all_num) - 4, 6))
    lwks_chart.extend(all_num[i] for i in range(2, len(all_num) - 3, 6))

    ### a partial scrape would misalign the columns, so refuse it with a readable message
    scraped = (('title', lsongs), ('artist', lartists), ('peak_pos', lpeak_pos), ('wks_chart', lwks_chart))
    for label, values in scraped:
        if len(values) != CHART_SIZE:
            raise ValueError('Expected %d %s entries from %s but scraped %d'
                             % (CHART_SIZE, label, URL, len(values)))

    ### get date as listed on the chart
    cdate = pd.to_datetime(heading.text.strip().replace("Week of ", ''))

    ### separate artists into artist1 and artist2
    artist1 = []
    artist2 = []
    for credit in lartists:
        main_artist, other_artists = _split_artist_credit(credit)
        artist1.append(main_artist)
        artist2.append(other_artists)

    chart = pd.DataFrame()
    chart['rank'] = range(1, CHART_SIZE + 1)  ### get rank position
    chart['date'] = cdate
    chart['title'] = lsongs
    chart['artist1'] = artist1
    chart['artist2'] = artist2
    chart['peak_pos'] = lpeak_pos
    chart['wks_chart'] = lwks_chart
    chart['b_url'] = URL

    return chart

In [3]:
def append_lyrics(metadata):
    '''
    a helper function for song_data().
    Builds a Genius.com lyrics URL for every song from its title and main artist, downloads each
    distinct URL once, and returns the songs together with their lyrics.

    If the function cannot find the song on the Genius lyrics website, it will drop the entire observation
    from the returned dataset. The dataframe passed in is left untouched.

    metadata: a pandas dataframe, created from metadata(). at the least contains
              the title column and the artist1 column.
    returns: a new pandas dataframe (fresh 0..n-1 index) holding the rows whose lyrics page was found, with
             a g_url column (URL to the Genius page the lyrics were taken from) and a lyrics column.
    '''
    data = metadata.copy()  # work on a copy so the caller's frame is not modified

    all_URL = []
    for t, a in zip(data['title'].values, data['artist1'].values):
        t = re.sub(r'[^\w\s]', '', t)
        a = re.sub(r'[!$/]', '-', a)
        a = re.sub(r'["\\#%&;\()*\[\]+,.:;<=>?@^_`{|}~]', '', a)
        # apostrophes are already stripped from the title above; strip them from the artist as well
        # so both halves of the URL follow the same rule ("The O'Jays" -> "The-OJays")
        a = re.sub(r"['\u2019]", '', a)
        URL = "https://www.genius.com/" + a.replace(' ', '-') + '-' + t.replace(' ', '-') + '-lyrics'
        URL = URL.replace(",", '')
        # collapse hyphen runs of any length (a single replace('--','-') leaves '---' as '--')
        URL = re.sub(r'-{2,}', '-', URL)
        all_URL.append(URL)

    data['g_url'] = all_URL

    # songs repeat across chart weeks, so each distinct URL is requested only once
    lyrics_by_url = {}
    for u in np.unique(all_URL):
        u = str(u)
        # a timeout keeps one stalled connection from hanging the whole scrape forever
        page = requests.get(u, timeout=30)
        soup = BeautifulSoup(page.content, 'lxml')
        if 'Oops! Page not found' in soup.text.strip():
            continue
        containers = soup.find_all('div', class_='Lyrics__Container-sc-1ynbvzw-6 jYfhrf')
        # remove bracketed annotations such as "[Chorus]" before joining the containers
        sections = [re.sub(r"\[.*?\]", '', c.text.strip()) for c in containers]
        lyrics_by_url[u] = " ".join(sections)
        print(u)

    # keep only the observations whose lyrics page exists, then attach the lyrics
    found = data['g_url'].isin(list(lyrics_by_url))
    data = data.loc[found].reset_index(drop=True)
    data['lyrics'] = data['g_url'].map(lyrics_by_url)

    return data

In [11]:
def song_data(start,end):
    '''
    Builds one dataframe of Billboard Hot 100 metadata plus Genius.com lyrics covering a range of dates.

    Every Saturday between start and end (inclusive) is a candidate chart week. Candidates are filtered
    through valid_dates(), each remaining week is scraped with metadata(), the weekly frames are stacked
    under a fresh 0..n-1 index, and the stacked frame is passed to append_lyrics().

    start: start date, a string "YYYY-MM-DD"
    end: end date, a string "YYYY-MM-DD"
    returns: a pandas dataframe, as returned by append_lyrics().
    see metadata() and append_lyrics() for further documentation.
    '''
    saturdays = pd.date_range(start, end, freq='W-SAT')
    candidate_weeks = saturdays.strftime('%Y-%m-%d').tolist()
    chart_weeks = valid_dates(candidate_weeks)

    weekly_charts = [metadata(week) for week in chart_weeks]
    all_charts = pd.concat(weekly_charts, ignore_index=True)

    return append_lyrics(all_charts)

In [13]:
# Save the scraped frame to 'songs1972_1975.csv'.
# NOTE(review): this cell is listed before the cell that assigns songs1996_1999 (execution count 13 vs 12),
# so it fails with a NameError when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): the variable name says 1996-1999 while the file name says 1972-1975 - confirm which is intended.
songs1996_1999.to_csv('songs1972_1975.csv')

In [12]:
%%time
# Scrape the chart weeks from 1972-01-01 through 1975-12-31 and attach lyrics (slow: performs network requests).
# NOTE(review): the result is stored as songs1996_1999 although the requested range is 1972-1975.
songs1996_1999=song_data('1972-01-01','1975-12-31')

https://www.genius.com/10cc-Art-For-Arts-Sake-lyrics
https://www.genius.com/10cc-Im-Not-In-Love-lyrics
https://www.genius.com/10cc-Rubber-Bullets-lyrics
https://www.genius.com/5000-Volts-Im-On-Fire-lyrics
https://www.genius.com/ABBA-Honey-Honey-lyrics
https://www.genius.com/ABBA-SOS-lyrics
https://www.genius.com/ABBA-Waterloo-lyrics
https://www.genius.com/Ace-How-Long-lyrics
https://www.genius.com/Ace-Spectrum-Dont-Send-Nobody-Else-lyrics
https://www.genius.com/Addrisi-Brothers-Weve-Got-To-Get-It-On-Again-lyrics
https://www.genius.com/Aerosmith-Dream-On-lyrics
https://www.genius.com/Aerosmith-Sweet-Emotion-lyrics
https://www.genius.com/Al-Downing-Ill-Be-Holding-On-lyrics
https://www.genius.com/Al-Green-Call-Me-Come-Back-Home-lyrics
https://www.genius.com/Al-Green-Full-Of-Fire-lyrics
https://www.genius.com/Al-Green-Guilty-lyrics
https://www.genius.com/Al-Green-Hot-Wire-lyrics
https://www.genius.com/Al-Green-Im-Still-In-Love-With-You-lyrics
https://www.genius.com/Al-Green-Lets-Get-Marrie

In [8]:
# years 2000 through 2021 inclusive
num = [year for year in range(2000, 2022)]

In [55]:
# Read one CSV per year listed in `num` (file name pattern 'songs<year>.csv'), report the year and
# its row count, and collect the frames in `data`.
data = []
for year in num:
    file = 'songs{}.csv'.format(year)
    df = pd.read_csv(file)
    print("year", year)
    print(df.shape[0])
    data.append(df)

year 2000
4671
year 2001
5200
year 2002
5200
year 2003
5200
year 2004
5200
year 2005
4955
year 2006
4949
year 2007
4700
year 2008
4787
year 2009
4820
year 2010
4902
year 2011
4949
year 2012
5148
year 2013
5121
year 2014
5175
year 2015
5168
year 2016
4976
year 2017
5181
year 2018
4917
year 2019
5179
year 2020
5188
year 2021
4631


In [2]:
# Stack the yearly frames collected in `data` into one frame with a fresh 0..n-1 index.
songs = pd.concat(data, ignore_index=True)

NameError: name 'pd' is not defined

In [10]:
def valid_dates(datelist):
    '''
    a preprocessing function to check if a chart exists for a given date.
    e.g. https://www.billboard.com/charts/hot-100/2011-08-06/ does not have a chart, but https://www.billboard.com/charts/hot-100/2011-08-13/ does.
    removes broken links from a list of dates.
    i.e. if a Billboard chart does not exist for a date specified in a list, remove the date from the list.

    A date counts as valid when its chart page contains the heading element of the first-ranked song.
    One page is requested per date.

    datelist: a list of strings, each dates in YYYY-MM-DD format.
    returns: a new list with the valid dates from the datelist, in their original order.
    '''
    # css classes of the <h3> holding the title of the first-ranked song on a chart page
    first_title_class = "c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 u-font-size-23@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-245 u-max-width-230@tablet-only u-letter-spacing-0028@tablet"

    valid = []
    for date in datelist:
        URL = 'https://www.billboard.com/charts/hot-100/' + date
        # a timeout keeps one stalled connection from hanging the whole validation loop forever
        page = requests.get(URL, timeout=30)
        soup = BeautifulSoup(page.content, 'lxml')
        song1 = soup.find("h3", id='title-of-a-story', class_=first_title_class)
        if song1 is not None:
            valid.append(date)
    return valid

In [8]:
# Filter the candidate dates down to those with a parsable chart page, then show the number of
# dates before (printed) and after (cell output) filtering.
# NOTE(review): `Dates` is not assigned by any cell visible here - presumably left in the kernel
# from an earlier run; confirm where it is defined.
DATES=valid_dates(Dates)
print(len(Dates))
len(DATES)

52


52

In [72]:
# Scrape one chart week per iteration and collect the weekly frames in `songs`.
# song_data(start, end) requires both bounds; passing the same date twice restricts the range to
# that single week (calling it with one argument raises a TypeError).
# NOTE(review): assumes every entry of `Dates` is a Saturday in "YYYY-MM-DD" form, since song_data
# only considers Saturdays - TODO confirm. `Dates` is not assigned by any cell visible here.
songs=[]
for date in Dates:
    songs.append(song_data(date, date))
    print(date)

KeyboardInterrupt: 

In [10]:
# Stack the weekly frames collected in `songs` under a fresh 0..n-1 index and write them to disk.
df = pd.concat(songs, ignore_index=True)
df.to_csv('songs1990.csv')

In [11]:
# Display the combined frame.
# NOTE(review): the saved output of this cell lists 2021 chart dates, while the preceding cell writes
# this frame to 'songs1990.csv' - confirm the output is not stale and the file name matches the data.
df

Unnamed: 0,rank,date,title,artist1,artist2,peak_pos,wks_chart,b_url,lyrics,g_url
0,1,2021-01-02,All I Want For Christmas Is You,Mariah Carey,,1,43,https://www.billboard.com/charts/hot-100/2021-...,I don't want a lot for ChristmasThere is just ...,https://www.genius.com/Mariah-Carey-All-I-Want...
1,2,2021-01-02,Rockin' Around The Christmas Tree,Brenda Lee,,2,38,https://www.billboard.com/charts/hot-100/2021-...,Rockin' around the Christmas treeAt the Christ...,https://www.genius.com/Brenda-Lee-Rockin-Aroun...
2,3,2021-01-02,Jingle Bell Rock,Bobby Helms,,3,35,https://www.billboard.com/charts/hot-100/2021-...,"Jingle bell, jingle bell, jingle bell rockJing...",https://www.genius.com/Bobby-Helms-Jingle-Bell...
3,4,2021-01-02,A Holly Jolly Christmas,Burl Ives,,4,19,https://www.billboard.com/charts/hot-100/2021-...,(Ding-dong-ding)(Ding-dong-ding)(Ding-dong-din...,https://www.genius.com/Burl-Ives-A-Holly-Jolly...
4,5,2021-01-02,It's The Most Wonderful Time Of The Year,Andy Williams,,5,20,https://www.billboard.com/charts/hot-100/2021-...,It's the most wonderful time of the yearWith t...,https://www.genius.com/Andy-Williams-Its-The-M...
...,...,...,...,...,...,...,...,...,...,...
4626,95,2021-12-25,Freedom Was A Highway,Jimmie Allen,[Brad Paisley],76,10,https://www.billboard.com/charts/hot-100/2021-...,"(Oh-oh, oh-oh, woo)(Oh-oh, oh-oh)Sunset throug...",https://www.genius.com/Jimmie-Allen-Freedom-Wa...
4627,96,2021-12-25,No Love,Summer Walker,[SZA],13,6,https://www.billboard.com/charts/hot-100/2021-...,"Oh, ooh woahOh-oh, yeahYeah, yeah, yeahIf I ha...",https://www.genius.com/Summer-Walker-No-Love-l...
4628,97,2021-12-25,Bad Man (Smooth Criminal),Polo G,,49,5,https://www.billboard.com/charts/hot-100/2021-...,"Lil Capalot, bitch, haSmooth criminal, Mike Ja...",https://www.genius.com/Polo-G-Bad-Man-Smooth-C...
4629,98,2021-12-25,Feel Alone,Juice WRLD,,98,1,https://www.billboard.com/charts/hot-100/2021-...,"Smokin' this dope, relaxin'I ain't gon' lie, b...",https://www.genius.com/Juice-WRLD-Feel-Alone-l...
