In [100]:
import pandas as pd
import re
import urllib.request
from bs4 import BeautifulSoup
import json
import time
# Confirmation message printed once every import above has succeeded.
print('Modules loaded!')

Modules loaded!


In [101]:
# Load the previously scraped issue and song tables (issues first, then songs).
bb_hot100_issue_data, bb_hot100_song_data = map(
    pd.read_csv,
    ('data/bb_hot100_issue_data.csv', 'data/bb_hot100_song_data.csv'),
)

print('CSV data loaded!')

CSV data loaded!


In [102]:
def getBBHot100Issues(year, number_of_retries=3, retry_delay=30, request_delay=5):
    """List the Hot 100 issues linked from Billboard's archive page for ``year``.

    Parameters
    ----------
    year : int or str
        Chart year; interpolated into the archive URL.
    number_of_retries : int, default 3
        Maximum number of download attempts. With 0 no request is made.
    retry_delay : float, default 30
        Seconds to wait between two failed attempts.
    request_delay : float, default 5
        Seconds to wait before returning, to throttle successive calls.

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``issue_date``, ``issue_title``, ``issue_url``, one row
        per anchor whose ``href`` contains ``/charts/hot-100/``. ``issue_date`` is
        the last 10 characters of that ``href``. The frame is empty when the
        page could not be downloaded.
    """
    columns = ['year', 'issue_date', 'issue_title', 'issue_url']

    domain = 'https://www.billboard.com'
    url = domain + '/archive/charts/' + str(year) + '/hot-100'

    # Stays None when every attempt fails, so the parsing step is skipped
    # instead of tripping over an unbound variable.
    html = None
    for attempt in range(1, number_of_retries + 1):
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(url, data=None, context=None) as response:
                html = response.read()
            break
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            print('Attempt', attempt, 'of', number_of_retries, 'failed for', url, '-', error)
            # No point in waiting after the final attempt.
            if attempt < number_of_retries:
                time.sleep(retry_delay)

    # Collect plain rows and build the frame once rather than growing it row by row.
    records = []
    if html is not None:
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is not None and '/charts/hot-100/' in href:
                records.append([year, href[-10:], link.string, href])

    time.sleep(request_delay)
    return pd.DataFrame(records, columns=columns)

def getBBHot100Songs(issue_date, issue_url, number_of_retries=3, retry_delay=30, request_delay=5):
    """Scrape the ranked songs from one Billboard Hot 100 issue page.

    Parameters
    ----------
    issue_date : str
        Value copied into the ``issue_date`` column of every returned row.
    issue_url : str
        Site-relative path of the issue page; appended to the Billboard domain.
    number_of_retries : int, default 3
        Maximum number of download attempts. With 0 no request is made.
    retry_delay : float, default 30
        Seconds to wait between two failed attempts.
    request_delay : float, default 5
        Seconds to wait before returning, to throttle successive calls.

    Returns
    -------
    pandas.DataFrame
        Columns ``issue_date``, ``rank``, ``song``, ``artist_name``,
        ``artist_url``, one row per ``div.chart-row__title`` element. ``rank``
        is the 1-based position of the element on the page. The frame is empty
        when the page could not be downloaded.
    """
    columns = ['issue_date', 'rank', 'song', 'artist_name', 'artist_url']

    domain = 'https://www.billboard.com'
    url = domain + issue_url

    # Stays None when every attempt fails, so the parsing step is skipped
    # instead of tripping over an unbound variable.
    html = None
    for attempt in range(1, number_of_retries + 1):
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(url, data=None, context=None) as response:
                html = response.read()
            break
        except Exception as error:  # not a bare except: Ctrl-C must still interrupt
            print('Attempt', attempt, 'of', number_of_retries, 'failed for', url, '-', error)
            # No point in waiting after the final attempt.
            if attempt < number_of_retries:
                time.sleep(retry_delay)

    # Collect plain rows and build the frame once rather than growing it row by row.
    records = []
    if html is not None:
        soup = BeautifulSoup(html, "html.parser")
        chart_rows = soup.find_all("div", class_="chart-row__title")
        for rank, chart_row in enumerate(chart_rows, start=1):
            song = chart_row.h2.text
            artist_name = ''
            artist_url = ''

            # A linked artist carries a URL; otherwise fall back to the plain
            # <span>, which provides only the name.
            if chart_row.a:
                artist_name = chart_row.a.text.replace('\n', '')
                artist_url = chart_row.a.get('href')
            elif chart_row.span:
                artist_name = chart_row.span.text.replace('\n', '')

            records.append([issue_date, rank, song, artist_name, artist_url])

    time.sleep(request_delay)
    return pd.DataFrame(records, columns=columns)

# Confirmation message printed once both scraper functions above are defined.
print('Functions loaded!')

Functions loaded!


In [103]:
# Scrape every issue of `year` that is not yet recorded in bb_hot100_issue_data
# and append the new songs and issues to the in-memory tables.
year = 2018
print('Getting issues for year: ', str(year))
issues = getBBHot100Issues(year)
print('----------------')

# Keep only issues whose date is not already in the processed-issue table
# (vectorised membership test instead of a row-by-row scan).
already_processed = issues['issue_date'].isin(bb_hot100_issue_data['issue_date'])
issues = issues[~already_processed]
number_of_issues = issues.shape[0]

if number_of_issues > 0:
    print('Issues yet to be processed: ', str(number_of_issues))
    print('----------------')

    scraped_song_frames = []
    failed_issue_dates = []
    for index, row in issues.iterrows():
        print('Processing issue: ', str(row['issue_date']))
        songs = getBBHot100Songs(row['issue_date'], row['issue_url'])
        if songs.empty:
            # Nothing was scraped: leave the issue unrecorded so that a later
            # run picks it up again instead of treating it as done.
            failed_issue_dates.append(row['issue_date'])
            continue
        scraped_song_frames.append(songs)

    # Concatenate once; growing the table inside the loop copies it every time.
    if scraped_song_frames:
        bb_hot100_song_data = pd.concat([bb_hot100_song_data] + scraped_song_frames)

    processed_issues = issues[~issues['issue_date'].isin(failed_issue_dates)]
    if not processed_issues.empty:
        bb_hot100_issue_data = pd.concat([bb_hot100_issue_data, processed_issues])

    if failed_issue_dates:
        print('Issues with no songs scraped (not marked as processed): ',
              ', '.join(str(date) for date in failed_issue_dates))
    print('----------------')
else:
    print('No issues pending to be processed')
    print('----------------')

print('Processing Done!')

Getting issues for year:  2018
----------------
No issues pending to be processed
----------------
Processing Done!


In [104]:
# Rewrite the CSV files only when this run picked up at least one new issue.
if not number_of_issues > 0:
    print('Your data is up-to-date - Good!')
else:
    csv_kwargs = {'encoding': 'utf-8', 'index': False}

    # Full issue and song tables.
    bb_hot100_issue_data.to_csv('data/bb_hot100_issue_data.csv', **csv_kwargs)

    bb_hot100_song_data.to_csv('data/bb_hot100_song_data.csv', **csv_kwargs)

    # Summary: distinct (song, artist_name) pairs.
    unique_songs = bb_hot100_song_data[['song', 'artist_name']].drop_duplicates()
    unique_songs.to_csv('data/bb_hot100_song.csv', **csv_kwargs)

    # Summary: distinct artist names.
    unique_artists = bb_hot100_song_data[['artist_name']].drop_duplicates()
    unique_artists.to_csv('data/bb_hot100_artist.csv', **csv_kwargs)

    print('Both issues and songs exported to CSV along with summarized data!')

Your data is up-to-date - Good!
