**Tutorials**

https://dluo.me/s3databoto3

In [4]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import os
import boto3
from pprint import pprint
import json

In [None]:
def find_bucket(category, chart=None):
    """Return the S3 bucket for a Billboard chart category, creating it if needed.

    The bucket name is ``billboard.charts.<category>`` with spaces in
    *category* replaced by hyphens, plus a ``.<chart>`` suffix when *chart*
    is given.

    Args:
        category: Chart category name; spaces are replaced with '-'.
        chart: Optional chart name appended to the bucket name.

    Returns:
        A boto3 ``Bucket`` resource: the result of ``create_bucket`` when
        that call succeeds, otherwise a handle built from the bucket name.

    Raises:
        Any error from ``create_bucket`` that is not a botocore
        ``ClientError`` (for example missing credentials or an invalid
        parameter) propagates to the caller.
    """
    s3 = boto3.resource('s3')  # high-level object-oriented API

    name = f"billboard.charts.{category.replace(' ', '-')}"
    if chart:
        name = f'{name}.{chart}'

    try:
        return s3.create_bucket(
            Bucket=name,
            CreateBucketConfiguration={'LocationConstraint': 'us-west-2'},
        )
    except s3.meta.client.exceptions.ClientError:
        # The service rejected the create request (typically because the
        # bucket already exists), so fall back to a handle on that name.
        # Only service-side errors are caught here; a bare ``except`` would
        # also hide credential problems and KeyboardInterrupt.
        return s3.Bucket(name)

In [None]:
def find_next_date(date):
    """Return the day after *date*, or the default start date if none is given.

    Args:
        date: The last processed day as a ``'YYYYMMDD'`` string (an integer
            such as ``20200131`` is also accepted), or a falsy / non-positive
            value such as ``0`` meaning "nothing processed yet".

    Returns:
        A ``datetime`` one day after *date*, or ``datetime(1958, 8, 4)`` when
        no date is given (presumably the earliest chart to fetch — confirm).

    Raises:
        ValueError: If the year/month/day digits do not form a valid date.
    """
    # Comparing a str with 0 raises TypeError in Python 3, so tell the
    # "YYYYMMDD" form apart from the numeric sentinel explicitly.
    if isinstance(date, str):
        stamp = date
    elif date and date > 0:
        stamp = str(date)
    else:
        stamp = ''

    if not stamp:
        return datetime(1958, 8, 4)

    # Pull year, month, day from the string
    year = int(stamp[:4])
    month = int(stamp[4:6])
    day = int(stamp[6:8])

    return datetime(year=year, month=month, day=day) + timedelta(1)

In [7]:
def save_html(chart, bucket, date):
    """Download a Billboard chart page and upload it to S3 as an HTML file.

    Args:
        chart: Chart slug appended to ``https://www.billboard.com/charts/``.
        bucket: Object exposing ``upload_file(filename, Key=...)``
            (a boto3 ``Bucket`` resource in this script).
        date: A ``datetime`` selecting the chart for that day, or ``0`` (any
            value without ``strftime``) to fetch the current chart and stamp
            the file with today's date.

    The page is written to a local ``<chart>_<YYYYMMDD>.html`` file, uploaded
    under the same key, and the local file is then removed.
    """
    # A datetime cannot be compared with 0 (TypeError in Python 3), so decide
    # whether a chart date was supplied by what the value can do.
    has_date = hasattr(date, 'strftime')

    # Generate URL to scrape
    url = f"https://www.billboard.com/charts/{chart}"
    if has_date:
        url = url + f"/{date.strftime('%Y-%m-%d')}"

    # Read webpage, convert to text, convert to soup
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Local .html file named for the chart date (or today when none is given)
    stamp = (date if has_date else datetime.now()).strftime('%Y%m%d')
    filename = f"{chart}_{stamp}.html"

    try:
        # Close the file before uploading so the whole page is on disk.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(str(soup))

        # Upload .html file to S3
        bucket.upload_file(filename, Key=filename)
    finally:
        # Delete the local .html file even when the upload fails
        if os.path.exists(filename):
            os.remove(filename)

In [None]:
if __name__ == "__main__":

    # Webpage > HTML
    #
    # charts.json maps category -> chart -> last saved day, stored either as
    # a 'YYYYMMDD' string or as 0 when the chart has never been saved.
    resource = boto3.resource('s3')  # high-level object-oriented API
    bucket = resource.Bucket('billboard-charts-html')

    with open('charts.json') as log:
        charts = json.load(log)

    # Save webpages to HTML files on S3 bucket
    for category in charts:
        for chart in charts[category]:

            last_saved = charts[category][chart]

            # A str cannot be compared with 0 in Python 3, so check the type
            # before falling back to the numeric "never saved" test.
            has_history = bool(last_saved) and (
                isinstance(last_saved, str) or last_saved > 0
            )

            if has_history:

                date = find_next_date(last_saved)

                while date <= datetime.now():
                    # Keyword arguments: save_html takes (chart, bucket, date).
                    save_html(chart=chart, bucket=bucket, date=date)
                    charts[category][chart] = date.strftime('%Y%m%d')
                    date += timedelta(1)

                    # Checkpoint after every page so an interrupted run
                    # resumes from the last saved day.
                    with open('charts.json', 'w') as log:
                        json.dump(charts, log)

            else:

                save_html(chart=chart, bucket=bucket, date=last_saved)
                charts[category][chart] = datetime.now().strftime('%Y%m%d')

                with open('charts.json', 'w') as log:
                    json.dump(charts, log)
    
    
    
#     '''
#     HTML > CSV
#     '''
    
#     # Declare S3 bucket & logfile location
#     bucket = resource.Bucket('billboard-data') # substitute this for your S3 bucket name.
#     log_file = os.getcwd() + '/logs/parse_html_log.txt'
    
#     # Convert HTML files to CSV and push to new S3 bucket
#     date = find_next_date(data_log_file)
#     while date <= datetime.now():
#         parse_html(date, bucket, log_file)
#         date += timedelta(1)

In [8]:
def _inner_text(markup):
    """Return the text between the first '>' and the next '<' of ``str(markup)``."""
    return str(markup).split('>')[1].split('<')[0]


def _extract_rows(soup):
    """Build one dict per chart ``<article>`` element found in *soup*."""
    articles = soup.findAll('article')
    if len(articles) < 2:
        return []

    # The chart week is page-wide, so read it once rather than per row.
    week_of = _inner_text(soup.findAll('time'))

    rows = []
    # NOTE(review): the final <article> is skipped, matching the original
    # loop bounds (range(0, len(articles) - 1)) — confirm this is intended.
    for article in articles[:-1]:
        rank = article.findAll('span', class_='chart-row__current-week')
        song = article.findAll('h2', class_='chart-row__song')

        # The artist is rendered either as a link or as a plain span.
        artist = article.findAll('a', class_='chart-row__artist')
        if not artist:
            artist = article.findAll('span', class_='chart-row__artist')

        row = {
            'week_of': week_of,
            'current_week_rank': _inner_text(rank) if rank else None,
            'song_name': _inner_text(song) if song else None,
            'artist_name': str(artist),
        }

        values = article.findAll('span', class_='chart-row__value')
        if values:
            row['last_week_rank'] = _inner_text(values[0])
            row['peak_position'] = _inner_text(values[1])
            row['weeks_on_chart'] = _inner_text(values[2])

        rows.append(row)

    return rows


def parse_html(chart, date, bucket):
    """Convert a saved chart page on S3 into a CSV file on S3.

    Reads ``<chart>_<YYYYMMDD>.html`` from *bucket*, extracts one row per
    chart entry, uploads the result as ``<chart>_<YYYYMMDD>.csv`` and records
    the CSV name in ``<chart>_parse_html_log.txt``.

    Args:
        chart: Chart slug used in the HTML, CSV and log file names.
        date: ``datetime`` of the page to parse.
        bucket: boto3 ``Bucket`` resource. NOTE(review): the HTML is read
            from and the CSV written to this same bucket; the original mixed
            an undefined ``client`` with ``bucket.upload_file`` — confirm
            which bucket(s) are intended.

    Raises:
        ValueError: If no chart rows can be extracted from the page.
    """
    stamp = date.strftime('%Y%m%d')
    html_file = f'{chart}_{stamp}.html'

    # Read the object body through the Bucket resource; the streaming body is
    # file-like, not a URL that could be opened with urlopen.
    page = bucket.Object(html_file).get()['Body'].read()
    soup = BeautifulSoup(page, 'html.parser')

    rows = _extract_rows(soup)
    if not rows:
        raise ValueError(f'no chart rows found in {html_file}')

    df = pd.DataFrame(rows)
    mask = df['current_week_rank'].astype(float).notna()
    # Work on a copy so the column assignments below act on this frame only.
    df = df[mask].copy()

    # Assign a plain list (positional): a fresh Series would be aligned on
    # index labels, which no longer run 0..n-1 after filtering.
    df['artist_name'] = [x.split('\n')[1] for x in df['artist_name']]

    df['week_of'] = pd.to_datetime(df['week_of'])

    # Sending DataFrame to S3 as a CSV file.
    filename = f'{chart}_{stamp}.csv'
    try:
        # Write DataFrame to local CSV named for date
        df.to_csv(filename, index=False)

        # Upload CSV file to S3
        bucket.upload_file(filename, Key=filename)
    finally:
        # Delete the local CSV file even when the upload fails
        if os.path.exists(filename):
            os.remove(filename)

    # Write filename to logfile to mark successful page save
    logfile = f'{chart}_parse_html_log.txt'
    with open(logfile, 'w') as f:
        f.write(filename)

In [None]:
# df.to_csv(filename, header=False, mode='a', index=False)