## Set up

In [4]:
# Import Dependencies
import re
import urllib.parse as UP

import bs4
import numpy as np
import pandas as pd
import pymongo
import requests
import yaml
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [5]:
# Read the chromedriver location from the local YAML config file

with open("config.yml") as ymlpath:
    config = yaml.safe_load(ymlpath)

# Mapping that carries the driver path stored under "config-key" in config.yml
executable_path = dict(executable_path=config["config-key"])

## Scraping Box Office Mojo 

In [9]:
def movie_scraper():
    """Scrape Box Office Mojo's yearly worldwide charts for 2008-2018.

    For every year the chart page is downloaded, the text of each <td> cell
    in the fourth <table> is collected, cleaned and reshaped into an
    eight-column frame. Only the rank, title, studio and year columns are
    kept, and only rows whose yearly rank is 10 or lower are returned.

    Scraping stops at the first year whose page does not match the expected
    layout; years scraped before that point are still returned.

    Returns:
        list[dict]: one record per movie with the keys ``bo_year_rank``
        (int), ``title``, ``studio`` and ``bo_year`` (int). Empty when no
        year could be parsed.
    """
    ### Define the feature names ###
    columns = ['bo_year_rank', 'title', 'studio', 'worldwide-gross', 'domestic-gross',
               'domestic-pct', 'overseas-gross', 'overseas-pct']
    kept_columns = ['bo_year_rank', 'title', 'studio', 'bo_year']

    yearly_frames = []
    for year in (str(y) for y in range(2008, 2019)):
        response = requests.get('https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm' % year)
        print('Box Office data for %s scraped' % year)
        page = bs(response.text, 'html.parser')

        ### Look for the table; it is usually the fourth table object on the page ###
        tables = page.find_all('table')
        if len(tables) < 4:
            # Treat a missing table like any other layout change instead of
            # failing with an IndexError.
            print('Possible table misalignment in table for year %s' % year)
            break
        cells = tables[3].find_all('td')

        ## Each data field is found in a <td> element in the fourth table.
        ## Take the first child of the first <a>, <font> or <b> tag found, in that order.
        data = []
        for cell in cells:
            for tag_name in ('a', 'font', 'b'):
                tag = cell.find(tag_name)
                if tag is not None:
                    data.append(tag.contents[0])
                    break

        ### A <font> tag may still wrap a <b> tag: unwrap anything that is not plain text yet ###
        data = [a if isinstance(a, bs4.element.NavigableString) else a.contents[0] for a in data]

        ### Strip special characters ###
        data = [re.sub('[^A-Za-z0-9-. ]+', '', a) for a in data]

        ### Fill NaNs ('n/a' has become 'na' once the slash is stripped) ###
        data = [np.nan if a == 'na' else a for a in data]

        ### First 6 elements are column headers ###
        # NOTE(review): eight columns are named above but only six header
        # cells are skipped here - confirm against the live page layout.
        to_df = data[6:]

        ### Escape clause in case the layout changes from year to year ###
        if len(to_df) % len(columns) != 0:
            print('Possible table misalignment in table for year %s' % year)
            break

        ### Convert to pandas dataframe ###
        nrow = len(to_df) // len(columns)
        df = pd.DataFrame(np.array(to_df).reshape(nrow, len(columns)), columns=columns)
        df['bo_year'] = int(year)
        yearly_frames.append(df)

    # pd.concat raises on an empty list, which happens when the very first
    # year trips the escape clause.
    if not yearly_frames:
        return []

    # .copy() gives an independent frame so the column assignment below does
    # not write into a slice of the concatenated frame.
    dirtymovies_df = pd.concat(yearly_frames)[kept_columns].copy()
    dirtymovies_df["bo_year_rank"] = dirtymovies_df["bo_year_rank"].astype(int)
    movies_df = dirtymovies_df.loc[dirtymovies_df["bo_year_rank"] <= 10, :]

    Moviedictionary = movies_df.to_dict(orient='records')

    return (Moviedictionary)

## Scraping Billboard Music

In [10]:
# Dependencies used by the Billboard scraper
from bs4 import BeautifulSoup
import jinja2
import requests
import pymongo
from datetime import datetime


def process_chart(htmldata, year, current_time=None):
    """Parse one Billboard year-end album chart page into album records.

    Args:
        htmldata: HTML of the chart page (str or bytes), as downloaded.
        year: chart year stored on every record.
        current_time: timestamp stored on every record; defaults to the
            moment of the call.

    Returns:
        list[dict]: one record per <article> element that carries a rank,
        with the keys ``rank`` (str), ``image``, ``artist``, ``title``,
        ``year`` and ``current_time``. ``image``, ``artist`` and ``title``
        are None when the matching element is missing.
    """
    if current_time is None:
        current_time = datetime.now()

    # Parse the HTML that was passed in, not a module-level variable.
    soup = BeautifulSoup(htmldata, "html5lib")
    list_albums = []

    # Each chart entry is an <article>; the fields are located by CSS class.
    for item in soup.select('article'):
        rank_tag = item.select_one(".ye-chart-item__rank")
        if rank_tag is None:
            # Not a chart entry (some other <article> on the page).
            continue
        image_tag = item.select_one(".ye-chart-item__image")
        img_tag = image_tag.find("img") if image_tag is not None else None
        title_tag = item.select_one(".ye-chart-item__title")
        artist_tag = item.select_one(".ye-chart-item__artist")

        # get_text(strip=True) also works when the element has nested
        # children, where .string would be None.
        list_albums.append({
            'rank': rank_tag.get_text(strip=True),
            'image': img_tag.get("src") if img_tag is not None else None,
            'artist': artist_tag.get_text(strip=True) if artist_tag is not None else None,
            'title': title_tag.get_text(strip=True) if title_tag is not None else None,
            'year': year,
            'current_time': current_time,
        })

    return list_albums


def album_scraper():
    """Scrape billboard.com for the top ten Billboard 200 albums of 2008-2018.

    Downloads the year-end "top-billboard-200-albums" chart for each year and
    keeps the entries ranked 1 to 10. Nothing is written to a database here;
    the caller decides what to do with the records.

    Returns:
        list[dict]: records as produced by ``process_chart``, all sharing one
        ``current_time`` timestamp taken at the start of the scrape.
    """
    scraped_at = datetime.now()
    top_ten = []

    # Build the chart URL for each year and collect that year's top ten.
    for year in range(2008, 2019):
        response = requests.get("https://www.billboard.com/charts/year-end/"+str(year)+"/top-billboard-200-albums")
        year_albums = process_chart(response.content, year, current_time=scraped_at)
        top_ten.extend(album for album in year_albums if int(album["rank"]) < 11)
        print('Billboard album data for %s scraped' % year)

    return top_ten

## Scraping Metacritic

In [4]:
def metacritic_movie_scraper(movie_dict):

    """Return a new movie dictionary intended to carry metacritic.com review data.

    The Metacritic lookup is not implemented yet, so the result is currently
    a shallow copy of ``movie_dict`` with no review fields added. The input
    dictionary is never modified.

    Args:
        movie_dict: dictionary describing one movie.

    Returns:
        dict: copy of ``movie_dict``; user rating and number of reviewers are
        still to be added.
    """

    # Start from a copy so the caller's record is left untouched.
    meta_movie_dict = dict(movie_dict)

    ## TODO: INSERT METACRITIC SCRAPING CODE HERE
    ## GRETEL - MAKE SURE TO REMOVE THE EXECUTABLE PATH ASSIGNMENT

    return(meta_movie_dict)

In [None]:
def metacritic_album_scraper(album_dict):

    """Return a new album dictionary intended to carry metacritic.com review data.

    The Metacritic lookup is not implemented yet, so the result is currently
    a shallow copy of ``album_dict`` with no review fields added. The input
    dictionary is never modified.

    Args:
        album_dict: dictionary describing one album.

    Returns:
        dict: copy of ``album_dict``; user rating and number of reviewers are
        still to be added.
    """

    # Start from a copy so the caller's record is left untouched.
    meta_album_dict = dict(album_dict)

    ## TODO: INSERT METACRITIC SCRAPING CODE HERE
    ## GRETEL - MAKE SURE TO REMOVE THE EXECUTABLE PATH ASSIGNMENT

    return(meta_album_dict)

In [None]:
def metacritic_song_scraper(song_dict):

    """Return a new song dictionary intended to carry metacritic.com review data.

    The Metacritic lookup is not implemented yet, so the result is currently
    a shallow copy of ``song_dict`` with no review fields added. The input
    dictionary is never modified.

    Args:
        song_dict: dictionary describing one song.

    Returns:
        dict: copy of ``song_dict``; user rating and number of reviewers are
        still to be added.
    """

    # Start from a copy so the caller's record is left untouched.
    meta_song_dict = dict(song_dict)

    ## TODO: INSERT METACRITIC SCRAPING CODE HERE
    ## GRETEL - MAKE SURE TO REMOVE THE EXECUTABLE PATH ASSIGNMENT

    return(meta_song_dict)

## Create list of dictionaries for top movies and music

In [11]:
# Build movie_dict_list, album_dict_list and song_dict_list - each a list of
# dictionaries - ready to be entered into the mongo database.
# GRETEL - FIGURE OUT EXACTLY HOW THIS WORKS BASED ON WHAT IS RETURNED AT WHAT POINT
# IT WOULD BE IDEAL FOR US TO END UP WITH A LIST OF DICTIONARIES FOR EACH MOVIE_DICT_LIST AND MUSIC_DICT_LIST

## UPDATE CODE AS APPROPRIATE BASED ON SOURCE CODE FROM OTHER SCRIPTS

# Scrape BoxOfficeMojo and Billboard Music for a list of dictionaries of the top 10 entries for 2008-2018
movie_BOM_dict_list = movie_scraper()
album_Bill_dict_list = album_scraper()
# NOTE(review): song_scraper is not defined in the cells visible here -
# confirm it exists before running this cell.
song_Bill_dict_list = song_scraper()


# Add review information from Metacritic to new list of dictionaries for top movies
movie_dict_list = []
for movie_dict in movie_BOM_dict_list:
    movie_dict_list.append(metacritic_movie_scraper(movie_dict))

# Add review information from Metacritic to new list of dictionaries for top music albums
# (the album helper is named metacritic_album_scraper; there is no metacritic_music_scraper)
album_dict_list = []
for album_dict in album_Bill_dict_list:
    album_dict_list.append(metacritic_album_scraper(album_dict))

# Add review information from Metacritic to new list of dictionaries for top songs
song_dict_list = []
for song_dict in song_Bill_dict_list:
    song_dict_list.append(metacritic_song_scraper(song_dict))

## Populate mongo database

In [None]:
# Connect to mongo using pymongo to create local database
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Create Top 10 database
db = client.top_10_db

# Create movies, albums, and songs collections
movies = db.movies
albums = db.albums
songs = db.songs

# Insert top 10 movies, albums, and songs for 2008-2018
# GRETEL - FIGURE OUT WHETHER WE WANT TO UPSERT
# NOTE: plain inserts mean that re-running this cell stores every record again.
for collection, records in ((movies, movie_dict_list),
                            (albums, album_dict_list),
                            (songs, song_dict_list)):
    if records:
        collection.insert_many(records)
    else:
        # insert_many raises on an empty list, so report and skip instead.
        print('No records to insert into %s' % collection.name)

## Testing