## Set up

In [13]:
# Import Dependencies
import pandas as pd
import requests
import urllib.parse as UP
import yaml
import pymongo
import bs4
import re
import numpy as np
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [14]:
# Read the chromedriver location from the local YAML config file

with open("config.yml", 'r') as ymlpath:
    config = yaml.safe_load(ymlpath)

# Keyword-argument dict holding the driver path stored under "config-key"
# (presumably unpacked into splinter's Browser(...) later -- confirm at the call site)
executable_path = dict(executable_path=config["config-key"])

## Scraping Box Office Mojo 

In [73]:
def movie_scraper():
    """Scrape https://www.boxofficemojo.com for the top ten movies of 2008-2018.

    For each year the worldwide yearly chart page is downloaded, the cells of
    the fourth <table> on the page are flattened into a list, cleaned, and
    reshaped into an 8-column table from which only rank, title and studio
    are kept.

    Returns:
        A list of dictionaries, one per movie, with the keys 'rank' (int),
        'title', 'studio' and 'year' (int).

    Raises:
        requests.HTTPError: if a chart page responds with an error status.
        ValueError: if the scraped cells do not divide evenly into rows.
    """

    # Column layout of the chart table
    columns = ['rank', 'title', 'studio', 'worldwide-gross', 'domestic-gross',
               'domestic-pct', 'overseas-gross', 'overseas-pct']

    # Anything that is not alphanumeric, '-', '.' or a space gets stripped
    special_chars = re.compile('[^A-Za-z0-9-. ]+')

    movie_df_list = []

    for year in range(2008, 2019):

        # Get webpage data using requests and parse html by creating a beautiful soup object
        response = requests.get(
            'https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm' % year,
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page
        response.raise_for_status()
        soup = bs(response.text, 'html.parser')

        # Find location of necessary data in soup object
        soup_tables = soup.find_all('table')
        soup_elements = soup_tables[3].find_all('td')

        # For each td element, keep the first child of the first matching tag
        # (preference order: <a>, then <font>, then <b>)
        movie_data = []
        for cell in soup_elements:
            for tag_name in ('a', 'font', 'b'):
                tag = cell.find(tag_name)
                if tag is not None:
                    movie_data.append(tag.contents[0])
                    break

        ### Clean Data:

        # Remove extraneous tags: unwrap anything that is still a tag rather than text
        movie_data = [
            a if isinstance(a, bs4.element.NavigableString) else a.contents[0]
            for a in movie_data
        ]

        # Strip special characters
        movie_data = [special_chars.sub('', a) for a in movie_data]

        # Fill NaNs ('n/a' has become 'na' once the slash is stripped)
        movie_data = [np.nan if a == 'na' else a for a in movie_data]

        # Skip the first 6 entries, which are treated as header cells rather than data
        to_df = movie_data[6:]

        # Convert to dataframe
        nrow = len(to_df) // len(columns)
        dirty_movies_df = pd.DataFrame(
            np.array(to_df).reshape(nrow, len(columns)), columns=columns
        )

        # Remove unnecessary columns; copy so later assignments do not hit a view
        dirty_movies_df = dirty_movies_df.iloc[:, 0:3].copy()
        dirty_movies_df["rank"] = dirty_movies_df["rank"].apply(int)
        dirty_movies_df = dirty_movies_df.loc[dirty_movies_df["rank"] <= 10, :].copy()

        # Add year column to dataframe
        dirty_movies_df['year'] = year

        # Add dataframe for specified year to list of dataframes for all years
        movie_df_list.append(dirty_movies_df)

    # Convert list of dataframes to single dataframe, then to a list of records
    movie_df = pd.concat(movie_df_list)
    movie_dicts = movie_df.to_dict(orient='records')

    return movie_dicts

## Scraping Billboard Music

In [53]:
def process_chart(data, year):
    """Parse the HTML of a year-end chart page into a list of album records.

    Args:
        data: HTML of the chart page (string or bytes) as downloaded by the caller.
        year: The year the chart belongs to; stored unchanged in every record.

    Returns:
        A list of dictionaries with the keys 'rank' (int), 'title', 'artist'
        and 'year', one per <article> element that carries chart data.
    """

    # Create soup object to parse the html
    soup = bs(data, "html5lib")

    # Create a list to return
    list_albums = []

    # For each article item, locate the rank / title / artist elements and
    # add a dictionary for the entry to the album list
    for item in soup.select('article'):
        rank_tag = item.select_one(".ye-chart-item__rank")
        title_tag = item.select_one(".ye-chart-item__title")
        artist_tag = item.select_one(".ye-chart-item__artist")

        # Skip <article> elements that are not chart entries
        if rank_tag is None or title_tag is None or artist_tag is None:
            continue

        rank = int(rank_tag.string.strip())
        title = title_tag.string.strip()
        artist = artist_tag.text.replace("\n", "")

        # Key is 'year' (no leading space) so it matches the documented record layout
        list_albums.append({'rank': rank, 'title': title, 'artist': artist, 'year': year})

    return list_albums

In [54]:
def album_scraper():
    """Scrape https://www.billboard.com for the top ten albums of 2008-2018.

    Downloads the year-end "Top Billboard 200 Albums" chart for each year,
    parses it with process_chart, and keeps the entries ranked 1 through 10.

    Returns:
        A list of dictionaries with year, rank, album title, and artist name.

    Raises:
        requests.HTTPError: if a chart page responds with an error status.
    """

    # Create a list of years we are querying data for
    years = [str(a) for a in range(2008, 2019)]

    all_albums = []

    # For each year, use requests library to get HTML and parse contents using process_chart function
    # Add newly created list of dictionaries for specified year to comprehensive list for all years
    for year in years:
        response = requests.get(
            "https://www.billboard.com/charts/year-end/" + str(year) + "/top-billboard-200-albums",
            timeout=30,
        )
        # Fail loudly on an HTTP error instead of parsing an error page into zero albums
        response.raise_for_status()
        all_albums.extend(process_chart(response.content, year))

    # Filter just the top 10 albums for each year into the final list of dictionaries
    album_dicts = [album for album in all_albums if album["rank"] < 11]

    return album_dicts

## Scraping Metacritic

In [4]:
def metacritic_movie_scraper(movie_dict):
    """Add review information from metacritic.com to the provided movie dictionary.

    NOTE: the Metacritic lookup is not implemented yet. Until it is, the
    record is returned as a shallow copy with no review fields added, so the
    rest of the pipeline can run.

    Args:
        movie_dict: Dictionary describing one movie (year, rank, movie title, ...).

    Returns:
        A new dictionary with the same entries as movie_dict. Once implemented
        it should also carry the user rating and number of reviewers.
    """

    # Copy so the caller's record is never mutated in place
    meta_movie_dict = dict(movie_dict)

    # TODO(gretel): insert the Metacritic scraping code here
    # TODO(gretel): make sure to remove the executable path assignment

    return meta_movie_dict

In [None]:
def metacritic_album_scraper(album_dict):
    """Add review information from metacritic.com to the provided album dictionary.

    NOTE: the Metacritic lookup is not implemented yet. Until it is, the
    record is returned as a shallow copy with no review fields added, so the
    rest of the pipeline can run.

    Args:
        album_dict: Dictionary describing one album (year, rank, album title,
            artist name, ...).

    Returns:
        A new dictionary with the same entries as album_dict. Once implemented
        it should also carry the user rating and number of reviewers.
    """

    # Copy so the caller's record is never mutated in place
    meta_album_dict = dict(album_dict)

    # TODO(gretel): insert the Metacritic scraping code here
    # TODO(gretel): make sure to remove the executable path assignment

    return meta_album_dict

## Create list of dictionaries for top movies and music

In [11]:
# Build movie_dict_list and album_dict_list, both lists of dictionaries, to enter into the mongo database
# TODO(gretel): figure out exactly how this works based on what is returned at what point.
# Ideally we end up with one list of dictionaries for movies and one for albums.

# TODO: update code as appropriate based on source code from other scripts

# Scrape BoxOfficeMojo and Billboard Music for lists of dictionaries of the top 10 movies/albums for 2008-2018
movie_BOM_dict_list = movie_scraper()
album_Bill_dict_list = album_scraper()

# Add review information from Metacritic to new list of dictionaries for top movies
movie_dict_list = []
for movie_dict in movie_BOM_dict_list:
    movie_dict_list.append(metacritic_movie_scraper(movie_dict))

# Add review information from Metacritic to new list of dictionaries for top music albums
# (the helper defined above is metacritic_album_scraper; there is no metacritic_music_scraper)
album_dict_list = []
for album_dict in album_Bill_dict_list:
    album_dict_list.append(metacritic_album_scraper(album_dict))


## Populate mongo database

In [None]:
# Connect to mongo using pymongo to create local database
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Create Top 10 database
db = client.top_10_db

# Create movies and albums collections
movies = db.movies
albums = db.albums

# Insert top 10 movies and albums for 2008-2018
# TODO(gretel): figure out whether we want to upsert; plain inserts add the
# documents again every time this cell is re-run
# insert_many rejects an empty document list, so skip a collection with nothing to insert
if movie_dict_list:
    movies.insert_many(movie_dict_list)
else:
    print("No movie records to insert")

if album_dict_list:
    albums.insert_many(album_dict_list)
else:
    print("No album records to insert")

## Testing