In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import pickle
import random
import time



In [3]:
def loadPage(url, timeout=30):
    '''
    Given a page URL, returns a tuple that contains the BeautifulSoup of that page, followed by the text of that page.

    url: The address to download.
    timeout: Seconds to wait on the server before giving up (default 30). Without a timeout,
        requests waits indefinitely, so one stalled connection would hang the whole scrape.

    Connection failures and timeouts propagate as the exceptions raised by requests.get.
    The HTTP status code is not checked: whatever body the server sent is parsed and returned.
    '''
    response = requests.get(url, headers={'User-agent': "Mozilla/5.0"}, timeout=timeout)
    page_text = response.text
    return BeautifulSoup(page_text, 'lxml'), page_text

In [4]:
#Box Office Mojo has two types of pages I want. The first is the weekly domestic box office table,
#and the other is the general summary. 

def find_domestic_link(title_summary_BS):
    '''Builds the weekly domestic release URL for a title.

    Given the BeautifulSoup of a summary page like https://www.boxofficemojo.com/title/tt1568911/,
    finds the link labelled "Domestic" and returns the matching weekly page,
    ex. https://www.boxofficemojo.com/release/rl880313857/weekly/?ref_=bo_rl_tab
    '''
    section = title_summary_BS.find(class_="a-section a-spacing-none a-spacing-top-base")
    domestic_anchor = section.find("a", text="Domestic")
    #The release id is the third "/"-separated piece of the href; anything from "?" onwards is dropped.
    release_id, _, _ = domestic_anchor['href'].split("/")[2].partition("?")
    return "https://www.boxofficemojo.com/release/{}/weekly/?ref_=bo_rl_tab".format(release_id)

def findTitleSummary(weekly_domestics_BS):
    '''Given the BeautifulSoup of the domestic weekly releases, like https://www.boxofficemojo.com/release/rl880313857/weekly/?ref_=bo_rl_tab#tabs, returns
    the summary page URL, ex. https://www.boxofficemojo.com/title/tt1568911/

    Returns None when the page is None, has no title link, or the title link has no href.
    '''
    try:
        title_link = weekly_domestics_BS.find(
            "a", class_='a-link-normal mojo-title-link refiner-display-highlight')
        return "https://www.boxofficemojo.com" + title_link.get("href")
    except (AttributeError, TypeError):
        #AttributeError: the page or the link is None. TypeError: the link has no href.
        #Anything else (notably KeyboardInterrupt) is deliberately allowed to propagate.
        return None
    
def summary_url_to_IMDB_url(box_office_mojo_url):
    '''Given a title summary URL, like https://www.boxofficemojo.com/title/tt1568911/, returns the IMDB URL: https://www.imdb.com/title/tt1568911/

    The title id is taken to be the fifth "/"-separated piece of the URL.
    Returns None when the input is None, is not a string, or has fewer than five pieces.
    '''
    try:
        title_id = box_office_mojo_url.split("/")[4]
    except (AttributeError, IndexError, TypeError):
        #Only the failures a malformed or missing URL can cause; KeyboardInterrupt and the like still propagate.
        return None
    return "https://www.imdb.com/title/" + title_id + "/"

'https://www.boxofficemojo.com/title/tt1568911/?ref_=bo_rl_ti'

In [None]:
#Years covered by the scrape: 2013 through 2019 inclusive. The range stops short of 2020 due to the pandemic.
years_to_scrape = list(range(2013, 2020))

In [6]:
lines = []
links = []
names = []
list_of_weekly_links = []
list_of_bom_ids = []

#First, create a collection of links to scrape. For each year, acquire all movies that released that year.
#
#names, list_of_bom_ids and list_of_weekly_links are filled in lockstep, one entry per table row whose link
#could be parsed. They must stay index-aligned because scrapeWeeklies zips names with list_of_weekly_links.
#(Deriving names from `lines` and the links from `links` separately lets them drift apart: every year's
#header row contributes a "name", while its sort link is rejected and contributes no weekly link.)

for year in years_to_scrape:
    #This page contains all movies released during this year, including rereleases, and sorted by total gross.
    movies_page_url = "https://www.boxofficemojo.com/year/"+str(year)+"/?grossesOption=totalGrosses"
    movies_page = loadPage(movies_page_url)[0]

    movies_table = movies_page.find("table")
    if movies_table is None:
        print("No table found at " + movies_page_url)
        continue

    for row in movies_table.find_all("tr"):
        #Only the first link of each row is considered.
        link = row.find("a")
        href = link.get("href") if link else None
        if link:
            links.append(href)
        cells = [c.text for c in row.find_all("td")+row.find_all("th")]
        lines.append(cells)

        if not href:
            continue
        href_parts = href.split("/")
        if len(href_parts) < 3 or len(cells) < 2:
            #Not a usable movie row (for example the header row, whose link is a "?sort=..." query). Report and skip.
            print(href)
            continue

        #The id is the third "/"-separated piece of the href; the name is the row's second cell.
        this_id = href_parts[2]
        names.append(cells[1])
        list_of_bom_ids.append(this_id)
        list_of_weekly_links.append("https://www.boxofficemojo.com/release/"+this_id+"/weekly/")
    

?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
3580 https://www.boxofficemojo.com/release/rl2708702721/weekly/


In [7]:


#Seed Python's random module without an explicit value, so the seed comes from OS randomness or the current time.
random.seed()
def scrapePage(name, category, url, sleep, dictToAddTo):
    '''Scrapes a page, stores its HTML text in the dictionary, and returns its BeautifulSoup.

    name: The name of the movie for storage in the dictionary.
    category: The page category for storage in the dictionary.
    url: The page to scrape. A falsy value (None, "") is skipped.
    sleep: The wait time, in seconds, after a successful request.
    dictToAddTo: A dictionary. Movies are stored as {movie_name: {"Name": movie_name, category: HTML text}}

    If this category is already saved for the movie, nothing is downloaded or changed.

    Returns the BeautifulSoup of the freshly downloaded page. Returns None when nothing new was
    downloaded: the URL was falsy, the category was already stored, or an error occurred.
    '''
    if not url:
        print("Skipping bad URL for " + name)
        return None

    print("Scraping: "+ name + " At: " + url)
    try:
        if name in dictToAddTo and category in dictToAddTo[name]:
            #It's already there.
            print("Already there: "+ name)
            return None

        page, text = loadPage(url)
        if name in dictToAddTo:
            dictToAddTo[name][category] = text
        else:
            dictToAddTo[name] = {"Name": name, category: text}
        time.sleep(sleep)
        return page
    except Exception:
        #Best effort: one bad page must not stop the run. Catching Exception rather than using a bare
        #except keeps KeyboardInterrupt and SystemExit working, so a long scrape can still be stopped.
        print("Problem with " + url)
        return None

In [9]:

#Resume from the previous run's pickle when one exists; otherwise start with an empty library.
#NOTE: pickle.load can execute arbitrary code, so only load a With_IMDB.p that this notebook wrote itself.
try:
    with open("With_IMDB.p", "rb") as saved_library:
        movielibrary = pickle.load(saved_library)
except FileNotFoundError:
    #First run: nothing has been saved yet.
    movielibrary = {}
except Exception as load_error:
    #An unreadable pickle still falls back to an empty library, but no longer silently.
    print("Could not load With_IMDB.p, starting with an empty library: " + repr(load_error))
    movielibrary = {}

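#Movie scraping happens here.
#Because it's so inefficient and has a tendency to crash due to memory issues (whyyyy did I decide to save the text of each page), I re-pickle
#the library after each scraping stage, so that a crash does not lose the pages already downloaded.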
    
def scrapeWeeklies(dictToAddTo=None):
    """
        Arguments:
        dictToAddTo: Optional dictionary to populate with the contents of a list of links.
                     When omitted, a new empty dictionary is created for this call.
        Returns:
        The same dictionary, but with all pages scraped and added under the column "Domestic Weekly".

        Reads the module-level lists names and list_of_weekly_links, pairing them by position.
    """
    #A `{}` default would be created once and shared by every call that omits the argument,
    #so results from one call would leak into the next. Compare with None, not truthiness:
    #an empty dictionary passed in by the caller must still be the one that gets filled.
    if dictToAddTo is None:
        dictToAddTo = {}
    for name, url in zip(names, list_of_weekly_links):
        scrapePage(name, "Domestic Weekly", url, 1, dictToAddTo)
    return dictToAddTo
scrapeWeeklies(movielibrary)
#Checkpoint after the weekly pages. The with-block closes the file, so the pickle is fully written before the next stage.
with open("With_IMDB.p", "wb") as library_file:
    pickle.dump(movielibrary, library_file)

def scrapeBOMSummaries(dictToAddTo):
    """
        Arguments:
        dictToAddTo: Dictionary of movies to extend. Each entry must already hold its "Domestic Weekly" page text,
                     because the summary URL is read from that page.
        Returns:
        The same dictionary, but with all pages scraped and added under the column "BOM Summary".
    """
    #Iterate over a snapshot of the keys so the loop does not depend on how scrapePage updates the dictionary.
    for name in list(dictToAddTo.keys()):
        target = findTitleSummary(BeautifulSoup(dictToAddTo[name]["Domestic Weekly"], 'lxml'))
        #scrapePage accepts exactly five arguments; a sixth positional argument raises TypeError.
        #None means nothing new was downloaded (bad URL, already stored, or a failed request).
        if scrapePage(name, "BOM Summary", target, 0.2, dictToAddTo) is None:
            print("Error during: "+ name)
    return dictToAddTo
scrapeBOMSummaries(movielibrary)
#Checkpoint after the summary pages. The with-block closes the file, so the pickle is fully written before the next stage.
with open("With_IMDB.p", "wb") as library_file:
    pickle.dump(movielibrary, library_file)

def scrapeIMDB(dictToAddTo):
    """
        Arguments:
        dictToAddTo: Dictionary of movies to extend. Each entry must already hold its "Domestic Weekly" page text,
                     because the IMDB URL is derived from the title link on that page.
        Returns:
        The same dictionary, but with all pages scraped and added under the column "IMDB Page".
    """
    for name in list(dictToAddTo.keys()):
        weekly_page = BeautifulSoup(dictToAddTo[name]["Domestic Weekly"], 'lxml')
        target = summary_url_to_IMDB_url(findTitleSummary(weekly_page))
        #Wait a random 2 to 4 seconds after each IMDB request.
        #None means nothing new was downloaded (bad URL, already stored, or a failed request).
        if scrapePage(name, "IMDB Page", target, 2 + 2*random.random(), dictToAddTo) is None:
            print("Error during: "+ name)
    return dictToAddTo
            
scrapeIMDB(movielibrary)
#Final checkpoint after the IMDB pages. The with-block closes the file, so the pickle is fully written to disk.
with open("With_IMDB.p", "wb") as library_file:
    pickle.dump(movielibrary, library_file)

Scraping: Star Wars: Episode VIII - The Last Jedi At: https://www.boxofficemojo.com/release/rl2708702721/weekly/
Scraping: Beauty and the Beast At: https://www.boxofficemojo.com/release/rl222594561/weekly/
Scraping: Wonder Woman At: https://www.boxofficemojo.com/release/rl578455041/weekly/
Scraping: Jumanji: Welcome to the Jungle At: https://www.boxofficemojo.com/release/rl3095234049/weekly/
Scraping: Guardians of the Galaxy Vol. 2 At: https://www.boxofficemojo.com/release/rl2976089601/weekly/
Scraping: Spider-Man: Homecoming At: https://www.boxofficemojo.com/release/rl863208961/weekly/
Scraping: It At: https://www.boxofficemojo.com/release/rl3481241089/weekly/
Scraping: Thor: Ragnarok At: https://www.boxofficemojo.com/release/rl2959312385/weekly/
Scraping: Despicable Me 3 At: https://www.boxofficemojo.com/release/rl122652161/weekly/
Scraping: Justice League At: https://www.boxofficemojo.com/release/rl1129088513/weekly/
Scraping: Logan At: https://www.boxofficemojo.com/release/rl322932