In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import pickle
import random
import time
import re

In [3]:
def loadPage(url):
    '''
    Given a page URL, returns a tuple that contains the BeautifulSoup of that page, followed by the text of that page.
    '''
    response = requests.get(url, headers={'User-agent': "Mozilla/5.0"})
    return BeautifulSoup(response.text, 'lxml'), response.text

In [4]:
#Box Office Mojo has two types of pages I want. The first is the weekly domestic box office table,
#and the other is the general summary. 

def find_domestic_link(title_summary_BS):
    '''Given the BeautifulSoup of a summary page like https://www.boxofficemojo.com/title/tt1568911/, returns 
    the weekly domestic releases URL, ex. https://www.boxofficemojo.com/release/rl880313857/weekly/?ref_=bo_rl_tab#tabs
    '''
    url_mid = title_summary_BS.find(class_="a-section a-spacing-none a-spacing-top-base").find("a", text="Domestic")['href']
    url_sections = url_mid.split("/")
    main_section = url_sections[2].split("?")[0]
    return "https://www.boxofficemojo.com/release/" + main_section+"/weekly/?ref_=bo_rl_tab"

def findTitleSummary(weekly_domestics_BS):
    '''Given the BeautifulSoup of the domestic weekly relases, like https://www.boxofficemojo.com/release/rl880313857/weekly/?ref_=bo_rl_tab#tabs, returns 
    the summary page URL , ex. https://www.boxofficemojo.com/title/tt1568911/
    '''
    try:
        title_summary = weekly_domestics_BS \
        .find("a", class_='a-link-normal mojo-title-link refiner-display-highlight') \
        .get("href")
        return "https://www.boxofficemojo.com"+title_summary
    except:
        return None
    
def summary_url_to_IMDB_url(box_office_mojo_url):
    '''Given a title summary URL, like https://www.boxofficemojo.com/title/tt1568911/, returns the IMDB URL: https://www.imdb.com/title/tt1568911/'''
    try:
        return "https://www.imdb.com/title/" + box_office_mojo_url.split("/")[4] + "/"
    except:
        return None

In [4]:
#Static definition of years. Range excludes 2020 due to the pandemic.
years_to_scrape = [2014, 2015, 2016, 2017, 2018, 2019]

In [13]:
lines = []
links = []

#First, create a collection of links to scrape. For each year, acquire all movies that released that year.

for year in years_to_scrape:
    #This page contains all movies released during this year, including rereleases, and sorted by total gross.
    movies_page_url = "https://www.boxofficemojo.com/year/"+str(year)+"/?grossesOption=totalGrosses"
    print(movies_page_url)
    movies_page = None
    while (movies_page == None):
        try:
            movies_page = loadPage(movies_page_url)[0]
            movies_table = movies_page.find("table").find_all("tr")
        except:
            print("Exception")
            movies_page = None
            time.sleep(3)
    
        
    
    time.sleep(0.2)

    movies_table = movies_page.find("table").find_all("tr")


    for t in movies_table:
        newline = []
        link = t.find("a")
        if link:
            links.append(link.get("href"))
        newline = [c.text for c in t.find_all("td")+t.find_all("th")]
        lines.append(newline)

    names = [l[1] for l in lines[1:]]
    list_of_weekly_links = []
    for l in links[1:]:
        try:
            this_id = l.split("/")[2]
            list_of_weekly_links.append("https://www.boxofficemojo.com/release/"+this_id+"/weekly/")
        except:
            this_id = None
            print(l)
    

https://www.boxofficemojo.com/year/2014/?grossesOption=totalGrosses
https://www.boxofficemojo.com/year/2015/?grossesOption=totalGrosses
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
https://www.boxofficemojo.com/year/2016/?grossesOption=totalGrosses
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
https://www.boxofficemojo.com/year/2017/?grossesOption=totalGrosses
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
https://www.boxofficemojo.com/year/2018/?grossesOption=totalGrosses
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?sort=rank&grossesOption=totalGrosses&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
?grossesOption=totalGrosses&sort=rank&ref_=bo_yld__resort#table
https://www.boxoffic

In [5]:


def scrape_weekly_to_table(index_name, url, sleep=0, table = [], summary_page_urls = [], movies_to_skip=[]):
    '''
    Scrapes a weekly page from a Box Office Mojo URL, and adds its value to a table, 'table'. 
    index_name: The index, which is the first column. 
    url: The URL to scrape
    sleep: Time to sleep between each call.
    table: The table to add the values in the weekly table to.
    summary_page_urls: Locate the URL for the summary page in this step, and add them to this list.
    '''
    page,text = loadPage(url)
    time.sleep(sleep)
    
    next_ = findTitleSummary(page)
    if next_ != None:
        summary_page_urls.append((index_name, findTitleSummary(page)))
    else: 
        movies_to_skip.add(index_name)
    
    
    main_table = page.find("table")
        
    #The movie name was originally determined by clicking on a link with the name in the Name column.
    #Now, grab the page title to determine what it actually is- "Name".
    title = page.find("h1", class_="a-size-extra-large")
    if main_table!=None and title!=None:        
        
        real_title = title.text
        for row in main_table.find_all("tr")[1:]:
            newrow = [index_name, real_title]

            for item in row.find_all("td"):
                newrow.append(item.text)
            table.append(newrow)
    else:
        movies_to_skip.add(index_name)
    return table, summary_page_urls

def scrape_all_weeklies(cols = ["index_name","Name", "Date", "Rank", "Weekly", "%LW", "Theaters", "Change", "Avg", "To Date", "Week", "Estimated"]):
    '''Returns a dataframe containing weekly data, plus an array of URLS for the next step'''
    table = []
    summary_urls = []
    movies_to_skip = pickle.load(open("Movies to skip.p", "rb"))
    for name, url in zip(names, list_of_weekly_links):
        if name not in movies_to_skip:
            print(name)
            table, summary_urls = scrape_weekly_to_table(name, url, 0.1, table, summary_urls, movies_to_skip)
    pickle.dump(movies_to_skip, open("Movies to skip.p", "wb"))
    return pd.DataFrame(data=table, columns= cols), summary_urls

In [6]:
def scrape_summary_to_table(index_name, url, sleep=0, table=[], imdb_urls = [], movies_to_skip=[]):
    '''
    Scrapes a Box Office Mojo summary page, and builds a list of its properties. 
    '''
    try:
        page, text=loadPage(url)
    except:
        print(index_name)
        print(url)
        return a+1
    
    time.sleep(sleep)

    imdb_urls.append((index_name, summary_url_to_IMDB_url(url)))

    main_table = page.find(class_="a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile")

    if main_table!=None:
        budget_ = main_table.find(text="Budget")
        if budget_ !=None:
            budget = budget_.findNext().text
        else:
            budget = None

        mpaa_ = main_table.find(text="MPAA")
        if mpaa_ != None: 
            mpaa = mpaa_.findNext().text
        else: 
            mpaa=None

        genres_ = main_table.find(text="Genres")
        if genres_ !=None:
            genres = set(re.split(" |\n", genres_.findNext().text))
        else:
            genres = set({})

        opening_ = main_table.find(text="Opening")
        if opening_ !=None:
            opening = opening.findNext().text
        else:
            opening = None

        runtime_ = main_table.find(text="Running Time")
        if runtime_ !=None:
            runtime = runtime_.findNext().text
        else:
            runtime = None

        table.append([index_name, budget, mpaa, genres, opening, runtime])

    else:
        print(index_name)
        movies_to_skip.add(index_name)
    return table, imdb_urls

def scrape_all_summaries(summary_urls, cols = ["index_name", "Budget", "MPAA", "Genres", "Domestic Opening", "Running Time"]):
    '''
    Scrapes all summaries into a DataFrame. Returns that dataframe, plus a table of the next set of URLS.
    '''
    table = []
    imdb_urls = []
    movies_to_skip = pickle.load(open("Movies to skip.p", "rb"))
    for index_name, url in summary_urls:
        if index_name not in movies_to_skip:
            print(index_name)
            table, imdb_urls = scrape_summary_to_table(index_name, url, 0.1, table, imdb_urls, movies_to_skip)
    pickle.dump(movies_to_skip, open("Movies to skip.p", "wb"))
    return pd.DataFrame(data=table, columns= cols), imdb_urls


In [26]:
def scrape_IMDB_to_table(index_name, url, sleep=0, table=[], movies_to_skip=[]):
    
    page, text=loadPage(url)
    time.sleep(sleep)
    
    try:
        release_date = page.find(title="See more release dates").text.split(" ")
        day = release_date[0]
        if len(day)==1:
            day = "0"+day
        month = release_date[1]
        year = release_date[2]
        release_day=datetime.datetime.strptime(month+day+year, "%B%d%Y")
    except:
        release_day = None

    try:
        metacritic_critics = page.find(class_="titleReviewBarItem").findNext().findNext().text
        if " " not in metacritic_critics:
            score = int(metacritic_critics.strip())
        else:
            score = None
    except:
        score = None


    try:
        director = page.find(class_="credit_summary_item").find("a").text
    except: 
        director = None

    try:
        budget = page.find(text="Budget:").findParent().findParent().text
    except:
        budget = None

    try:
        cast = []
        cast_table = page.find("table", class_= "cast_list")
        for t in cast_table.find_all("tr")[1:]:
            name =  t.find_all("td")[1].text
            cast.append(name)
    except:
        cast = None
    
    table.append([index_name, release_day, score, director, cast, budget])
    return table

def scrape_all_imdbs(imdb_urls, cols = ["index_name", "Release Date", "Metacritic Critic Score", "Director", "Cast", "IMDB's Budget"]):
    table = []
    movies_to_skip = pickle.load(open("Movies to skip.p", "rb"))
    for index_name, url in imdb_urls:
        if index_name not in movies_to_skip:
            print(index_name)
            table = scrape_IMDB_to_table(index_name, url, 2+random.random(), table, movies_to_skip)
    pickle.dump(movies_to_skip, open("Movies to skip.p", "wb"))
    return pd.DataFrame(data=table, columns=cols)

In [14]:
weeklies, summary_urls = scrape_all_weeklies()

NameError: name 'names' is not defined

In [23]:
summaries, imdb_urls = scrape_all_summaries(summary_urls)

Guardians of the Galaxy
The Hobbit: The Battle of the Five Armies
Transformers: Age of Extinction
Maleficent
X-Men: Days of Future Past
Big Hero 6
Dawn of the Planet of the Apes
22 Jump Street
Teenage Mutant Ninja Turtles
Neighbors
Ride Along
Ride Along
Rio 2
Into the Woods
Lucy
Night at the Museum: Secret of the Tomb
Mr. Peabody & Sherman
The Maze Runner
Non-Stop
Heaven Is for Real
The Imitation Game
Fury
Annabelle
Penguins of Madagascar
Let's Be Cops
The Monuments Men
The Purge: Anarchy
Exodus: Gods and Kings
The Nut Job
Son of God
The Grand Budapest Hotel
Planes: Fire & Rescue
RoboCop
Dracula Untold
The Hundred-Foot Journey
Selma
Muppets Most Wanted
Ouija
The Boxtrolls
The Book of Life
About Last Night
Into the Storm
The Judge
Blended
St. Vincent
A Million Ways to Die in the West
Birdman or (The Unexpected Virtue of Ignorance)
The Expendables 3
The Expendables 3
Earth to Echo
Earth to Echo
The Theory of Everything
The Theory of Everything
This Is Where I Leave You
Paranormal Activit

In [27]:
so_far = summaries, weeklies, imdb_urls
pickle.dump(so_far, open("in_progress.p", "wb"))

NameError: name 'imdbs' is not defined

In [8]:
movielibrary = pickle.load(open("in_progress.p", "rb"))

In [9]:
summaries, weeklies, imdb_urls = movielibrary

In [None]:
imdbs = scrape_all_imdbs(imdb_urls)

In [28]:
all_frames = weeklies, summaries, imdbs

pickle.dump(all_frames, open("Scraped_data.p", "wb"))