# Scraping Box Office Mojo
## Part 1
#### Data Collection with Beautiful Soup

This notebook contains the functions used to scrape http://www.boxofficemojo.com. The data collected here is pickled and then used in a separate notebook for cleaning and regression analysis.

In [1]:
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from datetime import datetime
from time import sleep
import pandas as pd
import requests
import pprint
import pickle
import random
import re

In [2]:
def random_proxy(timeout=30):

    ''' Set a new user-agent and retrieve a random proxy

    Downloads the proxy table published at https://sslproxies.org/ and
    picks one of its rows at random.

    Args:
        timeout: Seconds to wait for the proxy listing before giving up,
                 so an unresponsive server cannot stall the scrape.

    Returns:
        (headers, proxies) where headers is a dict holding a random
        "User-Agent" and proxies is a dict mapping 'http' to
        'http://<address>:<port>'.

    Raises:
        requests.RequestException: the listing could not be downloaded.
        RuntimeError: the page holds no usable proxy table.
    '''

    # New agent
    headers = {"User-Agent": UserAgent().random}

    # Soup the proxy page
    proxy_url = "https://sslproxies.org/"
    response = requests.get(url=proxy_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Fail with a readable message if the page layout is not what we expect
    proxies_table = soup.find(id='proxylisttable')
    if proxies_table is None or proxies_table.tbody is None:
        raise RuntimeError("No 'proxylisttable' table found at " + proxy_url)
    rows = proxies_table.tbody.find_all('tr')
    if not rows:
        raise RuntimeError("Proxy table at " + proxy_url + " is empty")

    # Grab a random proxy and port (first two cells of the chosen row);
    # selecting the <td> tags directly skips any whitespace text nodes
    cells = random.choice(rows).find_all('td')
    if len(cells) < 2:
        raise RuntimeError("Unexpected proxy row layout at " + proxy_url)
    proxy = cells[0].get_text().strip()
    port = cells[1].get_text().strip()
    proxies = {'http': 'http://{}:{}'.format(proxy, port)}

    return headers, proxies

In [4]:
def soupify_year(page, year, headers, proxies, timeout=30):

    ''' Soupify a specified year's movie releases

    Args:
        page:    Page number of the yearly chart.
        year:    Release year of the chart.
        headers: Request headers (e.g. the dict from random_proxy()).
        proxies: Proxy mapping passed straight to requests.get.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup object of the downloaded page.

    Raises:
        requests.RequestException: the request failed; the error is
        re-raised after printing so callers can rotate the proxy.
    '''

    # NOTE(review): the query string is kept exactly as it was; the
    # 'page={0}=domestic' fragment looks unusual - confirm against the site.
    year_url = 'http://www.boxofficemojo.com/yearly/chart/?page={0}=domestic&yr={1}&p=.htm'.format(page, year)

    # The request is the step that can actually fail, so it is the guarded one
    try:
        response = requests.get(url=year_url, headers=headers,
                                proxies=proxies, timeout=timeout)
    except requests.RequestException:
        # Soup Nazi says:
        print("No soup for you!")
        raise

    soup = BeautifulSoup(response.text, "lxml")
    print("Status Code: ", response.status_code)
    return soup

In [5]:
def soupify_movie(movie_url, headers, proxies, timeout=30):

    ''' Soupify a movie page for traversing

    Args:
        movie_url: Full URL of the movie page.
        headers:   Request headers (e.g. the dict from random_proxy()).
        proxies:   Proxy mapping passed straight to requests.get.
        timeout:   Seconds to wait for the server before giving up.

    Returns:
        BeautifulSoup object of the downloaded page.

    Raises:
        requests.RequestException: the request failed; the error is
        re-raised after printing so callers can rotate the proxy.
    '''

    # The request is the step that can actually fail, so it is the guarded one
    try:
        response = requests.get(url=movie_url, headers=headers,
                                proxies=proxies, timeout=timeout)
    except requests.RequestException:
        # Soup Nazi says:
        print("No soup for you!")
        raise

    soup = BeautifulSoup(response.text, "lxml")
    print("Status Code: ", response.status_code)
    return soup

In [6]:
def link_list(headers, proxies, start=1980, end=2018, max_retries=3):

    ''' Collects all the URLs of the movies in a list

    Walks the yearly box office charts from `start` to `end` (inclusive),
    requesting pages 1-7 of every year, and builds one movie URL per link
    found. Each URL carries adjust_yr=2017 so grosses are inflation
    adjusted.

    Args:
        headers:     Request headers for the first request.
        proxies:     Proxy mapping for the first request.
        start:       First year to collect.
        end:         Last year to collect.
        max_retries: How many times a failing page is retried, each time
                     with a fresh proxy, before the page is skipped.

    Returns:
        List of movie page URLs (strings).
    '''

    # All movies have an href attribute matching this pattern
    movie_href = re.compile(r'/movies/\?id=')

    movie_links = []
    pages_done = 0
    for yr in range(start, end + 1):
        for pg in range(1, 8):
            table = None
            for attempt in range(max_retries + 1):
                try:
                    # New proxy on every attempt after the first
                    if attempt:
                        headers, proxies = random_proxy()
                    soup = soupify_year(page=pg, year=yr,
                                        headers=headers, proxies=proxies)
                    table = soup.find_all(name='table')[3]
                    break
                except Exception:
                    print("Failure occured")

            if table is None:
                # Give up on this page only; links gathered so far are kept
                print("Skipping year {} page {} after {} attempts".format(
                    yr, pg, max_retries + 1))
            else:
                for link in table.find_all(href=movie_href):
                    movie_links.append(
                        'http://www.boxofficemojo.com/{}&adjust_yr=2017&p=.htm'.format(link['href']))

            pages_done += 1
            print("Index: ", pages_done)

    return movie_links

In [7]:
def traverse_movie(soup):

    ''' Grab features from movie's page

    Args:
        soup: BeautifulSoup object of a movie page.

    Returns a list with, in order:
            Title,
            Adjusted (2017) total domestic grossing,
            Nominal domestic gross (None if not found),
            Nominal worldwide gross (None if not found),
            Release date as datetime object,
            Age as of January 31, 2018 (timedelta),
            Movie genre,
            Runtime in minutes,
            MPAA Rating,
            Budget (Unadjusted)

    Any failure while reading the adjusted gross, release date, runtime
    or budget propagates to the caller; only the two unadjusted lifetime
    grosses are optional.
    '''

    # Title
    TITLE = soup.title.text.split('(')[0].strip()

    # Domestic gross adjusted to 2017 dollars
    # (raw string: '\.' is not a valid escape in a normal string literal)
    money = movie_value(soup, regex=r"Domestic Total Adj\. Gross")
    ADJ_GROSS = clean_money(money=money)

    # Try to grab total lifetime grosses (unadjusted) for domestic and worldwide
    # Percentages and/or foreign numbers can be conjured in data processing
    # Only "value missing / not numeric" errors are treated as "no data";
    # anything else (e.g. KeyboardInterrupt) is allowed through.
    try:
        domestic = domestic_worldwide(soup, regex="Domestic:")
        DOMESTIC = clean_money(money=domestic)
    except (TypeError, AttributeError, ValueError):
        DOMESTIC = None

    try:
        worldwide = domestic_worldwide(soup, regex="Worldwide:")
        WORLD = clean_money(money=worldwide)
    except (TypeError, AttributeError, ValueError):
        WORLD = None

    # Datetime object of release date & age in days, evaluated January 31, 2018
    release_string = movie_value(soup, regex="Release Date: ")
    RELEASE = datetime.strptime(release_string, '%B %d, %Y')
    NOW = datetime(2018, 1, 31)
    AGE = NOW - RELEASE

    # Genre
    GENRE = movie_value(soup, regex="Genre: ")

    # Runtime converted to minutes
    runtime_string = movie_value(soup, regex="Runtime: ")
    RUNTIME = runtime_convert(runtime_string=runtime_string)

    # MPAA Rating
    RATING = movie_value(soup, regex="MPAA Rating: ")

    # Production budget
    budget_string = movie_value(soup, regex="Production Budget: ")
    BUDGET = budget_convert(budget_string)

    # Keanu Reeves and Samuel L. Jackson effect, no minor roles (disabled)
    #KEANU, SAMUEL = actors(soup = soup)

    return [TITLE, ADJ_GROSS, DOMESTIC, WORLD, RELEASE, AGE,
            GENRE, RUNTIME, RATING, BUDGET]  #, KEANU, SAMUEL]

In [8]:
def movie_value(soup, regex):

    ''' Return the text of the node that follows a labelled string

    Looks up the first string in `soup` matching `regex` (case
    insensitive) and returns the `.text` of its next sibling. Returns
    None when either the label or the sibling is missing.
    '''

    label_pattern = re.compile(regex, flags=re.IGNORECASE)
    label = soup.find(text=label_pattern)

    if label:
        neighbour = label.next_sibling
        if neighbour:
            return neighbour.text

    return None

In [9]:
def clean_money(money, numbers_pattern = '[^0-9]'):

    ''' Remove everything matching numbers_pattern from money, return the rest as int '''

    remaining = re.compile(numbers_pattern).sub('', money)
    return int(remaining)

In [10]:
def domestic_worldwide(soup, regex):

    ''' Return the raw text of a domestic or worldwide lifetime gross

    Finds the first string matching `regex` (case insensitive) and returns
    the text of the next <b> tag after that string's parent. Returns None
    when the label is not on the page.
    '''

    label = soup.find(text=re.compile(regex, flags=re.IGNORECASE))

    if not label:
        return None

    amount_tag = label.parent.find_next('b')
    return amount_tag.text

In [11]:
def runtime_convert(runtime_string):

    ''' Converts string of runtime in X hrs. Y mins. to minutes

    Args:
        runtime_string: Text such as "1 hrs. 56 min.", or None when the
                        page had no runtime field.

    Returns:
        Total minutes as int, or None when the input is missing, empty
        or not in the expected "<hours> <unit> <minutes> ..." layout.
    '''

    # A missing field arrives as None; treat it like any other "no runtime"
    if not runtime_string:
        return None

    parts = runtime_string.split()
    try:
        # parts[0] is the hour count, parts[2] the minute count
        return int(parts[0]) * 60 + int(parts[2])
    except (IndexError, ValueError):
        return None

In [12]:
def budget_convert(budget_string, numbers_pattern = '[^0-9]'):

    ''' Converts string of budget into a monetary value if it exists

    Args:
        budget_string:   Text such as "$12.5 million", "$500,000" or
                         "N/A"; None when the page had no budget field.
        numbers_pattern: Regex of characters to strip from a plain
                         (non-"million") amount before int conversion.

    Returns:
        Budget in dollars as int, or None when the budget is missing,
        "N/A", or contains no number.
    '''

    if not budget_string or "N/A" in budget_string:
        return None

    if "million" in budget_string:
        # Keep the decimal point: "$12.5 million" is 12,500,000, whereas
        # stripping every non-digit would read it as 125 million.
        amount = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', budget_string)
        if amount is None:
            return None
        millions = float(amount.group().replace(',', ''))
        return int(round(millions * 1000000))

    digits = re.sub(numbers_pattern, '', budget_string)
    return int(digits) if digits else None

In [13]:
def actors(soup, KEANU = 0, SAMUEL = 0):

    ''' Searches the actors list for Keanu Reeves and Samuel L. Jackson

    Args:
        soup:   BeautifulSoup object of a movie page.
        KEANU:  Value returned for Keanu Reeves when he is not found.
        SAMUEL: Value returned for Samuel L. Jackson when he is not found.

    Returns:
        (KEANU, SAMUEL); each is set to 1 when the actor's name appears
        in the text after the "Actors:" label without a trailing '*'.
    '''

    # This searches for actors field but may be bugged
    # Raw strings: '\*' and '\.' are not valid escapes in normal literals
    keanu = re.compile(r"Keanu Reeves(?!\*)", flags = re.I)
    samuel = re.compile(r"Samuel L\. Jackson(?!\*)", flags = re.I)

    try:
        label = soup.find(text = re.compile("Actors:", flags = re.IGNORECASE))
        print(label)
    except Exception:
        # Unusable soup: report the defaults
        return (KEANU, SAMUEL)

    if label is not None:
        cast = label.next.text
        if keanu.search(cast):
            KEANU = 1
        if samuel.search(cast):
            SAMUEL = 1

    return KEANU, SAMUEL

In [14]:
def scrape(soup, movie, headers, proxies, max_retries=5):

    ''' Traverse a movie page; on failure wait, reset proxy, re-fetch and retry

    Args:
        soup:        BeautifulSoup object of the already downloaded page.
        movie:       URL of the movie page, used to download it again
                     when a retry is needed.
        headers:     Request headers used for the original download.
        proxies:     Proxy mapping used for the original download.
        max_retries: Number of re-downloads (each through a fresh proxy)
                     attempted before giving up.

    Returns:
        The list produced by traverse_movie().

    Raises:
        RuntimeError: every attempt failed; the last underlying error is
        attached as the cause.
    '''

    last_error = None
    for attempt in range(max_retries + 1):
        try:
            if attempt:
                # Retrying the same soup can never succeed, so fetch the
                # page again through a new proxy first
                headers, proxies = random_proxy()
                print(headers, proxies)
                soup = soupify_movie(movie_url=movie, headers=headers,
                                     proxies=proxies)
            return traverse_movie(soup)

        except Exception as error:
            last_error = error
            print("No soup for you! @ ", movie)
            if attempt < max_retries:
                print("... zZz .. (ー。ー) .. zZz ...")
                sleep(1 + 5*random.random())

    raise RuntimeError(
        "Could not scrape {} after {} attempts".format(movie, max_retries + 1)
    ) from last_error

In [15]:
# (☞ ͡° ͜ʖ ͡°)☞  Time to SCRAPE

# Initiate the random agent and proxy
headers, proxies = random_proxy()
print(headers, proxies)

# Get a list of all the movie urls
movies = link_list(headers = headers, proxies = proxies)
%time

{'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'} {'http': 'http://144.217.213.226:80'}
CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 9.06 µs


In [None]:
# link_list() asks for a fixed range of pages per year instead of reading
# the real page count - presumably why it kept grabbing new proxies.
# The crawl is slow, so store the URL list on disk before moving on.

# Total vs. unique count: equal numbers mean no duplicate URLs
total, unique = len(movies), len(set(movies))
print("Movies: ", total, "=", unique)

# Eyeball the first three and the last three URLs
for sample in (movies[:3], movies[-3:]):
    pprint.pprint(sample)

# Persist the list with the highest pickle protocol available
with open('movies_list.pickle', 'wb') as handle:
    pickle.dump(movies, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
#~#~### ༼ つ ◕_◕ ༽つ *Scraping Intensifies* ༼ つ ◕_◕ ༽つ  ###~#~#

# Scrape movies from movie list
# DATA is a list of dictionaries for each record
# FAILED collects (IDX, url) of movies that could not be scraped
# IDX will be used as a counter, different instances should run at different counters
DATA = []
FAILED = []
IDX = 0
columns = ['TITLE', 'ADJ_GROSS', 'DOMESTIC', 'WORLD', 'RELEASE',
           'AGE', 'GENRE', 'RUNTIME', 'RATING', 'BUDGET', "IDX"]

# Bring in our pickled list of movie urls
# (only load pickles written by this notebook; unpickling untrusted
# files can execute arbitrary code)
with open('movies_list.pickle', 'rb') as file:
    movies_list = pickle.load(file)

headers, proxies = random_proxy()
# Change the splicing if running clones
for movie in movies_list[0:]:
    print("Index: ", IDX)

    # Soupify movie and traverse page with scrape(); a failure on one
    # movie must not throw away everything collected so far
    try:
        soup = soupify_movie(movie_url = movie, headers = headers, proxies = proxies)
        movie_data = scrape(soup, movie, headers = headers, proxies = proxies)
    except Exception as error:
        print("Skipping", movie, "->", repr(error))
        movie_data = None
        # The current proxy may be the culprit, start the next movie fresh
        headers, proxies = random_proxy()

    if movie_data is None:
        FAILED.append((IDX, movie))
    else:
        movie_data.append(IDX)
        diccionario = dict(zip(columns, movie_data))
        DATA.append(diccionario)

    IDX += 1

print("Scraped: ", len(DATA), " Failed: ", len(FAILED))

# Pickle the data frame created from our DATA list of dictionaries
df = pd.DataFrame(DATA)
df.to_pickle(path = 'mojo_jojo{}.pickle'.format(IDX))

Index:  0


KeyboardInterrupt: 

In [None]:
# If segmenting failed, pickle data
# Manual fallback: saves whatever records are currently held in DATA.
# NOTE(review): the [0:0] slice (always empty, so this prints 0) and the
# 'X_XXXX' file name look like placeholders meant to be edited by hand
# before running - confirm.
print(len(movies_list[0:0]))
df = pd.DataFrame(DATA)
df.to_pickle(path = 'mojo_jojo_X_XXXX.pickle')

In [None]:
#pprint.pprint(DATA)