In [50]:
# Data wrangling
import pandas as pd
import numpy as np

# Time
import time
import datetime
from datetime import datetime as dt
import dateutil.parser

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# URL Grabbing
import requests

# Scraping / Searching
from bs4 import BeautifulSoup
import re

# Misc
import pickle
from pprint import pprint

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x) # render floats with two decimal places whenever pandas displays them

In [10]:
def scrape_bom(url):
    '''Placeholder for the BoxOfficeMojo.com page scraper (not implemented yet).

    The intended contract is to scrape the page at ``url`` and return a dictionary with key value
    pairs that can be built into a dataframe. As written, the body only evaluates ``url`` and the
    function returns None.
    '''
    # TODO: implement the scraping; this bare expression is a no-op.
    url

In [11]:
def get_genre_links():
    '''Fetches the BoxOfficeMojo genre index and builds one chart URL per genre.

    Returns:
        list: absolute URLs, in page order, one for every anchor on the index page whose href
        matches "/chart/?id=".
    '''
    url = 'http://www.boxofficemojo.com/genres/'
    index_soup = BeautifulSoup(requests.get(url).text, "lxml")

    chart_anchors = index_soup.find_all(href=re.compile(r"\/chart\/\?id="))

    # The first two characters of every href are dropped before it is appended to the genre base
    # URL (presumably a leading "./" -- confirm against the live markup).
    return [url + anchor['href'][2:] for anchor in chart_anchors]

In [12]:
def get_pages(soup):
    '''Collects the pagination links found on a parsed genre chart page.

    Args:
        soup: parsed page exposing ``find_all(href=...)`` (a BeautifulSoup object); it is searched
            for anchors whose href contains "pagenum=".

    Returns:
        dict: maps each page number (as a string, e.g. '2') to the absolute URL of that page.
        Only the first link seen for a given page number is kept. Empty when the page has no
        pagination links.
    '''
    base_url = 'http://www.boxofficemojo.com/'
    pagenum_regex = re.compile(r"pagenum=(\d+)")
    pagedict = {}
    for atag in soup.find_all(href=re.compile(r"pagenum=")):
        href = atag['href']
        match = pagenum_regex.search(href)
        if match is None:
            # "pagenum=" without any digits: there is no page number to key on, so skip the link
            # instead of raising IndexError.
            continue
        # hrefs may already start with "/"; strip it so the joined URL does not contain "//".
        pagedict.setdefault(match.group(1), base_url + href.lstrip('/'))
    return pagedict

In [47]:
def _load_soup(url):
    '''Requests url and returns its parsed soup, or None (after printing the status) on a non-2xx reply.'''
    response = requests.get(url)
    # The previous check (`< 200 and > 300`) could never be true, so error pages were parsed as
    # if they had loaded; anything outside 2xx is now reported and skipped.
    if not 200 <= response.status_code < 300:
        print(url, 'failed with status code', response.status_code)
        return None
    return BeautifulSoup(response.text, "lxml")


def parse_genre(genre_links):
    '''Visits every genre page (and its extra "pagenum" pages) and gathers the movie links.

    Args:
        genre_links: iterable of genre chart URLs, e.g. the list returned by get_genre_links().

    Returns:
        set: the unique movie URLs found across all pages that loaded. A genre page or result page
        that answers with a non-2xx status is reported with print() and skipped. Empty when
        genre_links is empty.
    '''
    movie_links = set()
    for url in genre_links:
        soup = _load_soup(url)
        if soup is None:
            continue

        # links to the remaining result pages of this genre, keyed by page number
        page_num_links = get_pages(soup)

        # movies listed on the first page of the genre
        movie_links.update(get_movies(soup))

        # movies listed on the other pages
        for link in page_num_links.values():
            page_soup = _load_soup(link)
            if page_soup is None:
                continue
            movie_links.update(get_movies(page_soup))

    return movie_links

In [34]:
def get_movies(soup):
    '''Builds an absolute URL for every movie anchor in a parsed page.

    Args:
        soup: parsed page exposing ``find_all(href=...)`` (a BeautifulSoup object).

    Returns:
        list: 'http://www.boxofficemojo.com' + href for each anchor whose href matches
        "/movies/?id=", in page order; repeated anchors are kept.
    '''
    movie_href = re.compile(r"\/movies\/\?id=")
    return ['http://www.boxofficemojo.com' + anchor['href']
            for anchor in soup.find_all(href=movie_href)]
        
        #         raw_movie_links = soup.find_all(href=re.compile(r"\/movies\/\?id="))
#         for raw_link in raw_movie_links:
#             link = 'http://www.boxofficemojo.com/' + raw_link['href']
#             movie_links.append(link)

# Testing Functions

In [167]:
# Scratch check: iterating dict.items() yields (key, value) pairs, the same pattern
# parse_genre uses when it walks the page-number links.
hi = {'a':1, 'b':2, 'c':4}
for k,v in hi.items():
    print(k, v)

a 1
b 2
c 4


In [35]:
# Build the list of genre chart URLs (get_genre_links performs a live request to the genre index page)
genre_links = get_genre_links()

In [36]:
genre_links

['http://www.boxofficemojo.com/genres/chart/?id=3d.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=actionbuddycomedy.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=martialarts.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=actionsequals.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=wirefu.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=actionheroine.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=actionremake.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=desertadventure.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=adventureperiod.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=adventureremake.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=animation.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=anime.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=computeranimation.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=animatedfantasy.htm',
 'http://www.boxofficemojo.com/genres/chart/?id=animationmotionca

In [40]:
len(genre_links)

215

In [48]:
# Crawl every genre: one request per genre page plus one per extra "pagenum" page.
# The result is a set, so a movie listed under several genres appears once.
all_movie_links = parse_genre(genre_links)

In [49]:
len(all_movie_links)

9969

In [53]:
# Convert the set to a list; set iteration order is arbitrary, so the list order carries no meaning.
lst_all_movie_links = list(all_movie_links)

In [57]:
lst_all_movie_links

['http://www.boxofficemojo.com/movies/?id=othersideofthemountainpartii.htm',
 'http://www.boxofficemojo.com/movies/?id=saw4.htm',
 'http://www.boxofficemojo.com/movies/?id=universalsoldier.htm',
 'http://www.boxofficemojo.com/movies/?id=jackal.htm',
 'http://www.boxofficemojo.com/movies/?id=betterthanchocolate.htm',
 'http://www.boxofficemojo.com/movies/?id=mommiedearest.htm',
 'http://www.boxofficemojo.com/movies/?id=gbf.htm',
 'http://www.boxofficemojo.com/movies/?id=newtown.htm',
 'http://www.boxofficemojo.com/movies/?id=murderball.htm',
 'http://www.boxofficemojo.com/movies/?id=terraferma.htm',
 'http://www.boxofficemojo.com/movies/?id=mebeforeyou.htm',
 'http://www.boxofficemojo.com/movies/?id=talladeganights.htm',
 'http://www.boxofficemojo.com/movies/?id=cruising.htm',
 'http://www.boxofficemojo.com/movies/?id=abcd2.htm',
 'http://www.boxofficemojo.com/movies/?id=whenthesearises.htm',
 'http://www.boxofficemojo.com/movies/?id=seainside.htm',
 'http://www.boxofficemojo.com/movies

In [54]:
import pickle

# Write the list of movie links to BOMojo_movie_links.pkl (binary pickle) in the working directory.
with open('BOMojo_movie_links.pkl', 'wb') as picklefile:
    pickle.dump(lst_all_movie_links, picklefile)

In [56]:
!ls -AF

[34m.git[m[m/                   BOMojo_movie_links.pkl  scraping_testing.ipynb
.gitignore              README.md               [34mtutorial[m[m/
[34m.ipynb_checkpoints[m[m/     scraping-boxmojo.ipynb


In [159]:
# Test getting movies from a single genre page
# Note: rebinds the notebook-level names url, genre_page and soup.
url = 'http://www.boxofficemojo.com/genres/chart/?id=3d.htm'
genre_page = requests.get(url)
soup = BeautifulSoup(genre_page.text, "lxml")

# number of movie links found on the first page only (no pagenum pages are followed here)
print(len(get_movies(soup)))

178


In [171]:
# Test parse_genre end to end.
# NOTE(review): the saved output of this cell shows a printed page-link dict and movie list, which
# the current parse_genre does not print (its pprint calls are commented out) -- it appears to come
# from an earlier version of the function; re-run to refresh.
movie_links = parse_genre(genre_links)

{'2': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=2&id=3d.htm', '3': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=3&id=3d.htm', '4': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=4&id=3d.htm'}
['http://www.boxofficemojo.com//movies/?id=bossbaby.htm',
 'http://www.boxofficemojo.com//movies/?id=starwars7.htm',
 'http://www.boxofficemojo.com//movies/?id=avatar.htm',
 'http://www.boxofficemojo.com//movies/?id=jurassicpark4.htm',
 'http://www.boxofficemojo.com//movies/?id=avengers11.htm',
 'http://www.boxofficemojo.com//movies/?id=starwars2016.htm',
 'http://www.boxofficemojo.com//movies/?id=pixar2015.htm',
 'http://www.boxofficemojo.com//movies/?id=avengers2.htm',
 'http://www.boxofficemojo.com//movies/?id=beautyandthebeast2017.htm',
 'http://www.boxofficemojo.com//movies/?id=toystory3.htm',
 'http://www.boxofficemojo.com//movies/?id=ironman3.htm',
 'http://www.boxofficem

In [37]:
# NOTE(review): the name says "3d", but the call passes the full genre_links list. The saved
# count for this cell (384) differs from the 9969 links the same call produced for
# all_movie_links, so this output presumably came from a different genre_links or an earlier
# parse_genre -- confirm by running the notebook top to bottom on a fresh kernel.
_3d_links = parse_genre(genre_links)

In [38]:
len(_3d_links)

384

In [39]:
_3d_links

{'http://www.boxofficemojo.com/movies/?id=300sequel.htm',
 'http://www.boxofficemojo.com/movies/?id=3d06.htm',
 'http://www.boxofficemojo.com/movies/?id=47ronin.htm',
 'http://www.boxofficemojo.com/movies/?id=abcd2013.htm',
 'http://www.boxofficemojo.com/movies/?id=abrahamlincolnvampirehunter.htm',
 'http://www.boxofficemojo.com/movies/?id=airracers.htm',
 'http://www.boxofficemojo.com/movies/?id=alice2.htm',
 'http://www.boxofficemojo.com/movies/?id=aliceinwonderland10.htm',
 'http://www.boxofficemojo.com/movies/?id=aliensofthedeep.htm',
 'http://www.boxofficemojo.com/movies/?id=allyouneediskill.htm',
 'http://www.boxofficemojo.com/movies/?id=alphaandomega.htm',
 'http://www.boxofficemojo.com/movies/?id=amazingspiderman2.htm',
 'http://www.boxofficemojo.com/movies/?id=amazingspiderman3.htm',
 'http://www.boxofficemojo.com/movies/?id=amazingspiderman4.htm',
 'http://www.boxofficemojo.com/movies/?id=americanmustang.htm',
 'http://www.boxofficemojo.com/movies/?id=amityville3d.htm',
 'htt

# Determining movie links within a genre

In [3]:
# Exploration: load a single genre page (3D) to work out how to extract each movie link.
# Note: rebinds the notebook-level names url, page and soup.
url = 'http://www.boxofficemojo.com/genres/chart/?id=3d.htm'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

In [4]:
# Find every anchor whose href matches "/movies/?id=" and keep the unique URLs as dict keys
# (the values are unused placeholders).
raw_movie_links = soup.find_all(href=re.compile(r"\/movies\/\?id="))
movie_links = {}
for raw_link in raw_movie_links:
    # NOTE(review): the base URL ends with "/"; if the href also starts with "/" (as the pagenum
    # hrefs on this page do) the result contains "//". get_movies omits the trailing slash.
    link = 'http://www.boxofficemojo.com/' + raw_link['href']
#     print(link)
    movie_links[link] = None

In [6]:
len(movie_links)

178

# Determining pagenums

In [93]:
# All anchors on the current genre page whose href contains "pagenum=" (the pagination links).
pagenums = soup.find_all(href=re.compile(r"pagenum="))

In [97]:
pagenums[0]['href']

'/genres/chart/?view=main&sort=gross&order=DESC&pagenum=2&id=3d.htm'

In [106]:
# Prototype of get_pages: map each page number (string) to the first link seen for that page.
pagenum_regex = re.compile(r"pagenum=(\d+)")
pagedict = {}
for atag in pagenums:
    # first captured page number in the href; raises IndexError if there are no digits after "pagenum="
    val = pagenum_regex.findall(atag['href'])[0]
    
    if val not in pagedict:
        # the hrefs start with "/", so joining them to a base ending in "/" yields the "//" seen
        # in the displayed pagedict
        pagedict[val] = 'http://www.boxofficemojo.com/' + atag['href']
    
# pagenum_regex.findall(pagenums[0]['href'])
# pagenums[0]['href']

In [107]:
pagedict

{'2': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=2&id=3d.htm',
 '3': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=3&id=3d.htm',
 '4': 'http://www.boxofficemojo.com//genres/chart/?view=main&sort=gross&order=DESC&pagenum=4&id=3d.htm'}