# Scrape movie sequel data from Box Office Mojo 

In [200]:
import requests
import time
import random
import pandas as pd
from bs4 import BeautifulSoup

Get data from Box Office Mojo. This doesn't lump the Marvel Cinematic Universe (or other similar cases) together as one series, and it only includes movies with a box office release, because direct-to-video sales data are very spotty.

## Scrape list of franchises and franchise summary webpages

In [69]:
# Download the Box Office Mojo franchise index page and parse it into `soup`.
url = 'http://www.boxofficemojo.com/franchises/'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of silently parsing an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "lxml")

Get list of webpage links for franchises

In [76]:
# Collect the chart-page URL and display name of every franchise linked from
# the index page held in `soup`.
site_link = "http://www.boxofficemojo.com/franchises"
franchise_table = soup.find_all('table')[3]
franchise_links = []
franchise_list = []
# [1:] skips the table's first row (presumably the header row).
for franchise in franchise_table.find_all('tr')[1:]:
    anchor = franchise.find('a')
    if anchor is None:
        # Row carries no link, so there is nothing to collect from it.
        continue
    # Drop the href's first character (presumably a leading '.') so the rest
    # can be appended directly to site_link.
    franchise_links.append(site_link + anchor['href'][1:])
    franchise_list.append(anchor.text)
print(franchise_links[:5])
print(franchise_list[:5])

['http://www.boxofficemojo.com/franchises/chart/?id=3ninjas.htm', 'http://www.boxofficemojo.com/franchises/chart/?id=300.htm', 'http://www.boxofficemojo.com/franchises/chart/?id=agathachristie.htm', 'http://www.boxofficemojo.com/franchises/chart/?id=alexcross.htm', 'http://www.boxofficemojo.com/franchises/chart/?id=aliceinwonderland.htm']
['3 Ninjas', '300', 'Agatha Christie', 'Alex Cross', 'Alice in Wonderland']


In [221]:
# Report how many franchise chart URLs were collected in `franchise_links`.
print(f'There are {len(franchise_links)} movie franchises with at least one box office sequel.')

There are 254 movie franchises with at least one box office sequel.


## Proof of concept for webscrape to data frame pipeline: single franchise

Scrape basic movie data (7 features) from the summary page for each franchise: rank, title, adjusted domestic gross, gross theaters, adjusted opening, adjusted theaters, and release date, plus a list of individual movie webpages for getting more data if needed.

In [210]:
# Proof of concept: scrape one franchise chart page into per-column lists,
# one list entry per movie row.
franchise_name = []
movie_title = []
rank = []
studio = []
adjusted_domestic_gross = []
release = []
movie_webpage = []
# TODO: theater counts are not collected yet. The number of theaters is in the
# 4th table, not the 5th, so it needs a separate loop, e.g.:
#     theater_raw = franchise.find_all('td')[4].text
#     theaters.append(theater_raw.replace(',', ''))
#     # need to convert to datetime object
theaters = []


url = 'http://www.boxofficemojo.com/franchises/chart/?id=3ninjas.htm'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of silently parsing an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, "lxml")

# The page heading is the same for every row, so read it once.
franchise_title = soup.find('h1').text

franchise_table = soup.find_all('table')[5]
# [1:-2] skips the first row and the last two rows of the table
# (presumably the header and summary rows).
for franchise in franchise_table.find_all('tr')[1:-2]:
    cells = franchise.find_all('td')  # parse the row's cells once
    movie_title.append(cells[1].text)
    rank.append(int(cells[0].text))
    studio.append(cells[2].text)
    # Drop the first character (presumably '$') and the thousands separators;
    # the value is stored as a digit string, not a number.
    adj_domestic_gross_str = cells[3].text[1:]
    adjusted_domestic_gross.append(adj_domestic_gross_str.replace(',', ''))
    release_str = cells[5].text
    release.append(pd.to_datetime(release_str))
    franchise_name.append(franchise_title)
    # `site_link` comes from the franchise-index cell earlier in the notebook.
    # NOTE(review): this yields '.../franchises/movies/?id=...' URLs; confirm
    # they resolve, or whether the href should be joined to the site root.
    movie_webpage.append(site_link + franchise.find('a')['href'])

In [211]:
# Assemble the scraped column lists into one DataFrame, one row per movie.
# Keyword order fixes the column order.
df = pd.DataFrame(data=dict(
    franchise_name=franchise_name,
    movie_title=movie_title,
    rank=rank,
    studio=studio,
    adjusted_domestic_gross=adjusted_domestic_gross,
    # n_theaters=theaters,  # left out until theater counts are scraped
    release_date=release,
))

In [212]:
# Display the per-movie page URLs collected during the scrape.
movie_webpage

['http://www.boxofficemojo.com/franchises/movies/?id=3ninjas.htm',
 'http://www.boxofficemojo.com/franchises/movies/?id=3ninjaskickback.htm',
 'http://www.boxofficemojo.com/franchises/movies/?id=3ninjasknuckleup.htm',
 'http://www.boxofficemojo.com/franchises/movies/?id=3ninjas4.htm']

In [213]:
# Preview the first rows of the scraped single-franchise frame.
df.head()

Unnamed: 0,franchise_name,movie_title,rank,studio,adjusted_domestic_gross,release_date
0,3 Ninjas,3 Ninjas,1,BV,64010300,1992-08-07
1,3 Ninjas,3 Ninjas Kick Back,2,TriS,25855900,1994-05-06
2,3 Ninjas,3 Ninjas Knuckle Up,3,Sony,870700,1995-03-10
3,3 Ninjas,3 Ninjas: High Noon at Mega Mountain,4,Sony,734000,1998-04-10


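Create variables 'prior_movie_release_date' and 'prior_movie_studio', which hold the release date and studio of the previous movie in the same franchise; the gap between release dates is computed in the next step. The value for the first movie in each franchise will be NaN (NaT for dates) because there is no previous movie to compare it to.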

In [214]:
# Attach each movie's predecessor (studio and release date) within its
# franchise; the first movie of a franchise gets NaN / NaT.
# - Columns are selected with a list (`[[...]]`): the tuple form
#   `['studio', 'release_date']` on a groupby is deprecated and raises in
#   pandas 2.x.
# - Rows are sorted by release date before shifting so "prior" means the
#   previously released movie, whatever order the rows were scraped in.
prior_movie = (
    df.sort_values(['franchise_name', 'release_date'])
      .groupby('franchise_name')[['studio', 'release_date']]
      .shift(1)
)
# Column assignment aligns on the index, so the sort above does not reorder df.
df['prior_movie_studio'] = prior_movie['studio']
df['prior_movie_release_date'] = prior_movie['release_date']

In [215]:
# Preview the frame with the two prior-movie columns added.
df.head()

Unnamed: 0,franchise_name,movie_title,rank,studio,adjusted_domestic_gross,release_date,prior_movie_studio,prior_movie_release_date
0,3 Ninjas,3 Ninjas,1,BV,64010300,1992-08-07,,NaT
1,3 Ninjas,3 Ninjas Kick Back,2,TriS,25855900,1994-05-06,BV,1992-08-07
2,3 Ninjas,3 Ninjas Knuckle Up,3,Sony,870700,1995-03-10,TriS,1994-05-06
3,3 Ninjas,3 Ninjas: High Noon at Mega Mountain,4,Sony,734000,1998-04-10,Sony,1995-03-10


Get difference between release dates. 

In [216]:
# Gap between each movie's release and its predecessor's release;
# NaT where there is no prior movie.
df['time_since_prior_movie'] = df['release_date'].sub(df['prior_movie_release_date'])

In [217]:
# Preview the frame with the time_since_prior_movie column added.
df.head()

Unnamed: 0,franchise_name,movie_title,rank,studio,adjusted_domestic_gross,release_date,prior_movie_studio,prior_movie_release_date,time_since_prior_movie
0,3 Ninjas,3 Ninjas,1,BV,64010300,1992-08-07,,NaT,NaT
1,3 Ninjas,3 Ninjas Kick Back,2,TriS,25855900,1994-05-06,BV,1992-08-07,637 days
2,3 Ninjas,3 Ninjas Knuckle Up,3,Sony,870700,1995-03-10,TriS,1994-05-06,308 days
3,3 Ninjas,3 Ninjas: High Noon at Mega Mountain,4,Sony,734000,1998-04-10,Sony,1995-03-10,1127 days


The commented-out cell further below gets data from The Numbers, which includes direct-to-video movies. That is interesting, but they don't have sales data for all of those direct-to-video releases, so it's tougher to assess. It might be useful down the line.

## Generalizing to the full dataset: scraping basic features from 254 franchises

In [205]:
# Scrape the basic per-movie features from several franchise chart pages and
# assemble them into one DataFrame (one row per movie).
franchise_name = []
movie_title = []
rank = []
studio = []
adjusted_domestic_gross = []
release = []
movie_webpage = []
# TODO: theater counts are not collected yet. The number of theaters is in the
# 4th table, not the 5th, so it needs a separate loop, e.g.:
#     theater_raw = franchise.find_all('td')[4].text
#     theaters.append(theater_raw.replace(',', ''))
#     # need to convert to datetime object
theaters = []

# Number of franchise pages to scrape in this run; set to None to scrape
# every URL in `franchise_links`.
N_FRANCHISES = 5

for i, url in enumerate(franchise_links[:N_FRANCHISES]):
    if i > 0:
        # Wait a random 5-15 seconds between consecutive requests; no wait is
        # needed before the first request or after the last one.
        sec = random.uniform(5, 15)
        print(f'Sleeping for ~{int(sec)} seconds before next scrape.')
        time.sleep(sec)

    # A timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(url, timeout=30)
    # Fail loudly on a 4xx/5xx reply instead of silently parsing an error page.
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    # The page heading is the same for every row of this page, so read it once.
    franchise_title = soup.find('h1').text

    franchise_table = soup.find_all('table')[5]
    # [1:-2] skips the first row and the last two rows of the table
    # (presumably the header and summary rows).
    for franchise in franchise_table.find_all('tr')[1:-2]:
        cells = franchise.find_all('td')  # parse the row's cells once
        movie_title.append(cells[1].text)
        rank.append(int(cells[0].text))
        studio.append(cells[2].text)
        # Drop the first character (presumably '$') and thousands separators;
        # the value is stored as a digit string, not a number.
        adj_domestic_gross_str = cells[3].text[1:]
        adjusted_domestic_gross.append(adj_domestic_gross_str.replace(',', ''))
        release_str = cells[5].text
        release.append(pd.to_datetime(release_str))
        franchise_name.append(franchise_title)
        # `site_link` comes from the franchise-index cell earlier in the notebook.
        movie_webpage.append(site_link + franchise.find('a')['href'])

# Build the frame once, after all pages are scraped, rather than once per
# movie row; this also yields an empty frame when nothing was scraped.
df = pd.DataFrame(
    {'franchise_name': franchise_name,
     'movie_title': movie_title,
     'rank': rank,
     'studio': studio,
     'adjusted_domestic_gross': adjusted_domestic_gross,
#     'n_theaters': theaters,
     'release_date': release,
    })
df.tail()

Sleeping for ~12 seconds before next scrape.
Sleeping for ~11 seconds before next scrape.
Sleeping for ~10 seconds before next scrape.
Sleeping for ~11 seconds before next scrape.
Sleeping for ~13 seconds before next scrape.


Unnamed: 0,franchise_name,movie_title,rank,studio,adjusted_domestic_gross,release_date
7,Alex Cross,Kiss the Girls,1,Par.,120792000,1997-10-03
8,Alex Cross,Along Came a Spider,2,Par.,119886200,2001-04-06
9,Alex Cross,Alex Cross,3,LG/S,29459100,2012-10-19
10,Alice in Wonderland,Alice in Wonderland (2010),1,BV,385654200,2010-03-05
11,Alice in Wonderland,Alice Through the Looking Glass,2,BV,80883900,2016-05-27


In [None]:
# Disabled exploratory scrape of The Numbers' franchise list. The whole cell
# is a single string literal, so none of the code inside it executes.
"""
url = 'https://www.the-numbers.com/movies/franchises'
response = requests.get(url)
response.status_code
# print(response.text)
page = response.text
soup = BeautifulSoup(page,"lxml")
# soup.find_all('table')
# soup.find('table').find_all('tr')[1:]

movie_count = 0
single = 0
franchises = 0
for link in soup.find('table').find_all('tr')[1:]:
    print(link.find('a')['href']) # put .text instead of ['href'] for the name of the franchise
    n = int(link.find_all('td')[1].text)
    print(link.find_all('td')[1].text) # print the next column - the number of movies
    movie_count += int(link.find_all('td')[1].text)
    if n == 1:
        single += 1
    franchises += 1
print(movie_count)
print(franchises)
print(single)
"""