In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Function to convert money string to integer
def money_str_int(str):
    """Convert a formatted dollar amount such as ``'$936,662,225'`` to an int.

    Parameters
    ----------
    str : str
        Text holding a whole-number amount, optionally wrapped in whitespace,
        prefixed/suffixed with ``$`` and using ``,`` as a thousands separator.
        (The name shadows the builtin ``str``; it is kept so existing callers
        that pass it by keyword continue to work.)

    Returns
    -------
    int
        The amount with the ``$`` and commas removed.

    Raises
    ------
    ValueError
        If the remaining text is not an integer literal.
    """
    # Trim whitespace before stripping '$': otherwise a leading space or
    # newline shields the '$' from strip('$') and int() rejects the value.
    digits = str.strip().strip('$').replace(',', '')
    return int(digits)

In [3]:
#New Data

def get_table(soup):
    """Turn the first ``<table>`` of a parsed chart page into a DataFrame.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a page that contains at least one ``<table>``.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table. Columns are the table's ``<th>``
        texts (stripped) followed by ``'URL'``. The cell at position 2 of each
        row is converted with ``money_str_int``; ``'URL'`` is the site base
        URL joined with the ``href`` of the row's first link, or ``None`` when
        the row has no usable link.

    Raises
    ------
    IndexError
        If the page contains no ``<table>``.
    ValueError
        If the money cell of a row cannot be parsed as an integer.
    """
    base_url = 'https://www.boxofficemojo.com'
    gross_col = 2  # position of the money cell that is converted to int

    table = soup.find_all('table')[0]

    # Take header cells from the table being parsed, so <th> elements that
    # belong to other parts of the page cannot desync the column names.
    headers = [col.text.strip() for col in table.find_all('th')]
    headers.append('URL')

    data = []
    # The first <tr> is skipped, as before (it is treated as the header row).
    for row in table.find_all('tr')[1:]:
        # Text of every data cell, exactly as displayed on the page
        cells = [cell.text for cell in row.find_all('td')]
        if len(cells) <= gross_col:
            # Not a data row (e.g. a spacer or nested header row): nothing to convert
            continue
        cells[gross_col] = money_str_int(cells[gross_col])

        # First link in the row points at the movie's own page
        link = row.find('a')
        href = link.get('href') if link is not None else None
        cells.append(base_url + href if href else None)

        data.append(cells)

    return pd.DataFrame(data, columns=headers)

In [4]:
# Scrape the lifetime-gross chart from Box Office Mojo, following the "next"
# link from page to page (the original note describes this as the top 1000
# grossing movies).
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
base_url = 'https://www.boxofficemojo.com'
n_pages = 5

pages = []
for i in range(n_pages):
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    fetched_data = get_table(soup)
    pages.append(fetched_data)
    # Resolve the next page's URL, except after the final page where it is not needed
    if i < n_pages - 1:
        url = base_url + soup.find_all('li', class_='a-last')[0].find('a').attrs["href"]

# DataFrame.append was removed in pandas 2.0; gather the per-page frames and
# concatenate once, which also avoids re-copying the frame on every iteration.
df_movie = pd.concat(pages, ignore_index=True)

df_movie.head()

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL
0,1,Star Wars: Episode VII - The Force Awakens,936662225,2015,https://www.boxofficemojo.com/title/tt2488496/...
1,2,Avengers: Endgame,858373000,2019,https://www.boxofficemojo.com/title/tt4154796/...
2,3,Avatar,760507625,2009,https://www.boxofficemojo.com/title/tt0499549/...
3,4,Black Panther,700426566,2018,https://www.boxofficemojo.com/title/tt1825683/...
4,5,Avengers: Infinity War,678815482,2018,https://www.boxofficemojo.com/title/tt4154756/...


In [5]:
# Fetch a single movie's page (one URL from the dataframe above) and pull its detail fields
def get_movie_data(url):
    """Fetch one movie page and return its summary fields as a one-row DataFrame.

    Parameters
    ----------
    url : str
        Address of the movie page to download.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``Distributor``, ``Budget``, ``Rating``,
        ``Running_Time_hrs`` and ``Genres``. A field keeps the string
        ``'No Data'`` when the page does not provide it; otherwise ``Budget``
        is an int, ``Running_Time_hrs`` is hours rounded to two decimals and
        ``Genres`` is a comma-separated string.

    Raises
    ------
    ValueError
        If a budget or running-time value is present but not numeric.
    """
    movie_page = requests.get(url)
    movie = BeautifulSoup(movie_page.content, 'html.parser')
    # The summary section is laid out with <div>s rather than a <table>
    divs = movie.find_all('div', class_='a-section a-spacing-none')

    # Every field stays 'No Data' unless the page supplies it
    distributor = 'No Data'
    budget = 'No Data'
    rating = 'No Data'
    duration = 'No Data'
    genres = 'No Data'

    known_labels = ('Domestic Distributor', 'Budget', 'MPAA', 'Running Time', 'Genres')

    for div in divs:
        spans = div.find_all('span')
        for i, span in enumerate(spans):
            label = span.text
            if label not in known_labels:
                continue

            # The value lives in the span right after its label. If the label
            # is the last span there is nothing to read, so keep 'No Data'
            # instead of raising IndexError.
            if i + 1 < len(spans):
                value = spans[i + 1].text
                if label == 'Domestic Distributor':
                    # Drop the extra link text that trails the company name
                    distributor = value.replace('See full company information\n\n', '')
                elif label == 'Budget':
                    budget = money_str_int(value)
                elif label == 'MPAA':
                    rating = value
                elif label == 'Running Time':
                    duration = _parse_running_time(value)
                else:  # 'Genres'
                    # One genre per line, padded with whitespace; trim each
                    # piece rather than deleting every space in the text.
                    genres = ','.join(
                        piece.strip() for piece in value.splitlines() if piece.strip()
                    )
            # As before, only the first recognised label in each div is used
            break

    movie_row = pd.DataFrame([distributor, budget, rating, duration, genres]).transpose()
    movie_row.columns = ['Distributor', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres']

    return movie_row


def _parse_running_time(text):
    """Convert text like ``'2 hr 18 min'``, ``'2 hr'`` or ``'45 min'`` to hours (2 decimals)."""
    tokens = text.split()
    hours = 0.0
    recognised = False
    # Tokens arrive as (number, unit) pairs. Read the unit instead of relying
    # on position, so a minutes-only value is not mistaken for hours.
    for amount, unit in zip(tokens[0::2], tokens[1::2]):
        unit = unit.lower()
        if unit.startswith('h'):
            hours += float(amount)
        elif unit.startswith('m'):
            hours += float(amount) / 60
        else:
            continue
        recognised = True
    return round(hours, 2) if recognised else 'No Data'

In [6]:
# Fetch details for the first 500 URLs of df_movie (the scrape is split into
# two cells because of timeout issues).
movie_frames = []
for url in df_movie['URL'][:500]:
    movie_data = get_movie_data(url)
    movie_frames.append(movie_data)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every call; collect the one-row frames and concatenate them once instead.
if movie_frames:
    df_movie_2 = pd.concat(movie_frames, ignore_index=True)
else:
    # Nothing to fetch: keep the empty frame with the expected columns
    df_movie_2 = pd.DataFrame(columns = ['Distributor', 'Budget', 'Rating', 'Running_Time_hrs', 'Genres'])
df_movie_2

Unnamed: 0,Distributor,Budget,Rating,Running_Time_hrs,Genres
0,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
3,Walt Disney Studios Motion Pictures,No Data,PG-13,2.23,"Action,Adventure,Sci-Fi"
4,Walt Disney Studios Motion Pictures,No Data,PG-13,2.48,"Action,Adventure,Sci-Fi"
...,...,...,...,...,...
495,Walt Disney Studios Motion Pictures,50000000,PG,2.08,"Adventure,Comedy,Drama,Fantasy,Musical"
496,United Artists,No Data,No Data,1.52,"Drama,Sport"
497,Walt Disney Studios Motion Pictures,85000000,PG,1.78,"Animation,Comedy,Family,Fantasy,Musical,Romance"
498,Paramount Pictures,150000000,PG-13,2.77,"Drama,Fantasy,Romance"


In [7]:
# Fetch details for the remaining URLs of df_movie (position 500 onward) and
# add them below the rows already held in df_movie_2.
movie_frames = []
for url in df_movie['URL'][500:]:
    movie_data = get_movie_data(url)
    movie_frames.append(movie_data)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# Only the first 500 existing rows are kept, so running this cell a second
# time replaces the second half rather than duplicating it.
df_movie_2 = pd.concat([df_movie_2.iloc[:500], *movie_frames], ignore_index=True)

In [8]:
# Re-label the detail columns, place the chart rows and their scraped details
# side by side, and cache the combined frame on disk.
df_movie_2.columns = [
    'Distributor',
    'Budget',
    'Rating',
    'Running_Time_hrs',
    'Genres',
]
# join='inner' keeps only the index labels present in both frames
frames_side_by_side = [df_movie, df_movie_2]
result = pd.concat(frames_side_by_side, join='inner', axis=1)
result.to_pickle('cats_bomojo_data.pkl')

In [9]:
# RUN THIS CELL TO LOAD CAT'S DATA
# Reads back the pickle written by result.to_pickle(...) in this notebook, so
# the scrape does not have to be repeated. Only unpickle files you trust.
df_movie_3 = pd.read_pickle('cats_bomojo_data.pkl')
df_movie_3.iloc[:3]

Unnamed: 0,Rank,Title,Lifetime Gross,Year,URL,Distributor,Budget,Rating,Running_Time_hrs,Genres
0,1,Star Wars: Episode VII - The Force Awakens,936662225,2015,https://www.boxofficemojo.com/title/tt2488496/...,Walt Disney Studios Motion Pictures,245000000,PG-13,2.3,"Action,Adventure,Sci-Fi"
1,2,Avengers: Endgame,858373000,2019,https://www.boxofficemojo.com/title/tt4154796/...,Walt Disney Studios Motion Pictures,356000000,PG-13,3.02,"Action,Adventure,Drama,Sci-Fi"
2,3,Avatar,760507625,2009,https://www.boxofficemojo.com/title/tt0499549/...,Twentieth Century Fox,237000000,PG-13,2.7,"Action,Adventure,Fantasy,Sci-Fi"
