In [2]:
from __future__ import print_function, division
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [3]:
#Definition for finding information that is next to a text field
def find_text(textsoup, field):
    """Return the stripped text of the element that follows a matched label.

    Searches ``textsoup`` for the first text node matching the regular
    expression ``field`` and calls ``findNext()`` on that node's parent to
    reach the element holding the value.

    Args:
        textsoup: Parsed document exposing ``find(text=...)``.
        field: Regular-expression pattern identifying the label text.

    Returns:
        The following element's ``.text`` with surrounding whitespace removed,
        or ``'N/A'`` when the label is not found or nothing follows it.
    """
    label = textsoup.find(text=re.compile(field))
    if not label:
        return 'N/A'

    # findNext() yields None when the label's parent is the last element in
    # the document; treat that like a missing field instead of crashing.
    value_node = label.parent.findNext()
    if value_node is None:
        return 'N/A'
    return value_node.text.strip()
    
#specific to rundays, find data that contains a certain text string
def find_rundays(textsoup, field):
    """Return the stripped text of the element containing a matching string.

    Looks up the first text node in ``textsoup`` matching the regular
    expression ``field`` and returns the full text of that node's parent.

    Args:
        textsoup: Parsed document exposing ``find(text=...)``.
        field: Regular-expression pattern to search for.

    Returns:
        The parent's ``.text`` with surrounding whitespace removed, or
        ``'N/A'`` when nothing matches.
    """
    match = textsoup.find(text=re.compile(field))
    return match.parent.text.strip() if match else 'N/A'

#this will be used to pass 3d or imax to determine if a movie has IMAX or 3D variants
def find_tech(textsoup, field):
    """Flag whether the page links to the genre chart for ``field``.

    Args:
        textsoup: Parsed document exposing ``find(href=...)``.
        field: Chart id inserted into the ``/genres/chart/?id=<field>.htm``
            href that is searched for.

    Returns:
        1 if an element with that exact href is found, otherwise 0.
    """
    chart_href = "/genres/chart/?id={}.htm".format(field)
    return 1 if textsoup.find(href=chart_href) else 0
    
#this will be used to pass 0 or 1 based on presence of a field
def find_binary(textsoup, field):
    """Flag whether any text in the document matches ``field``.

    Args:
        textsoup: Parsed document exposing ``find(text=...)``.
        field: Regular-expression pattern to search for.

    Returns:
        1 if ``find`` returns a truthy match, otherwise 0.
    """
    pattern = re.compile(field)
    found = textsoup.find(text=pattern)
    return int(bool(found))

In [4]:
#finds all the pieces of information I want
def get_movie_data(row):
    """Download one movie page and extract the fields of interest.

    Args:
        row: URL path of the movie page; it is appended to
            ``https://www.boxofficemojo.com``.

    Returns:
        A list with, in order: the URL path, worldwide gross, MPAA rating,
        production budget, genre, runtime, close date, run days, 3D flag,
        IMAX flag and series flag. Text fields that are not found come back
        as ``'N/A'``; the three flags are 0 or 1.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not respond within the timeout.
    """
    # Without a timeout a single stalled connection would block forever.
    response = requests.get('https://www.boxofficemojo.com{}'.format(row),
                            timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    return [
        row,                                      #movie URL
        find_text(soup, "Worldwide:"),            #WORLDWIDE GROSS
        find_text(soup, 'MPAA Rating:'),          #MPAA RATING
        find_text(soup, 'Production Budget:'),    #production budget
        find_text(soup, 'Genre:'),                #Genre
        find_text(soup, 'Runtime:'),              #Runtime
        find_text(soup, 'Close'),                 #Closedate
        find_rundays(soup, 'days'),               #rundays
        find_tech(soup, '3d'),                    #3D Movie
        find_tech(soup, 'imax'),                  #IMAX
        find_binary(soup, 'Series: '),            #Find Franchises
    ]

In [5]:
#Some useful functions to clean up dataframe
def clean_num(row):
    """Remove every ``$`` and ``,`` character from ``row``.

    Args:
        row: String such as ``'$1,234,567'``.

    Returns:
        The same string with dollar signs and commas stripped out; any other
        characters are left as they are.
    """
    cleaned = row
    for symbol in ('$', ','):
        cleaned = cleaned.replace(symbol, '')
    return cleaned

def clean_rundays(row):
    """Return the first whitespace-separated token of ``row``.

    Args:
        row: String such as ``'120 days / 17.1 weeks'``.

    Returns:
        The leading token (``'120'`` for the example above).

    Raises:
        IndexError: if ``row`` is empty or contains only whitespace.
    """
    # Only the first token is needed, so stop splitting after it.
    leading_token = row.split(None, 1)[0]
    return leading_token

def clean_runtime(row):
    """Convert a runtime string such as ``'1 hrs. 47 min.'`` to minutes.

    The string is split on whitespace. The second-to-last token is read as
    minutes and, when at least four tokens are present, the fourth-to-last
    token is read as hours.

    Args:
        row: Runtime text.

    Returns:
        Total minutes as an ``int``, or the string ``'N/A'`` when ``row`` has
        fewer than two tokens or a token in a number position is not an
        integer.
    """
    tokens = row.split()
    if len(tokens) < 2:
        return 'N/A'

    try:
        total_minutes = int(tokens[-2])
        if len(tokens) >= 4:
            total_minutes += int(tokens[-4]) * 60
    except ValueError:
        # Unexpected wording: fall back to the sentinel rather than aborting
        # the whole DataFrame.apply over one malformed value.
        return 'N/A'
    return total_minutes

In [9]:
# Years to process; switch ranges by swapping which assignment is commented out.
year_range = range(1988,1998)
#year_range = range(2008,2018)

# Positional index of each value returned by get_movie_data -> column name.
column_names = {0: 'url', 1: 'worldwide gross', 2: 'mpaa', 3: 'budget',
                4: 'genre', 5: 'runtime', 6: 'close', 7: 'rundays',
                8: '3d', 9: 'imax', 10: 'series'}

# Columns that get a cleaning pass, paired with the function applied to them.
column_cleaners = (
    ('worldwide gross', clean_num),
    ('rundays', clean_rundays),
    ('runtime', clean_runtime),
)

for year in year_range:
    # Read the CSV generated by the year scrape and keep only its "url" column.
    year_listing = pd.read_csv("../04_Data/{}_movies.csv".format(year))
    url_df = pd.DataFrame(year_listing["url"])

    # Fetch each movie page; every entry of the "data" column is a list of fields.
    url_df["data"] = url_df["url"].apply(get_movie_data)

    # Expand the per-movie lists into one row per movie and name the columns.
    movie_df = pd.DataFrame(url_df["data"].tolist()).rename(columns=column_names)

    # Apply the cleaning functions column by column.
    for column, cleaner in column_cleaners:
        movie_df[column] = movie_df[column].apply(cleaner)

    # Write one '<year>_movies_detail.csv' file per pass of the loop.
    movie_df.to_csv('../04_Data/{}_movies_detail.csv'.format(year))