In [8]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import pickle
from datetime import datetime
import os

In [6]:
def save_data(filename, data):
    """Pickle ``data`` and write the resulting bytes to ``filename``.

    The file is opened in binary write mode, so any existing file at that
    path is replaced.
    """
    with open(filename, 'wb') as out_file:
        out_file.write(pickle.dumps(data))

def load_data(filename):
    """Open ``filename`` in binary mode and return the object pickled in it.

    NOTE: unpickling can run arbitrary code, so this should only be pointed
    at files that come from a trusted source.
    """
    with open(filename, 'rb') as in_file:
        restored = pickle.load(in_file)
    return restored
    
def scrape(url):
    """Scrape a film's Wikipedia infobox into a dictionary.

    The page at ``url`` is downloaded, the element with class
    ``infobox vevent`` is located, and its table rows are converted into
    dictionary entries.  ``get_op_and_rating`` is then called with the film
    title to add the opening gross and the rating.

    Row handling is best-effort: a row that cannot be parsed is skipped so
    that one odd infobox entry does not lose the whole film.

    Args:
        url: Address of the article to scrape.

    Returns:
        dict containing ``'Title'``, ``'Box_office_opening'`` and
        ``'Rating'``, plus whichever of ``'Starring'``, ``'Lead'``,
        ``'Production companies'``, ``'Running_time_min'``,
        ``'Release_date_dt'``, ``'Release_month'``, ``'Budget'`` and
        ``'Box_office'`` could be parsed.  Any other infobox row is stored
        under its own header text.

    Raises:
        AttributeError: if the page has no ``infobox vevent`` element or its
            first row has no header cell.
        Errors raised by ``requests`` or ``get_op_and_rating`` propagate.
    """
    import logging

    page = requests.get(url)
    soup = bs(page.content, 'html.parser')
    infobox = soup.find(class_='infobox vevent')
    rows = infobox.find_all('tr')

    title = rows[0].find('th').get_text()
    movie_data = {'Title': title}

    # Rows 0 and 1 are never stored as fields (row 0 supplied the title).
    for row in rows[2:]:
        header_cell = row.find('th')
        if header_cell is None:
            # Without a header cell there is no label to store a value under.
            continue
        # Read the label once, before clean_tags() strips nested tags.
        header = header_cell.get_text()
        try:
            if header == 'Based on':
                continue
            if header == 'Starring':
                clean_tags(row)
                starring = clean(row)
                movie_data['Starring'] = starring
                # clean() returns a list for multi-entry cells but a plain
                # string otherwise; indexing a string would yield only its
                # first character, so use the whole (stripped) name instead.
                if isinstance(starring, list):
                    movie_data['Lead'] = starring[0]
                else:
                    movie_data['Lead'] = starring.strip()
            elif 'Production' in header:
                clean_tags(row)
                movie_data['Production companies'] = clean(row)
            elif header == 'Running time':
                clean_tags(row)
                movie_data['Running_time_min'] = clean(row)
            elif header == 'Release date':
                clean_tags(row)
                release = dt_conversion(clean(row).strip())
                if release is None:
                    # Unrecognised date layout: store nothing for this row.
                    continue
                movie_data['Release_date_dt'] = release
                movie_data['Release_month'] = release.month
            elif header == 'Budget':
                movie_data['Budget'] = money_convert(row)
            elif header == 'Box office':
                movie_data['Box_office'] = money_convert(row)
            else:
                clean_tags(row)
                # Re-read the label after cleaning so removed tags do not
                # end up in the column name.
                column = row.find('th').get_text(' ', strip=True)
                movie_data[column] = clean(row)
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt/SystemExit still stop a long scraping run.
            logging.getLogger(__name__).debug(
                'Skipping infobox row %r of %s', header, url, exc_info=True)
            continue

    opening, rating = get_op_and_rating(title)
    movie_data['Box_office_opening'] = opening
    movie_data['Rating'] = rating

    return movie_data
    
    
    
def get_op_and_rating(title):
    """Fetch a film's opening gross and MPAA rating from Box Office Mojo.

    The site's search page is queried for ``title`` and the first link whose
    text contains the first ten characters of the title (an exact match
    satisfies this too) is followed.  If no link matches, the link at
    position 14 of the search page is used instead.

    On the film page, the first link whose text contains ``'$'`` is parsed
    as the opening gross, and the ``span`` following the first ``span`` that
    mentions ``'MPAA'`` is taken as the rating.

    Args:
        title: Film title to search for.

    Returns:
        A two-item list ``[opening, rating]``.  ``opening`` is a float, or
        ``None`` when no dollar amount link was found; ``rating`` is a
        string, or ``'Unknown'`` when no MPAA entry was found.  The defaults
        are also returned when no film page could be determined.

    Raises:
        ValueError: if the first ``'$'`` link is not a plain dollar amount.
        Errors raised by ``requests`` propagate.
    """
    # Pass the title through ``params`` so it is URL-encoded; concatenating
    # it raw breaks titles that contain '&', '#' or '+'.
    search_page = requests.get('https://www.boxofficemojo.com/search/',
                               params={'q': title})
    search_soup = bs(search_page.content, 'html.parser')
    search_links = search_soup.find_all('a')

    path = ''
    prefix = title[:10]
    for link in search_links:
        href = link.get('href')
        # Anchors without an href cannot be followed, so skip them rather
        # than failing with KeyError.
        if href and prefix in link.get_text():
            path = href
            break

    if not path and len(search_links) > 14:
        # Positional fallback kept from the original heuristic.
        # NOTE(review): presumably the first result link in the site's
        # search layout -- confirm against the current page structure.
        path = search_links[14].get('href', '')

    data = [None, 'Unknown']
    if not path:
        # Nothing to follow: report the defaults instead of crashing.
        return data

    movie_page = requests.get('https://www.boxofficemojo.com' + path)
    movie_soup = bs(movie_page.content, 'html.parser')

    for link in movie_soup.find_all('a'):
        text = link.get_text()
        if '$' in text:
            data[0] = float(text.replace('$', '').replace(',', ''))
            break

    for span in movie_soup.find_all('span'):
        if 'MPAA' in span.get_text():
            data[1] = span.find_next('span').get_text()
            break

    return data


def dt_conversion(date):
    """Convert a date string to a ``datetime`` object.

    Two layouts are tried in order: ``'%B %d, %Y'`` (e.g. ``'May 2, 2008'``)
    and ``'%d %B %Y'`` (e.g. ``'2 May 2008'``).

    Args:
        date: The text to parse.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date`` matches neither
        layout (or is not a string).
    """
    patterns = ('%B %d, %Y', '%d %B %Y')
    for pat in patterns:
        try:
            return datetime.strptime(date, pat)
        except (ValueError, TypeError):
            # ValueError: layout mismatch; TypeError: non-string input.
            continue
    return None


def clean_tags(content):
    """Remove every ``sup`` and ``span`` element found inside ``content``.

    The elements are destroyed in place via ``decompose()``; nothing is
    returned.
    """
    for unwanted in content.find_all(['sup', 'span']):
        unwanted.decompose()

def money_convert(row):
    """Convert the money text of an infobox row into a number.

    Only the figure at the start of the cell is used: anything from the
    first ``'('`` onwards is ignored, and for a range such as
    ``'$150-200 million'`` the lower bound is taken.  The figure may be
    prefixed by ``US`` and one of ``$``, ``£`` or ``€``, and may contain
    thousands separators.  A following ``million`` / ``billion`` scales it.

    Pound and euro amounts are multiplied by the fixed factors 1.41 and
    1.21 respectively; these are hard-coded, not live exchange rates.

    Args:
        row: Table row whose ``td`` cell holds the amount.

    Returns:
        The amount as a float.

    Raises:
        ValueError: if the cell does not start with a recognisable amount.
    """
    import re  # local import keeps this function self-contained

    money_str = row.find('td').get_text().replace('\xa0', ' ')
    # Drop parentheticals (e.g. a converted figure) so that neither their
    # currency nor their unit is mistaken for that of the main figure.
    lead = money_str.split('(')[0]

    amount = re.match(
        r'\s*(?:US)?\s*([$£€]?)\s*(\d[\d,]*(?:\.\d+)?|\.\d+)', lead)
    if amount is None:
        raise ValueError(
            'no monetary amount found in {!r}'.format(money_str))

    # Prefer the symbol attached to the parsed figure; if there is none,
    # fall back to any pound/euro sign elsewhere in the cell.
    symbol = amount.group(1)
    if not symbol:
        symbol = next((s for s in ('£', '€') if s in money_str), '')
    multiplier = {'£': 1.41, '€': 1.21}.get(symbol, 1)

    number = float(amount.group(2).replace(',', ''))

    # The unit is searched after the figure so ranges ('150-200 million')
    # still pick up the word that follows the upper bound.
    unit = re.search(r'million|billion', lead[amount.end():])
    scale = {'million': 10**6, 'billion': 10**9}[unit.group()] if unit else 1

    return number * multiplier * scale


def clean(row):
    """Extract a cleaned value from one row of a Wikipedia infobox.

    The result depends on the row's header (``th``) text:

    * ``'Release date'``: the first date in the cell as a stripped string.
      Text from the first ``'('`` onwards is dropped; for dates that start
      with a digit (day-first), text from the first ``','`` is dropped too.
    * ``'Running time'``: the leading number of the cell as an ``int``.
    * any other row containing a ``br``: list of the cell's stripped strings.
    * any other row containing ``li`` items: list of the items' texts.
    * otherwise: the raw text of the cell.

    Args:
        row: Table row with a ``th`` header and a ``td`` value cell.

    Returns:
        str, int or list of str, as described above.

    Raises:
        ValueError: if a running-time cell is empty or does not start with
            an integer.
    """
    header = row.find('th').get_text()
    cell = row.find('td')

    if header == 'Release date':
        text = cell.get_text().replace('\xa0', ' ').strip()
        first_release = text.split('(')[0]
        # Decide on the stripped text so leading whitespace cannot hide a
        # day-first date; slicing keeps an empty cell from raising.
        if text[:1].isdigit():
            first_release = first_release.split(',')[0]
        return first_release.strip()

    if header == 'Running time':
        # split() without arguments also breaks on non-breaking spaces,
        # which the infobox may use between the number and the unit.
        parts = cell.get_text().split()
        if not parts:
            raise ValueError('running time cell is empty')
        return int(parts[0])

    if row.find('br'):
        return [text for text in cell.stripped_strings]

    if row.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ')
                for li in row.find_all('li')]

    return cell.get_text()


# Scrape single page and add to movie data

In [19]:
# Load the previously scraped movie records.
# NOTE: unpickling can run arbitrary code -- only load trusted files.
list_movie_data = load_data('./1990-2019_movie_data.pickle')

# To add a single film, scrape its page and append the result, e.g.:
#list_movie_data.append(scrape(#some wikipedia link to movie infobox))

# Write the (possibly extended) list back; change the file name as needed.
#save_data('moviestudio_movie_data.pickle', list_movie_data)
save_data('1990-2019_movie_data.pickle', list_movie_data)

# Delete irrelevant columns
df = pd.DataFrame(list_movie_data)
# Keep only the first 21 columns.  This replaces a loop that dropped column
# 21 repeatedly until a bare ``except`` fired, which would also have hidden
# any unrelated error.
df.drop(columns=df.columns[21:], inplace=True)
# Then drop positions 3, 4, 7 and 9 of what remains.
# NOTE(review): positional drops depend on the column order produced by the
# scraped dicts -- confirm the positions if the scraper output changes.
df.drop(df.columns[[3, 4, 7, 9]], axis=1, inplace=True)
df = df.set_index('Title')

# Drop rows without release month, budget or box office numbers and keep the
# result as a separate dataframe and csv.
df1 = df.dropna(subset=['Release_month', 'Budget', 'Box_office_opening', 'Box_office'])

# Change the file names accordingly.
#df.to_csv('moviestudio_movie_data_cleaned.csv')
df.to_csv('../Data/all_movie_data_cleaned.csv')
# df1.to_csv(#moviestudio_movie_data_dropna.csv')
df1.to_csv('../CleanData/all_movie_data_dropna.csv')

Unnamed: 0_level_0,Directed by,Starring,Lead,Release_date_dt,Release_month,Running_time_min,Country,Language,Budget,Box_office,Box_office_opening,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Iron Man,Jon Favreau,"[Robert Downey Jr., Terrence Howard, Jeff Brid...",Robert Downey Jr.,2008-04-14,4,126,United States,English,140000000.0,585800000.0,98618668.0,PG-13
The Incredible Hulk,Louis Leterrier,"[Edward Norton, Liv Tyler, Tim Roth, Tim Blake...",Edward Norton,2008-06-08,6,112,United States,English,137500000.0,264800000.0,55414050.0,PG-13
Iron Man 2,Jon Favreau,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2010-04-26,4,125,United States,English,170000000.0,623900000.0,128122480.0,PG-13
Thor,Kenneth Branagh,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2011-04-17,4,114,United States,English,150000000.0,449300000.0,65723338.0,PG-13
Captain America: The First Avenger,Joe Johnston,"[Chris Evans, Tommy Lee Jones, Hugo Weaving, H...",Chris Evans,2011-07-19,7,124,United States,English,140000000.0,370600000.0,65058524.0,PG-13
The Avengers,Joss Whedon,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,...",Robert Downey Jr.,2012-04-11,4,143,United States,English,220000000.0,1519000000.0,207438708.0,PG-13
Iron Man 3,Shane Black,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2013-04-14,4,131,United States,English,200000000.0,1215000000.0,174144585.0,PG-13
Thor: The Dark World,Alan Taylor,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2013-10-22,10,112,United States,English,150000000.0,644800000.0,85737841.0,PG-13
Captain America: The Winter Soldier,"[Anthony Russo, Joe Russo]","[Chris Evans, Scarlett Johansson, Sebastian St...",Chris Evans,2014-03-13,3,136,United States,English,170000000.0,714400000.0,95023721.0,PG-13
Guardians of the Galaxy,James Gunn,"[Chris Pratt, Zoe Saldana, Dave Bautista, Vin ...",Chris Pratt,2014-07-21,7,122,United States,English,232300000.0,772800000.0,94320883.0,PG-13
