In [4]:
!pip install requests



In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import pickle
from datetime import datetime

In [67]:
## Save list_movie_data in with pickle
def save_data(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    
    
#function for scraping infobox
def scrape(url):
    page = requests.get(url)
    toScrape = bs(page.content, 'html.parser')
    movie_details = toScrape.find(class_='infobox vevent')
    rows = movie_details.find_all('tr') 
    movie_data = {}

    title = rows[0].find('th').get_text()
    movie_data['Title'] = title
    clean_tags(movie_details)
    for i, row in enumerate(rows):
        try:
            if i <= 1:
                continue
            elif row.find('th').get_text() == 'Based on':
                continue
            elif row.find('th').get_text() == 'Starring':
                movie_data['Starring'] = clean(row)
                movie_data['Lead'] = movie_data['Starring'][0]
            elif 'Production' in row.find('th').get_text():
                movie_data['Production companies'] = clean(row)
            elif row.find('th').get_text() == 'Running time':
                movie_data['Running time (min)'] = clean(row)
            elif row.find('th').get_text() == 'Release date':
                date = clean(row)
                movie_data['Release date'] = date
                movie_data['Release date (dt)'] = dt_conversion(date)
            elif row.find('th').get_text() == 'Box office':
                op_rating = get_op_and_rating(title)
                movie_data['Box office (opening)'] = op_rating[0]
                movie_data['Box office'] = clean(row)
                movie_data['Rating'] = op_rating[1]
            else:
                column = row.find('th').get_text(' ', strip=True)
                data = clean(row)
                movie_data[column] = data
        except:
            pass
        
    return movie_data
    
    
#grab opening box office numbers and MPAA rating from Box Office Mojo given movie title
def get_op_and_rating(title):
    search_page = requests.get('https://www.boxofficemojo.com/search/?q='+title)
    search_page_content = bs(search_page.content, 'html.parser')
    search_list = search_page_content.find_all('a')
    for li in search_list:
        if li.get_text() == title:
            path = li['href']
            break

    data = ['n/a','n/a']
    movie_page = requests.get('https://www.boxofficemojo.com'+path)
    movie_page_content = bs(movie_page.content, 'html.parser')
    table_links = movie_page_content.find_all('a')
    for li in table_links:
        if '$' in li.get_text():
            opening_box = float(li.get_text().replace('$', '').replace(',', ''))
            data[0] = opening_box
            break
            
    spans = movie_page_content.find_all('span')
    for span in spans:
        if 'MPAA' in span.get_text():
            rating = span.find_next('span').get_text()
            data[1] = rating
            break
    
    return data  


#Convert date str to datetime object
def dt_conversion(date):
    patterns = ['%B %d, %Y', '%d %B %Y']
    for pat in patterns:
        try:
            return datetime.strptime(date, pat)
        except:
            pass
    return None


#remove troublesome tags
def clean_tags(content):
    t = ['sup', 'span']
    tags = content.find_all(t)
    for tag in tags:
        tag.decompose()
        
def money_convert(row):
    multiplier = 1
    if '£' in row.find('td').get_text():
        multiplier = 1.21
    if 'million' in row.find('td').get_text():
        if '(' in row.find('td').get_text():
            number = float(row.find('td').get_text(' ', strip=True).split('(')[0].replace('$','').replace('£', ''))
        elif '-' in row.find('td').get_text():
            number = float(row.find('td').get_text(' ', strip=True).split('-')[0].replace('$','').replace('£', ''))
        else:
            number = float(row.find('td').get_text(' ', strip=True).split(' ')[0].replace('$','').replace('£', ''))
        money = number * multiplier * (10**6)
        return money
    elif 'billion' in row.find('td').get_text():
        if '(' in row.find('td').get_text():
            number = float(row.find('td').get_text(' ', strip=True).split('(')[0].replace('$','').replace('£', ''))
        elif '-' in row.find('td').get_text():
            number = float(row.find('td').get_text(' ', strip=True).split('-')[0].replace('$','').replace('£', ''))
        else:
            number = float(row.find('td').get_text(' ', strip=True).split(' ')[0].replace('$','').replace('£', ''))
        money = number * multiplier * (10**9)
        return money
    else:
        number = float(row.find('td').get_text().replace(',','').replace('$','').replace('£', ''))
        return number * multiplier


#function to clean data scraped from wikipedia infobox
def clean(row):
    if row.find('th').get_text() == 'Release date':
        return row.find('td').get_text().split('(')[0].replace('\xa0', ' ').strip('\n').strip(' ')                                                                                         
    elif row.find('th').get_text() == 'Running time':
        return int(row.find('td').get_text().split(' ')[0])
    elif row.find('br'):
        return [text for text in row.find('td').stripped_strings]
    elif row.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row.find_all('li')]
    elif '$' in row.find('td').get_text() or '£' in row.find('td').get_text():
            return money_convert(row)
    return row.find('td').get_text()



# load in table of WBMovies 
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)
links = ['https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)', 
         'https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)']
list_movie_data = []
for link in links:
    page = requests.get(link)
    content = bs(page.content, 'html.parser')
    table_rows = content.select('.wikitable.sortable i')

    #loop through table rows and scrape entry given the url
    #Append movie info each iteration
    for i,row in enumerate(table_rows):
        if i % 10 ==  0:
            print(i);
        try:
            path = row.find('a')['href']
            movie_url = 'https://en.wikipedia.org' + path
            list_movie_data.append(scrape(movie_url))
        except Exception as e:
            print(movie_url)
            print(e)
    
save_data('WB_movie_data.pickle', list_movie_data)
list_movie_data

0
10
20
30
40
https://en.wikipedia.org/wiki/Harry_Potter_and_the_Philosopher%27s_Stone_(film)
'NoneType' object is not subscriptable
50
60
70
80
90
100
110
120
130
140
https://en.wikipedia.org/wiki/The_Dukes_of_Hazzard_(film)
'NoneType' object is not subscriptable
150
160
170
180
190
200
210
220
https://en.wikipedia.orghttps://nl.wikipedia.org/wiki/Morrison_krijgt_een_zusje
HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: //nl.wikipedia.org/wiki/Morrison_krijgt_een_zusje (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001D436D0DD90>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://en.wikipedia.orghttps://nl.wikipedia.org/wiki/Hoe_overleef_ik_mezelf%3F_(film)
HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: //nl.wikipedia.org/wiki/Hoe_overleef_ik_mezelf%3F_(film) (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object

270
280
290
300
https://en.wikipedia.org/w/index.php?title=Kabir_Azab%C4%B1&action=edit&redlink=1
'NoneType' object has no attribute 'find_all'
310
320
330
https://en.wikipedia.org/wiki/Western_Stars#Film
'NoneType' object has no attribute 'find_all'
340


[{'Title': 'The Whole Nine Yards',
  'Directed by': 'Jonathan Lynn',
  'Produced by': ['Allan Kaufman', 'David Willis'],
  'Written by': 'Mitchell Kapner',
  'Starring': ['Bruce Willis',
   'Matthew Perry',
   'Rosanna Arquette',
   'Michael Clarke Duncan',
   'Natasha Henstridge',
   'Amanda Peet',
   'Kevin Pollak'],
  'Lead': 'Bruce Willis',
  'Music by': 'Randy Edelman',
  'Cinematography': 'David Franco',
  'Edited by': 'Tom Lewis',
  'Production companies': ['Morgan Creek Productions',
   'Franchise Pictures',
   'Rational Packaging',
   'Lansdown Films'],
  'Distributed by': 'Warner Bros.',
  'Release date': 'February 18, 2000',
  'Release date (dt)': datetime.datetime(2000, 2, 18, 0, 0),
  'Running time (min)': 98,
  'Country': 'United States',
  'Language': 'English',
  'Budget': 41300000.0,
  'Box office (opening)': 13731070.0,
  'Box office': 106400000.0,
  'Rating': 'n/a'},
 {'Title': 'My Dog Skip',
  'Directed by': 'Jay Russell',
  'Produced by': ['Mark Johnson',
   'John 

In [69]:
## Load pickle file data 
def load_data(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

# load movie data 
list_movie_data = load_data('WB_movie_data.pickle')

# create pandas dataframe
df = pd.DataFrame(list_movie_data)

# dropping irrelevant columns
for x in range(30):
    df.drop(df.columns[26], axis=1, inplace = True)

df.drop(df.columns[[3,6,7,8,10,20,21,22,23,24,25]], axis=1, inplace = True)

# drop rows that dont have box office data
df.dropna(subset=['Box office','Box office (opening)','Budget'], inplace=True)
pd.set_option("display.max_rows", None, "display.max_columns", None)
df = df.set_index('Title')
display(df)

# df.to_csv('WB_movie_data_cleaned.csv')

Unnamed: 0_level_0,Directed by,Produced by,Starring,Lead,Production companies,Release date,Release date (dt),Running time (min),Country,Language,Budget,Box office (opening),Box office,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
The Whole Nine Yards,Jonathan Lynn,"[Allan Kaufman, David Willis]","[Bruce Willis, Matthew Perry, Rosanna Arquette...",Bruce Willis,"[Morgan Creek Productions, Franchise Pictures,...","February 18, 2000",2000-02-18,98.0,United States,English,41300000.0,13731070.0,106400000.0,
My Dog Skip,Jay Russell,"[Mark Johnson, John Lee Hancock, Broderick Joh...","[Frankie Muniz, Diane Lane, Luke Wilson, Kevin...",Frankie Muniz,"[Alcon Entertainment, MDS Productions LLC]","January 14, 2000",2000-01-14,95.0,United States,English,4500000.0,56943.0,35500000.0,PG
Romeo Must Die,Andrzej Bartkowiak,"[Joel Silver, Jim Van Wyck]","[Jet Li, Aaliyah, Isaiah Washington, Russell W...",Jet Li,[Silver Pictures],"March 22, 2000",2000-03-22,115.0,United States,English,25000000.0,18014503.0,91000000.0,R
Ready to Rumble,Brian Robbins,"[Robert F. Newmyer, Jeffrey Silver]","[David Arquette, Oliver Platt, Scott Caan, Bil...",David Arquette,"[Outlaw Productions, Bel Air Entertainment, To...","April 7, 2000",2000-04-07,107.0,United States,English,24000000.0,5257778.0,12500000.0,PG-13
Gossip,Davis Guggenheim,"[Jeffrey Silver, Robert Newmyer]","[James Marsden, Lena Headey, Norman Reedus, Ka...",James Marsden,[Village Roadshow Pictures],"April 21, 2000",2000-04-21,90.0,United States,English,24000000.0,2321729.0,12000000.0,R
Battlefield Earth,Roger Christian,"[Jonathan Krane, Elie Samaha, John Travolta]","[John Travolta, Barry Pepper, Forest Whitaker,...",John Travolta,"[Morgan Creek Productions, Franchise Pictures]","May 10, 2000",2000-05-10,117.0,United States,English,44000000.0,11548898.0,29700000.0,PG-13
The Perfect Storm,Wolfgang Petersen,"[Paula Weinstein, Wolfgang Petersen, Gail Katz]","[George Clooney, Mark Wahlberg, Diane Lane, Jo...",George Clooney,"[Baltimore Pictures, Radiant Productions]","June 30, 2000",2000-06-30,130.0,United States,English,120000000.0,41325042.0,328700000.0,PG-13
The In Crowd,Mary Lambert,James G. Robinson,"[Susan Ward, Lori Heuring, Matthew Settle, Nat...",Susan Ward,[Morgan Creek Productions],"July 19, 2000",2000-07-19,105.0,United States,English,13000000.0,1505551.0,5000000.0,PG-13
Space Cowboys,Clint Eastwood,"[Clint Eastwood, Andrew Lazar]","[Clint Eastwood, Tommy Lee Jones, Donald Suthe...",Clint Eastwood,"[Village Roadshow Pictures, Clipsal Films, Mal...","August 4, 2000",2000-08-04,130.0,United States,English,60000000.0,18093776.0,128900000.0,PG-13
The Replacements,Howard Deutch,Dylan Sellers,"[Keanu Reeves, Gene Hackman, Orlando Jones, Jo...",Keanu Reeves,[Bel Air Entertainment],"August 11, 2000",2000-08-11,118.0,,English,50000000.0,11039214.0,50100000.0,PG-13
