In [1]:
!pip install requests



In [2]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import pickle
from datetime import datetime
import os

In [48]:
## Save list_movie_data in with pickle
def save_data(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

## Load pickle file data 
def load_data(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
    
#function for scraping infobox
def scrape(url):
    page = requests.get(url)
    toScrape = bs(page.content, 'html.parser')
    movie_details = toScrape.find(class_='infobox vevent')
    rows = movie_details.find_all('tr') 
    movie_data = {}

    title = rows[0].find('th').get_text()
    movie_data['Title'] = title
    for i, row in enumerate(rows):
        try:
            if i <= 1:
                continue
            elif row.find('th').get_text() == 'Based on':
                continue
            elif row.find('th').get_text() == 'Starring':
                clean_tags(row)
                movie_data['Starring'] = clean(row)
                movie_data['Lead'] = movie_data['Starring'][0]
            elif 'Production' in row.find('th').get_text():
                clean_tags(row)
                movie_data['Production companies'] = clean(row)
            elif row.find('th').get_text() == 'Running time':
                clean_tags(row)
                movie_data['Running_time_min'] = clean(row)
            elif row.find('th').get_text() == 'Release date':
                clean_tags(row)
                date = clean(row).strip()
                dt = dt_conversion(date)
                movie_data['Release_date_dt'] = dt
                movie_data['Release_month'] = dt.month
            elif row.find('th').get_text() == 'Budget':
                budget = money_convert(row)
                movie_data['Budget'] = budget
            elif row.find('th').get_text() == 'Box office':
                money = money_convert(row)
                movie_data['Box_office'] = money
            else:
                clean_tags(row)
                column = row.find('th').get_text(' ', strip=True)
                data = clean(row)
                movie_data[column] = data
        except:
            pass
        
    op_rating = get_op_and_rating(title)
    movie_data['Box_office_opening'] = op_rating[0]
    movie_data['Rating'] = op_rating[1]
        
    return movie_data
    
    
    
#grab opening box office numbers and MPAA rating from Box Office Mojo given movie title
def get_op_and_rating(title):
    search_page = requests.get('https://www.boxofficemojo.com/search/?q='+title)
    search_page_content = bs(search_page.content, 'html.parser')
    search_list = search_page_content.find_all('a')
    path = ''
    for li in search_list:
        if li.get_text() == title:
            path = li['href']
            break
        elif title[:10] in li.get_text():
            path = li['href']
            break
            
    if path == '':
        path = search_list[14]['href']
    
    
    data = [None,'Unknown']
    movie_page = requests.get('https://www.boxofficemojo.com'+path)
    movie_page_content = bs(movie_page.content, 'html.parser')
    table_links = movie_page_content.find_all('a')
    for li in table_links:
        if '$' in li.get_text():
            opening_box = float(li.get_text().replace('$', '').replace(',', ''))
            data[0] = opening_box
            break
            
    spans = movie_page_content.find_all('span')
    for span in spans:
        if 'MPAA' in span.get_text():
            rating = span.find_next('span').get_text()
            data[1] = rating
            break
    return data  


#Convert date str to datetime object
def dt_conversion(date):
    patterns = ['%B %d, %Y', '%d %B %Y']
    for pat in patterns:
        try:
            return datetime.strptime(date, pat)
        except:
            pass
    return none


#remove troublesome tags
def clean_tags(content):
    t = ['sup', 'span']
    tags = content.find_all(t)
    for tag in tags:
        tag.decompose()

def money_convert(row):
    multiplier = 1
    money_str = row.find('td').get_text().replace('\xa0', ' ')
    if '£' in money_str:
        multiplier = 1.41
    elif '€' in money_str:
        multiplier = 1.21
        
        
    if 'million' in money_str:
        if '(' in money_str:
            money_str = money_str.split('(')[0].replace('$','').replace('£', '')
        if '-' in money_str:
            number = float(money_str.split('-')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**6)
        if '–' in money_str:
            number = float(money_str.split('–')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**6)
        else:
            number = float(money_str.split(' ')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**6)
    elif 'billion' in money_str:
        if '(' in money_str:
            money_str = money_str.split('(')[0].replace('$','').replace('£', '')
        if '-' in money_str:
            number = float(money_str.split('-')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**9)
        if '–' in money_str:
            number = float(money_str.split('–')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**9)
        else:
            number = float(money_str.split(' ')[0].replace('$','').replace('£', ''))
            return number * multiplier * (10**9)
    else:
        number = float(money_str.replace(',','').replace('$','').replace('£', ''))
        return number * multiplier


#function to clean data scraped from wikipedia infobox
def clean(row):
    if row.find('th').get_text() == 'Release date':
        if row.find('td').get_text()[0].isdigit() == True:
            return row.find('td').get_text().split(',')[0].replace('\xa0', ' ').strip('\n').strip(' ')  
        return row.find('td').get_text().split('(')[0].replace('\xa0', ' ').strip('\n').strip(' ')                                                                                         
    elif row.find('th').get_text() == 'Running time':
        return int(row.find('td').get_text().split(' ')[0])
    elif row.find('br'):
        return [text for text in row.find('td').stripped_strings]
    elif row.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row.find_all('li')]
    return row.find('td').get_text()


# Scrape WB movies from 2000-2019

In [None]:
# load in table of WBMovies 
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)
links = ['https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)', 
         'https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)']
list_movie_data = []
for link in links:
    page = requests.get(link)
    content = bs(page.content, 'html.parser')
    table_rows = content.select('.wikitable.sortable i')

    #loop through table rows and scrape entry given the url
    #Append movie info each iteration
    for i,row in enumerate(table_rows):
        if i % 10 == 0:
            print(i);
        try:
            path = row.find('a')['href']
            movie_url = 'https://en.wikipedia.org' + path
            list_movie_data.append(scrape(movie_url))
        except Exception as e:
            print(movie_url)
            print(e)
    
save_data('WB_movie_data.pickle', list_movie_data)
list_movie_data

# Drop irrelevant columns and save WB data to csv

In [35]:
# load movie data 
list_movie_data = load_data('WB_movie_data.pickle')

# create pandas dataframe
df = pd.DataFrame(list_movie_data)

# dropping irrelevant columns
for x in range(36):
    df.drop(df.columns[20], axis=1, inplace = True)

df.drop(df.columns[[2,3,6,7,8,9,10,]], axis=1, inplace = True)
df = df.set_index('Title')


pd.set_option("display.max_rows", None, "display.max_columns", None)

df.to_csv('./Data/movie_data_cleaned_WB.csv')


df1 = df.dropna(subset=['Release_month','Budget', 'Box_office_opening', 'Box_office'])
display(df1)
df1.to_csv('./CleanData/movie_data_dropna_WB.csv')

Unnamed: 0_level_0,Directed by,Starring,Lead,Release_date_dt,Release_month,Running_time_min,Country,Language,Budget,Box_office,Box_office_opening,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
The Whole Nine Yards,Jonathan Lynn,"[Bruce Willis, Matthew Perry, Rosanna Arquette...",Bruce Willis,2000-02-18,2.0,98.0,United States,English,41300000.0,106400000.0,13731070.0,Unknown
My Dog Skip,Jay Russell,"[Frankie Muniz, Diane Lane, Luke Wilson, Kevin...",Frankie Muniz,2000-01-14,1.0,95.0,United States,English,4500000.0,35500000.0,56943.0,PG
Romeo Must Die,Andrzej Bartkowiak,"[Jet Li, Aaliyah, Isaiah Washington, Russell W...",Jet Li,2000-03-22,3.0,115.0,United States,English,25000000.0,91000000.0,18014503.0,R
Ready to Rumble,Brian Robbins,"[David Arquette, Oliver Platt, Scott Caan, Bil...",David Arquette,2000-04-07,4.0,107.0,United States,English,24000000.0,12500000.0,5257778.0,PG-13
Gossip,Davis Guggenheim,"[James Marsden, Lena Headey, Norman Reedus, Ka...",James Marsden,2000-04-21,4.0,90.0,United States,English,24000000.0,12000000.0,2321729.0,R
Battlefield Earth,Roger Christian,"[John Travolta, Barry Pepper, Forest Whitaker,...",John Travolta,2000-05-10,5.0,117.0,United States,English,44000000.0,29700000.0,11548898.0,PG-13
The Perfect Storm,Wolfgang Petersen,"[George Clooney, Mark Wahlberg, Diane Lane, Jo...",George Clooney,2000-06-30,6.0,130.0,United States,English,120000000.0,328700000.0,41325042.0,PG-13
The In Crowd,Mary Lambert,"[Susan Ward, Lori Heuring, Matthew Settle, Nat...",Susan Ward,2000-07-19,7.0,105.0,United States,English,13000000.0,5000000.0,1505551.0,PG-13
Pokémon The Movie 2000: The Power of One,Kunihiko Yuyama,See below,S,1999-07-17,7.0,82.0,Japan,Japanese,30000000.0,133900000.0,19575608.0,Unknown
Space Cowboys,Clint Eastwood,"[Clint Eastwood, Tommy Lee Jones, Donald Suthe...",Clint Eastwood,2000-08-04,8.0,130.0,United States,English,60000000.0,128900000.0,18093776.0,PG-13


# Scrape Marvel Cinematic Universe movies

In [5]:
# load in table of MCU Movies 
#https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films
link = 'https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films'
list_movie_data = []
page = requests.get(link)
content = bs(page.content, 'html.parser')
table_rows = content.select('.wikitable.plainrowheaders i')

#loop through table rows and scrape entry given the url
#Append movie info each iteration
for i,row in enumerate(table_rows):
    if row.get_text() == 'Spider-Man: Far From Home':
        break
    if i % 5 == 0:
        print(i);
    try:
        path = row.find('a')['href']
        movie_url = 'https://en.wikipedia.org' + path
        list_movie_data.append(scrape(movie_url))
    except Exception as e:
        print(movie_url)
        print(e)
    
save_data('MCU_movie_data.pickle', list_movie_data)
list_movie_data

0
5
10
15
20


[{'Title': 'Iron Man',
  'Directed by': 'Jon Favreau',
  'Produced by': ['Avi Arad', 'Kevin Feige'],
  'Screenplay by': ['Mark Fergus',
   'Hawk Ostby',
   'Art Marcum',
   'Matt Holloway'],
  'Starring': ['Robert Downey Jr.',
   'Terrence Howard',
   'Jeff Bridges',
   'Shaun Toub',
   'Gwyneth Paltrow'],
  'Lead': 'Robert Downey Jr.',
  'Music by': 'Ramin Djawadi',
  'Cinematography': 'Matthew Libatique',
  'Edited by': 'Dan Lebental',
  'Production companies': ['Marvel Studios'],
  'Distributed by': 'Paramount Pictures',
  'Release_date_dt': datetime.datetime(2008, 4, 14, 0, 0),
  'Release_month': 4,
  'Running_time_min': 126,
  'Country': 'United States',
  'Language': 'English',
  'Budget': 140000000.0,
  'Box_office': 585800000.0,
  'Box_office_opening': 98618668.0,
  'Rating': 'PG-13'},
 {'Title': 'The Incredible Hulk',
  'Directed by': 'Louis Leterrier',
  'Produced by': ['Avi Arad', 'Gale Anne Hurd', 'Kevin Feige'],
  'Written by': 'Zak Penn',
  'Starring': ['Edward Norton',
 

# Drop irrelevant columns and save MCU data to csv

In [36]:
# load movie data 
list_movie_data = load_data('MCU_movie_data.pickle')

# create pandas dataframe
df = pd.DataFrame(list_movie_data)

df.drop(df.columns[[2,3,6,7,8,9,10,20,21]], axis=1, inplace = True)
df = df.set_index('Title')

pd.set_option("display.max_rows", None, "display.max_columns", None)

df.to_csv('./Data/movie_data_cleaned_MCU.csv')

df1 = df.dropna(subset=['Release_month','Budget', 'Box_office_opening', 'Box_office'])
display(df1)
df1.to_csv('./CleanData/movie_data_dropna_MCU.csv')

Unnamed: 0_level_0,Directed by,Starring,Lead,Release_date_dt,Release_month,Running_time_min,Country,Language,Budget,Box_office,Box_office_opening,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Iron Man,Jon Favreau,"[Robert Downey Jr., Terrence Howard, Jeff Brid...",Robert Downey Jr.,2008-04-14,4,126,United States,English,140000000.0,585800000.0,98618668.0,PG-13
The Incredible Hulk,Louis Leterrier,"[Edward Norton, Liv Tyler, Tim Roth, Tim Blake...",Edward Norton,2008-06-08,6,112,United States,English,137500000.0,264800000.0,55414050.0,PG-13
Iron Man 2,Jon Favreau,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2010-04-26,4,125,United States,English,170000000.0,623900000.0,128122480.0,PG-13
Thor,Kenneth Branagh,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2011-04-17,4,114,United States,English,150000000.0,449300000.0,65723338.0,PG-13
Captain America: The First Avenger,Joe Johnston,"[Chris Evans, Tommy Lee Jones, Hugo Weaving, H...",Chris Evans,2011-07-19,7,124,United States,English,140000000.0,370600000.0,65058524.0,PG-13
The Avengers,Joss Whedon,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,...",Robert Downey Jr.,2012-04-11,4,143,United States,English,220000000.0,1519000000.0,207438708.0,PG-13
Iron Man 3,Shane Black,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2013-04-14,4,131,United States,English,200000000.0,1215000000.0,174144585.0,PG-13
Thor: The Dark World,Alan Taylor,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2013-10-22,10,112,United States,English,150000000.0,644800000.0,85737841.0,PG-13
Captain America: The Winter Soldier,"[Anthony Russo, Joe Russo]","[Chris Evans, Scarlett Johansson, Sebastian St...",Chris Evans,2014-03-13,3,136,United States,English,170000000.0,714400000.0,95023721.0,PG-13
Guardians of the Galaxy,James Gunn,"[Chris Pratt, Zoe Saldana, Dave Bautista, Vin ...",Chris Pratt,2014-07-21,7,122,United States,English,232300000.0,772800000.0,94320883.0,PG-13


# Scrape single page and add to movie data

In [19]:
# load movie data 
list_movie_data = load_data('./Data/MCU_movie_data.pickle')

#list_movie_data.append(scrape(#some wikipedia link to movie infobox))
list_movie_data.append(scrape('https://en.wikipedia.org/wiki/Spider-Man:_Far_From_Home'))

#save_data('moviestudio_movie_data.pickle', list_movie_data)
save_data('MCU_movie_data.pickle', list_movie_data)

# Delete irrelevant columns
df = pd.DataFrame(list_movie_data) 
while True:
    try:
        df.drop(df.columns[20], axis=1, inplace = True)
    except:
        break
df.drop(df.columns[[2,3,6,7,8,9,10]], axis=1, inplace = True)
df = df.set_index('Title')

# drop rows without box office numbers and create seperate csv and dataframe
df1 = df.dropna(subset=['Release_month','Budget', 'Box_office_opening', 'Box_office'])
pd.set_option("display.max_rows", None, "display.max_columns", None)
display(df1)


#change file name accordingly
#df.to_csv('moviestudio_movie_data_cleaned.csv')
df.to_csv('./Data/movie_data_cleaned_MCU.csv')
# df1.to_csv(#moviestudio_movie_data_dropna.csv')
df1.to_csv('./CleanData/movie_data_dropna_MCU.csv')

Unnamed: 0_level_0,Directed by,Starring,Lead,Release_date_dt,Release_month,Running_time_min,Country,Language,Budget,Box_office,Box_office_opening,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Iron Man,Jon Favreau,"[Robert Downey Jr., Terrence Howard, Jeff Brid...",Robert Downey Jr.,2008-04-14,4,126,United States,English,140000000.0,585800000.0,98618668.0,PG-13
The Incredible Hulk,Louis Leterrier,"[Edward Norton, Liv Tyler, Tim Roth, Tim Blake...",Edward Norton,2008-06-08,6,112,United States,English,137500000.0,264800000.0,55414050.0,PG-13
Iron Man 2,Jon Favreau,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2010-04-26,4,125,United States,English,170000000.0,623900000.0,128122480.0,PG-13
Thor,Kenneth Branagh,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2011-04-17,4,114,United States,English,150000000.0,449300000.0,65723338.0,PG-13
Captain America: The First Avenger,Joe Johnston,"[Chris Evans, Tommy Lee Jones, Hugo Weaving, H...",Chris Evans,2011-07-19,7,124,United States,English,140000000.0,370600000.0,65058524.0,PG-13
The Avengers,Joss Whedon,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,...",Robert Downey Jr.,2012-04-11,4,143,United States,English,220000000.0,1519000000.0,207438708.0,PG-13
Iron Man 3,Shane Black,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Robert Downey Jr.,2013-04-14,4,131,United States,English,200000000.0,1215000000.0,174144585.0,PG-13
Thor: The Dark World,Alan Taylor,"[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Chris Hemsworth,2013-10-22,10,112,United States,English,150000000.0,644800000.0,85737841.0,PG-13
Captain America: The Winter Soldier,"[Anthony Russo, Joe Russo]","[Chris Evans, Scarlett Johansson, Sebastian St...",Chris Evans,2014-03-13,3,136,United States,English,170000000.0,714400000.0,95023721.0,PG-13
Guardians of the Galaxy,James Gunn,"[Chris Pratt, Zoe Saldana, Dave Bautista, Vin ...",Chris Pratt,2014-07-21,7,122,United States,English,232300000.0,772800000.0,94320883.0,PG-13


# Scrape single page and add to test data set

In [18]:
# load movie data 
test_movie_data = []

#list_movie_data.append(scrape(#some wikipedia link to movie infobox))
test_movie_data.append(scrape('https://en.wikipedia.org/wiki/Spiral_(2021_film)'))

# Delete irrelevant columns
df = pd.DataFrame(test_movie_data) 
while True:
    try:
        df.drop(df.columns[20], axis=1, inplace = True)
    except:
        break
df.drop(df.columns[[2,3,6,7,8,9,10]], axis=1, inplace = True)
df.info()
        
df = df.set_index('Title')

# drop rows without box office numbers and create seperate csv and dataframe
df1 = df.dropna(subset=['Release_month','Budget', 'Box_office_opening', 'Box_office'])
pd.set_option("display.max_rows", None, "display.max_columns", None)
display(df1)

df1.to_csv('./test_movie_data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Title               1 non-null      object        
 1   Directed by         1 non-null      object        
 2   Starring            1 non-null      object        
 3   Lead                1 non-null      object        
 4   Release_date_dt     1 non-null      datetime64[ns]
 5   Release_month       1 non-null      int64         
 6   Running_time_min    1 non-null      int64         
 7   Country             1 non-null      object        
 8   Language            1 non-null      object        
 9   Budget              1 non-null      float64       
 10  Box_office          1 non-null      float64       
 11  Box_office_opening  1 non-null      float64       
 12  Rating              1 non-null      object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(7)


Unnamed: 0_level_0,Directed by,Starring,Lead,Release_date_dt,Release_month,Running_time_min,Country,Language,Budget,Box_office,Box_office_opening,Rating
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Spiral,Darren Lynn Bousman,"[Chris Rock, Max Minghella, Marisol Nichols, S...",Chris Rock,2021-05-14,5,93,United States,English,20000000.0,36500000.0,8750034.0,R


# Merge MCU and WB csv into one

In [20]:
files = [file for file in os.listdir('./CleanData')]
merged_data = pd.DataFrame()
for file in files:
    df = pd.read_csv('./CleanData/'+file)
    merged_data = pd.concat([merged_data, df])

merged_data.to_csv('all_data.csv', index=False)