<h2>Scraping Metacritic Scores

In [1]:
# Import Dependencies
import requests
from pprint import pprint
import pandas as pd
from bs4 import BeautifulSoup as bs
from decimal import Decimal
import re

In [2]:
# Try Metacritic Scrape for Top 100 Ranked Movies
url = 'https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc'
print(requests.get(url))

<Response [403]>


In [3]:
# Add headers to bypass 403 Forbidden Access
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
response = requests.get(url, timeout=30, headers=headers)
print(response)

<Response [200]>


In [4]:
# Parse the HTML text using Beautiful Soup
soup = bs(response.text, 'html.parser')

# Find and append the metacritic score, rank, title, and short summary of each movie
top_100_metacritic_dicts = []
for td in soup.find_all('td', class_ = 'clamp-summary-wrap'):
    score = td.find('div', class_ = 'clamp-score-wrap').text.strip()
    rank = td.find('span', class_ = 'title numbered').text.strip()[:-1]
    title = td.find('a', class_ = 'title').text.strip()
    summary = td.find('div', class_ = 'summary').text.strip()
    top_100_metacritic_dicts.append({'score': score,
                                    'rank': rank,
                                    'title': title,
                                    'summary': summary})

print(len(top_100_metacritic_dicts))

100


<h2>Scraping Base Data (Top 100 Grossing Movies)

In [5]:
# Scrape the base data from the-numbers.com (for Top 100 Grossing Movies)
response = requests.get("https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time")
base_df_1 = pd.read_html(response.text)[0]
base_df_2 = pd.read_html(response.text)[1]
base_df = base_df_1.append(base_df_2).reset_index(drop=True)

# Fix cut off titles
base_df = base_df.replace(base_df.iloc[74]['Movie'], 'Pirates of the Caribbean: The Curse of the Black Pearl')
base_df = base_df.replace(base_df.iloc[87]['Movie'], 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe')

base_df = base_df.rename(columns = {"Movie":"title"})

# Make a copy of dataframe
base_df2 = base_df.copy()

# Loop through copy of dataframe and convert money text to decimal formatting
for index, row in base_df2.iterrows():
    # Convert to decimal
    money = re.match(r'\$\d+,\d+,\d+', row['DomesticBox Office']).group()
    base_df2 = base_df2.replace(row['DomesticBox Office'], Decimal(re.sub(r'[^\d.]', '', money)))
    money = re.match(r'\$\d+,\d+,\d+', row['InternationalBox Office']).group()
    base_df2 = base_df2.replace(row['InternationalBox Office'], Decimal(re.sub(r'[^\d.]', '', money)))
    money = re.match(r'\$\d+,\d+,\d+', row['WorldwideBox Office']).group()
    base_df2 = base_df2.replace(row['WorldwideBox Office'], Decimal(re.sub(r'[^\d.]', '', money)))

base_df2.head()

Unnamed: 0,Rank,Released,title,DomesticBox Office,InternationalBox Office,WorldwideBox Office
0,1,2015,Star Wars Ep. VII: The Force Awakens,936662225,1116648,2053311
1,2,2009,Avatar,760507625,2015837,2776345
2,3,2018,Black Panther,700059566,648300000,1348359
3,4,2018,Avengers: Infinity War,678815482,1369988,2048803
4,5,1997,Titanic,659363944,1548844,2208208


<h2>Scraping Base Data Movies (Top 100 Grossing Movies) from Metacritic

In [6]:
# Prepare Movie Title URLs for Metacritic calls
urls = []
found_first = False
for title in base_df.title:

    url_title = title.replace(' ', '-').replace(',', '').lower().replace('â', '')

    if ',' in url_title:
        url_title = url_title
    if 'â' in url_title:
        url_title = url_title

    # Replace ep. viii: with viii-- and all other cases found in top100 grossing movie titles
    if 'ep.-iv:' in url_title:
        url_title = url_title.replace('ep.-iv:', 'episode-iv--')
    elif 'ep.-iii:' in url_title:
        url_title = url_title.replace('ep.-iii:', 'episode-iii--')
    elif 'ep.-viii:' in url_title:
        url_title = url_title.replace('ep.-viii:', 'episode-viii--')
    elif 'ep.-vii:' in url_title:
        url_title = url_title.replace('ep.-vii:', 'episode-vii--')
    elif 'ep.-vi:' in url_title:
        url_title = url_title.replace('ep.-vi:', 'episode-vi--')
    elif 'ep.-v:' in url_title:
        url_title = url_title.replace('ep.-v:', 'episode-v--')
    elif 'ep.-ii:' in url_title:
        url_title = url_title.replace('ep.-ii:', 'episode-ii--')
    elif 'ep.-i:' in url_title:
        url_title = url_title.replace('ep.-i:', 'episode-i--')
    elif 'part-ii' in url_title in url_title:
        url_title = url_title.replace('part-ii', 'part-2')
    elif 'part-i' in url_title and not 'harry' in url_title:
        url_title = url_title.replace('part-i', 'part-1')
    
    # Now replace ':' and '.' with '' since we don't need it for matching anymore
    if ':' in url_title:
        url_title = url_title.replace(':', '')
    if '.' in url_title:
        url_title = url_title.replace('.', '')

    # Special cases of URLs:
    if title == 'The Twilight Saga: Breaking Dawn, Part 2':
        url_title = 'the-twilight-saga-breaking-dawn---part-2'
    if title == 'The Twilight Saga: Breaking Dawn, Part 1':
        url_title = 'the-twilight-saga-breaking-dawn---part-1'
    # The Avengers 2012
    if 'The Avengers' in title:
        url_title = 'the-avengers-2012'
    # Beauty and the Beast 2017 and 1991
    if 'Beauty and the Beast' in title:
        if found_first != True:
            url_title = 'beauty-and-the-beast-2017'
            found_first = True
        else:
            url_title = 'beauty-and-the-beast-1991'
    # Frozen 2013
    if 'Frozen' in title:
        url_title = 'frozen-2013'
    # The Jungle Book 2016
    if 'The Jungle Book' in title:
        url_title = 'the-jungle-book-2016'
    # Inside Out 2015
    if 'Inside Out' in title:
        url_title = 'inside-out-2015'
    
    urls.append({'title': title, 'url': 'https://www.metacritic.com/movie/' + url_title})
    
len(urls)

100

In [7]:
# Scraping Metacritic for each of the top 100 grossing movies
top_100_grossing_metacritic_dicts = []

for url in urls:

    # Parse the HTML text using Beautiful Soup
    soup = bs(requests.get(url['url'], timeout=30, headers=headers).text, 'html.parser')
    
    # Find and append the release date, metacritic critic(meta) and user scores, production studio, 
    # diretor, main cast, genres, and parental rating, and runtime of each movie
    
    # Release Date
    try:
        for span in soup.find_all('span', class_ = 'release_date'):
            release_date = span.find_all('span')[1].text.strip()
    except:
        release_date = ''
    # Scores
    try:
        #critic_score = soup.find('div', class_ = 'primary_baby_item oswald').text.strip().split('\n')[0]
        #critic_score = soup.find('div', class_ = 'score fl').text.strip()
        scores = []
        for a in soup.find_all('a', class_ = 'metascore_anchor'):
            scores.append(a.text.strip())
        critic_score = scores[0]
        user_score = scores[1]
    except:
        critic_score = ''
        user_score = ''
        print(url['url'])
        
    # Production Studio
    try:
        studio = soup.find('span', class_ = 'distributor').text.strip()[:-2]
    except:
        studio = ''
        
    # Director
    try:
        director = soup.find('div', class_ = 'director').text.strip().split('\n')[1]
    except:
        director = ''
        
    # Main Cast
    try:
        main_cast = soup.find('div', class_ = 'summary_cast details_section').text.strip().split('\n')[2].replace('  ', '').split(',')
    except:
        main_cast = ''
    
    # Genres
    try:
        genres = soup.find('div', class_ = 'genres').text.strip()[11:].replace(' ', '').split(',')
    except:
        genres = ''
    
    # Parental Rating
    try:
        parental_rating = soup.find('div', class_ = 'rating').text.strip().split('\n')[3].replace('  ', '')
    except:
        parental_rating = ''

    # Runtime
    try:
        runtime = soup.find('div', class_ = 'runtime').text.strip()[9:]
    except:
        runtime = ''
    
    # Append data as a dictionary to list of dictionaries
    top_100_grossing_metacritic_dicts.append({'title': url['title'],
                                              'release_date': release_date,
                                              'critic_score': critic_score,
                                              'user_score': user_score,
                                              'studio': studio,
                                              'director': director,
                                              'main_cast': main_cast,
                                              'genres': genres,
                                              'parental_rating': parental_rating,
                                              'runtime': runtime
                                             })

pprint(top_100_grossing_metacritic_dicts)

[{'critic_score': '81',
  'director': 'J.J. Abrams',
  'genres': ['Action', 'Adventure', 'Sci-Fi', 'Fantasy'],
  'main_cast': ['Adam Driver',
                ' Carrie Fisher',
                ' Daisy Ridley',
                ' Domhnall Gleeson',
                ' Harrison Ford',
                ' John Boyega',
                ' Mark Hamill',
                ' Oscar Isaac'],
  'parental_rating': 'PG-13',
  'release_date': 'December 18, 2015',
  'runtime': '135 min',
  'studio': 'Walt Disney Studios Motion Pictures',
  'title': 'Star Wars Ep. VII: The Force Awakens',
  'user_score': '6.8'},
 {'critic_score': '83',
  'director': 'James Cameron',
  'genres': ['Action', 'Adventure', 'Sci-Fi', 'Fantasy'],
  'main_cast': ['Michelle Rodriguez', ' Sam Worthington', ' Sigourney Weaver'],
  'parental_rating': 'PG-13',
  'release_date': 'December 18, 2009',
  'runtime': '162 min',
  'studio': 'Twentieth Century Fox Film Corporation',
  'title': 'Avatar',
  'user_score': '7.5'},
 {'critic_score': '

  'director': 'Bill Condon',
  'genres': ['Drama', 'Thriller', 'Fantasy', 'Romance'],
  'main_cast': ['Dakota Fanning',
                ' Kristen Stewart',
                ' Robert Pattinson',
                ' Taylor Lautner'],
  'parental_rating': 'PG-13',
  'release_date': 'November 18, 2011',
  'runtime': '117 min',
  'studio': 'Summit Entertainment',
  'title': 'The Twilight Saga: Breaking Dawn, Part 1',
  'user_score': '4.6'},
 {'critic_score': '41',
  'director': 'Jay Roach',
  'genres': ['Comedy', 'Romance'],
  'main_cast': ['Barbra Streisand',
                ' Ben Stiller',
                ' Blythe Danner',
                ' Dustin Hoffman',
                ' Robert De Niro',
                ' Teri Polo'],
  'parental_rating': 'PG-13',
  'release_date': 'December 22, 2004',
  'runtime': '115 min',
  'studio': 'Universal Pictures',
  'title': 'Meet the Fockers',
  'user_score': '7.0'},
 {'critic_score': '73',
  'director': 'Todd Phillips',
  'genres': ['Comedy', 'Crime'],
  'm

In [8]:
# Create DataFrame
top_100_grossing_metacritic_pd = pd.DataFrame(top_100_grossing_metacritic_dicts)[['title','release_date','critic_score','user_score','studio','director','main_cast','genres','parental_rating','runtime']]
top_100_grossing_metacritic_pd.head()

Unnamed: 0,title,release_date,critic_score,user_score,studio,director,main_cast,genres,parental_rating,runtime
0,Star Wars Ep. VII: The Force Awakens,"December 18, 2015",81,6.8,Walt Disney Studios Motion Pictures,J.J. Abrams,"[Adam Driver, Carrie Fisher, Daisy Ridley, ...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,135 min
1,Avatar,"December 18, 2009",83,7.5,Twentieth Century Fox Film Corporation,James Cameron,"[Michelle Rodriguez, Sam Worthington, Sigour...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,162 min
2,Black Panther,"February 16, 2018",88,6.6,Walt Disney Studios Motion Pictures,Ryan Coogler,"[Andy Serkis, Angela Bassett, Chadwick Bosem...","[Action, Adventure, Sci-Fi, Drama]",PG-13,134 min
3,Avengers: Infinity War,"April 27, 2018",68,8.6,Walt Disney Studios Motion Pictures,Anthony Russo and Joe Russo,"[Angela Bassett, Benedict Cumberbatch, Benic...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,149 min
4,Titanic,"December 19, 1997",75,8.5,Paramount Pictures,James Cameron,"[Kate Winslet, Leonardo DiCaprio]","[Drama, Romance]",PG-13,194 min


In [10]:
# Merge with Base Table
new_base_df = pd.merge(base_df2, top_100_grossing_metacritic_pd, on='title')
new_base_df.head()

Unnamed: 0,Rank,Released,title,DomesticBox Office,InternationalBox Office,WorldwideBox Office,release_date,critic_score,user_score,studio,director,main_cast,genres,parental_rating,runtime
0,1,2015,Star Wars Ep. VII: The Force Awakens,936662225,1116648,2053311,"December 18, 2015",81,6.8,Walt Disney Studios Motion Pictures,J.J. Abrams,"[Adam Driver, Carrie Fisher, Daisy Ridley, ...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,135 min
1,2,2009,Avatar,760507625,2015837,2776345,"December 18, 2009",83,7.5,Twentieth Century Fox Film Corporation,James Cameron,"[Michelle Rodriguez, Sam Worthington, Sigour...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,162 min
2,3,2018,Black Panther,700059566,648300000,1348359,"February 16, 2018",88,6.6,Walt Disney Studios Motion Pictures,Ryan Coogler,"[Andy Serkis, Angela Bassett, Chadwick Bosem...","[Action, Adventure, Sci-Fi, Drama]",PG-13,134 min
3,4,2018,Avengers: Infinity War,678815482,1369988,2048803,"April 27, 2018",68,8.6,Walt Disney Studios Motion Pictures,Anthony Russo and Joe Russo,"[Angela Bassett, Benedict Cumberbatch, Benic...","[Action, Adventure, Sci-Fi, Fantasy]",PG-13,149 min
4,5,1997,Titanic,659363944,1548844,2208208,"December 19, 1997",75,8.5,Paramount Pictures,James Cameron,"[Kate Winslet, Leonardo DiCaprio]","[Drama, Romance]",PG-13,194 min
