In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from collections import defaultdict
from rotten_tomatoes_scraper.rt_scraper import RTScraper, MovieScraper
# https://github.com/pdrm83/rotten_tomatoes_scraper

In [None]:
from bs4 import BeautifulSoup
import difflib
import re
import requests
from urllib.request import urlopen

In [None]:
class DirectorScraper(RTScraper):
    """Scrape the films a person directed from their Rotten Tomatoes celebrity page.

    Keyword Args:
        director_name: Name passed to the inherited ``search`` method; the first
            'actors' hit supplies the celebrity-page URL.
        director_url: Full celebrity-page URL. When supplied it is used as-is and
            no search request is made.
        print: When truthy, ``extract_metadata`` prints the director name (if
            known) and the URL before fetching. Defaults to False.

    After ``extract_metadata`` has run, ``self.metadata`` maps each film title to
    a dict with 'Year', 'Score_Rotten' and 'Box_Office' entries, taken verbatim
    from the row's ``data-*`` attributes (None when an attribute is absent).
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Always define the flag so extract_metadata does not have to probe for it.
        self.print = kwargs.get('print', False)
        if 'director_name' in kwargs:
            self.director_name = kwargs['director_name']
        if 'director_url' in kwargs:
            # An explicit URL always takes precedence, so skip the search entirely.
            self.url = kwargs['director_url']
        elif 'director_name' in kwargs:
            self.extract_url()

    def extract_url(self):
        """Set ``self.url`` from the first 'actors' search hit for ``self.director_name``.

        Raises:
            IndexError: if the search returns no 'actors' results.
        """
        search_result = self.search(term=self.director_name)
        actor_hits = search_result['actors']
        if not actor_hits:
            raise IndexError(f'No Rotten Tomatoes celebrity result for {self.director_name!r}')
        self.url = 'https://www.rottentomatoes.com' + actor_hits[0]['url']

    def extract_metadata(self):
        """Fetch ``self.url`` and store the directed films in ``self.metadata``.

        Only filmography rows whose title cell contains a link and whose credits
        cell mentions "Director" are kept. If the page has no filmography table,
        a message is printed and ``self.metadata`` is left empty.
        """
        if getattr(self, 'print', False):
            director_name = getattr(self, 'director_name', None)
            if director_name is not None:
                print(director_name, self.url)
            else:
                print(self.url)

        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(self.url) as page_director:
            soup = BeautifulSoup(page_director, 'lxml')

        movie_metadata = defaultdict(dict)
        self.metadata = movie_metadata

        selected_section = soup.find('tbody', class_='celebrity-filmography__tbody')
        if selected_section is None:
            print('The parsing process returns an error.')
            return

        for each_row in selected_section.find_all('tr'):
            title_cell = each_row.find('td', class_='celebrity-filmography__title')
            # Rows without a linked title have no movie page to attribute data to.
            if title_cell is None or title_cell.find('a') is None:
                continue

            credit_strings = (
                each_string
                for each_cell in each_row.find_all('td', class_="celebrity-filmography__credits")
                for each_string in each_cell.stripped_strings
            )
            if not any("Director" in each_string for each_string in credit_strings):
                continue

            this_title = each_row.get('data-title')
            if this_title is None:
                continue
            # .get() yields None for a missing attribute instead of raising KeyError.
            movie_metadata[this_title]['Year'] = each_row.get('data-year')
            movie_metadata[this_title]['Score_Rotten'] = each_row.get('data-tomatometer')
            movie_metadata[this_title]['Box_Office'] = each_row.get('data-boxoffice')

In [None]:
def get_movies_by_director(dict_args):
    """Build a DirectorScraper from ``dict_args``, run it, and return its metadata.

    ``dict_args`` is unpacked as keyword arguments to ``DirectorScraper``
    (e.g. 'director_name', 'director_url', 'print').
    """
    scraper = DirectorScraper(**dict_args)
    scraper.extract_metadata()
    films = scraper.metadata
    return films

In [None]:
# Directors to scrape; each name is looked up on Rotten Tomatoes by the scrape loop.
directors = [
    'George Lucas',
    'M Night Shyamalan',
    'Lana Wachowski',
    'Cameron Crowe',
    'James Cameron',
    'Steven Spielberg',
    'Christopher Nolan',
    'Kathryn Bigelow',
    'Paul Verhoeven',
    'Brad Bird',
    'Ang Lee',
    'James L Brooks',
    'Tim Burton',
    'Michael Mann',
    'Hayao Miyazaki',
    'Jonathan Demme',
    'George Miller',
    'Nora Ephron',
    'Gina Prince-Bythewood',
    'Robert Zemeckis',
    'John Musker',
    'Elaine May',
    'John Singleton',
    'John Carpenter',
]

# director name -> {film title -> metadata dict}; filled in by the scrape loop.
director_dict = defaultdict(dict)

In [None]:
# Get info for each director
for director in directors:
    # Skip anything already scraped so re-running this cell does not re-fetch it
    # (and never reuses the previous iteration's args).
    if director in director_dict:
        continue
    if director == 'George Miller':
        # NOTE(review): presumably the name search does not resolve to the intended
        # George Miller, hence the hard-coded page — confirm.
        args = {'director_url': 'https://www.rottentomatoes.com/celebrity/1042523-george_miller', 'print': True}
    else:
        args = {'director_name': director, 'print': True}
    director_dict[director] = get_movies_by_director(args)

In [None]:
# Flatten {director: {film: info}} into one row per (director, film) pair.
director_df = pd.DataFrame.from_dict(
    {
        (director_name, film_title): film_info
        for director_name, films in director_dict.items()
        for film_title, film_info in films.items()
    },
    orient='index',
)

# Label the two index levels.
director_df.index = director_df.index.set_names(["Director", "Film"])

In [None]:
# Franchise name -> {film title -> lookup spec}.
# A lookup spec is either {'search_by': 'title'}, in which case the film title itself
# is used as the search term, or {'search_by': 'URL', 'URL': <movie page URL>}, in
# which case the given Rotten Tomatoes page is scraped directly.
franchises = {'MCU Franchise': 
                  {'Iron Man': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/iron_man'},
                  'The Incredible Hulk': {'search_by': 'title'},
                  'Iron Man 2': {'search_by': 'title'},
                  'Thor': {'search_by': 'title'},
                  'Captain America: The First Avenger': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/captain_america_the_first_avenger'},
                  'The Avengers': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/marvels_the_avengers'},
                  'Iron Man 3': {'search_by': 'title'},
                  'Thor: The Dark World': {'search_by': 'title'},
                  'Captain America: The Winter Soldier': {'search_by': 'title'},
                  'Guardians of the Galaxy': {'search_by': 'title'},
                  'Avengers: Age of Ultron': {'search_by': 'title'},
                  'Ant-Man': {'search_by': 'title'},
                  'Captain America: Civil War': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/captain_america_civil_war'},
                  'Doctor Strange': {'search_by': 'title'},
                  'Guardians of the Galaxy Vol. 2': {'search_by': 'title'},
                  'Spider-Man: Homecoming': {'search_by': 'title'},
                  'Thor: Ragnarok': {'search_by': 'title'},
                  'Black Panther': {'search_by': 'title'},
                  'Avengers: Infinity War': {'search_by': 'title'},
                  'Ant-Man and the Wasp': {'search_by': 'title'},
                  'Captain Marvel': {'search_by': 'title'},
                  'Avengers: Endgame': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/avengers_endgame'},
                  'Spider-Man: Far From Home': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/spider_man_far_from_home'}},
                'Star Wars Franchise': 
                   {'Star Wars': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_wars'},
                    'The Empire Strikes Back': {'search_by': 'title'},
                    'Return of the Jedi': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_wars_episode_vi_return_of_the_jedi'},
                    'Star Wars: Episode I -- The Phantom Menace': {'search_by': 'title'},
                    'Star Wars: Episode II -- Attack of the Clones': {'search_by': 'title'},
                    'Star Wars: Episode III -- Revenge of the Sith': {'search_by': 'title'},
                    'Star Wars: The Force Awakens': {'search_by': 'title'},
                    'Rogue One: A Star Wars Story': {'search_by': 'title'},
                    'Star Wars: The Last Jedi': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_wars_the_last_jedi'},
                    'Solo: A Star Wars Story': {'search_by': 'title'},
                    'Star Wars: The Rise of Skywalker': {'search_by': 'title'}},
                'Toy Story Franchise':
                   {'Toy Story': {'search_by': 'title'},
                    'Toy Story 2': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/toy_story_2'},
                    'Toy Story 3': {'search_by': 'title'},
                    'Toy Story 4': {'search_by': 'title'}},
                'Mission: Impossible Franchise':
                   {'Mission: Impossible': {'search_by': 'title'},
                    'Mission: Impossible 2': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/mission_impossible_2'},
                    'Mission: Impossible III': {'search_by': 'title'},
                    'Mission: Impossible -- Ghost Protocol': {'search_by': 'title'},
                    'Mission: Impossible Rogue Nation': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/mission_impossible_rogue_nation'} ,
                    'Mission: Impossible -- Fallout': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/mission_impossible_fallout'}},
                'Alien Franchise':
                   {'Alien': {'search_by': 'title'}, 
                    'Aliens': {'search_by': 'title'}, 
                    'Alien 3': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/alien3'}, 
                    'Alien: Resurrection': {'search_by': 'title'}, 
                    'Prometheus': {'search_by': 'title'},
                    'Alien: Covenant': {'search_by': 'title'}}, 
                'Crocodile Dundee Franchise': 
                   {'Crocodile Dundee': {'search_by': 'title'},
                    'Crocodile Dundee 2': {'search_by': 'title'},
                    'Crocodile Dundee in Los Angeles': {'search_by': 'title'}},
                'Star Trek TOS Franchise':
                   {'Star Trek: The Motion Picture': {'search_by': 'title'},
                    'Star Trek II: The Wrath of Khan': {'search_by': 'title'},
                    'Star Trek III: The Search for Spock': {'search_by': 'title'},
                    'Star Trek IV: The Voyage Home': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_trek_iv_the_voyage_home'},
                    'Star Trek V: The Final Frontier': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_trek_v_the_final_frontier'} ,
                    'Star Trek VI: The Undiscovered Country': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_trek_vi_the_undiscovered_country'}},
                'Twilight Franchise':
                    {'Twilight': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/twilight'},
                     'The Twilight Saga: New Moon': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/twilight_saga_new_moon'},
                     'The Twilight Saga: Eclipse': {'search_by': 'title'},
                     'The Twilight Saga: Breaking Dawn Part 1': {'search_by': 'title'},
                     'The Twilight Saga: Breaking Dawn Part 2': {'search_by': 'title'}}}

# Scrape results: franchise name -> {film title -> scraped value}; starts empty.
franchise_dict = defaultdict(dict)

In [None]:
def get_movies_by_title_or_url(dict_args):
    """Run a MovieScraper built from ``dict_args`` and return its 'Score_Rotten' value.

    ``dict_args`` is unpacked as keyword arguments to ``MovieScraper``
    (e.g. 'movie_title' or 'movie_url').
    """
    scraper = MovieScraper(**dict_args)
    scraper.extract_metadata()
    rotten_score = scraper.metadata['Score_Rotten']
    return rotten_score

In [None]:
# Get the Rotten Tomatoes score for every film in every franchise
for franchise, movie in franchises.items():
    for key in movie.keys():
        print(f'Processing {key}...')
        # Skip films already scraped so re-running this cell does not re-fetch them
        # (and never reuses the previous iteration's args).
        if key in franchise_dict[franchise]:
            continue
        search_by = movie[key]['search_by']
        if search_by == 'title':
            args = {'movie_title': key}
        elif search_by == 'URL':
            args = {'movie_url': movie[key]['URL']}
        else:
            # Fail loudly instead of silently scraping with stale args.
            raise ValueError(f"Unknown search_by value {search_by!r} for {key!r}")
        franchise_dict[franchise][key] = get_movies_by_title_or_url(args)

In [None]:
# Flatten {franchise: {film: score}} into one row per (franchise, film) pair.
franchise_df = pd.DataFrame.from_dict(
    {
        (franchise_name, film_title): score
        for franchise_name, films in franchise_dict.items()
        for film_title, score in films.items()
    },
    orient='index',
    columns=['Score_Rotten'],
)

# Rebuild the tuple index as a named two-level MultiIndex; the franchise name
# occupies the "Director" level so the frame lines up with director_df.
franchise_df.index = pd.MultiIndex.from_tuples(franchise_df.index, names=["Director", "Film"])

# Year and box office are not scraped for franchise films; add them as NaN
# placeholders and put the columns in the shared order.
franchise_df = franchise_df.assign(Year=np.nan, Box_Office=np.nan)[['Year', 'Score_Rotten', 'Box_Office']]

In [None]:
# Stand-alone films: film title -> lookup spec.
# A lookup spec is either {'search_by': 'title'}, in which case the film title itself
# is used as the search term, or {'search_by': 'URL', 'URL': <movie page URL>}, in
# which case the given Rotten Tomatoes page is scraped directly.
misc_movies = {'The Judge': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/the_judge_2014'},
               'Fantastic Four (1994)': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/10005582-fantastic_four'},
               'Fantastic Four (2005)': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/fantastic_four'},
               'Fantastic Four: Rise of the Silver Surfer': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/fantastic_four_2_rise_of_the_silver_surfer'},
               'Fantastic Four (2015)': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/fantastic_four_2015'},
               'Star Wars: Holiday Special': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/star_wars_holiday_special'},
               'Batman v Superman: Dawn of Justice': {'search_by': 'title'},
               'Fletch': {'search_by': 'title'},
               'Suicide Squad': {'search_by': 'title'},
               'Under Siege 2: Dark Territory': {'search_by': 'title'},
               'Jack Reacher: Never Go Back': {'search_by': 'title'},
               'The Man Who Knew Too Little': {'search_by': 'title'},
               'Blank Check': {'search_by': 'title'},
               'Clifford': {'search_by': 'title'},
               'Wonder Woman': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/wonder_woman_2017'},
               'The Book of Henry': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/the_book_of_henry'},
               'The Devil Wears Prada': {'search_by': 'title'},
               'Justice League':  {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/justice_league_2017'},
               'Lost in Space': {'search_by': 'title'},
               'Running Scared': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/1018009-running_scared'},
               'Josie and the Pussycats': {'search_by': 'title'},
               'Hotel Transylvania': {'search_by': 'title'},
               'Hotel Transylvania 2': {'search_by': 'title'},
               'Hotel Transylvania 3: Summer Vacation': {'search_by': 'title'},
               'Space Jam': {'search_by': 'title'},
               'A Star Is Born': {'search_by': 'title'},
               'Home Again': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/home_again_2017'},
               'Ralph Breaks the Internet': {'search_by': 'title'},
               'Aquaman': {'search_by': 'title'},
               'Assassin\'s Creed': {'search_by': 'title'},
               'Pokemon Detective Pikachu': {'search_by': 'title'},
               'The Lion King': {'search_by': 'title'},
               'Joker': {'search_by': 'title'},
               'The Kingdom of Dreams and Madness': {'search_by': 'title'},
               'Birds of Prey (and the Fantabulous Emancipation of One Harley Quinn)': {'search_by': 'title'},
               'Twilight Zone: The Movie': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/twilight_zone_the_movie'},
               'Stargate': {'search_by': 'title'},
               'When Harry Met Sally...': {'search_by': 'title'},
               'Wonder Woman 1984': {'search_by': 'title'},
               'Don Jon': {'search_by': 'title'},
               'What Men Want': {'search_by': 'title'},
               'Frankenweenie': {'search_by': 'title'},
               'Whisper of the Heart': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/whisper-of-the-heart'},
               'Babe': {'search_by': 'title'},
               'Hanging Up': {'search_by': 'title'},
               'Disappearing Acts': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/disappearing_acts'},
               'The Way of the Gun': {'search_by': 'URL', 'URL': 'https://www.rottentomatoes.com/m/way_of_the_gun'},
               'The Return of Jafar': {'search_by': 'title'},
               'Aladdin and the King of Thieves': {'search_by': 'title'}}
                
# Scrape results: film title -> scraped value; starts empty.
misc_movie_dict = defaultdict(dict)

In [None]:
# Get info for each miscellaneous movie
for movie in misc_movies:
    print(f'Processing {movie}...')
    # Skip films already scraped so re-running this cell does not re-fetch them.
    if movie in misc_movie_dict:
        continue
    search_by = misc_movies[movie]['search_by']
    if search_by == 'title':
        args = {'movie_title': movie}
    elif search_by == 'URL':
        args = {'movie_url': misc_movies[movie]['URL']}
    else:
        # Fail loudly instead of silently scraping with the previous film's args.
        raise ValueError(f"Unknown search_by value {search_by!r} for {movie!r}")
    misc_movie_dict[movie] = get_movies_by_title_or_url(args)

In [None]:
# Convert miscellaneous movie info into a dataframe with the shared layout:
# a (Director, Film) index and Year / Score_Rotten / Box_Office columns.
# Built as one non-mutating chain so no column is ever assigned onto a
# column-subset copy (which is what triggers SettingWithCopyWarning).
misc_df = (
    pd.DataFrame.from_dict(misc_movie_dict, orient='index', columns=['Score_Rotten'])
    .reset_index()
    .rename(columns={'index': 'Film'})
    # Year and box office are not scraped for these films; every row shares a
    # placeholder director label.
    .assign(Year=np.nan, Box_Office=np.nan, Director='Misc Director')
    .set_index(['Director', 'Film'])
    .loc[:, ['Year', 'Score_Rotten', 'Box_Office']]
)

In [None]:
# Stack the director, franchise and miscellaneous frames into one table.
blank_check_df = pd.concat([director_df, franchise_df, misc_df])

# Every column goes through float first (so NaN placeholders survive) and then
# to the nullable Int32 dtype.
blank_check_df = blank_check_df.astype('float').astype('Int32')

# Keep only rows with a positive RT score (rows with a missing score drop out too).
blank_check_df = blank_check_df[blank_check_df['Score_Rotten'] > 0]

# Drop Steven Spielberg's films from before 1997.
blank_check_df = blank_check_df[
    ~(
        (blank_check_df.index.get_level_values(0) == 'Steven Spielberg')
        & (blank_check_df['Year'] < 1997)
    )
]

In [None]:
# Create CSV
# Writes the combined table to 'blank_check.csv', a relative path, so the file
# lands in the kernel's current working directory.
blank_check_df.to_csv('blank_check.csv')