# Setup

## Pandas & general

In [None]:
import datetime
from collections import defaultdict
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Show every row and column when a dataframe is displayed (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# The year this notebook is being run in.
current_year = datetime.date.today().year

# Site root, followed by the three URL prefixes built on top of it.
base_search = 'https://www.rottentomatoes.com/'

director_search = base_search + 'celebrity/'
franchise_search = base_search + 'franchise/'
general_search = base_search + 'search?search='

## Function to change lists of directors/franchises to more URL-friendly form

In [None]:
def urlify_list(my_list):
    """Map each name in ``my_list`` to a URL-friendly slug.

    The slug is the lower-cased name with hyphens and periods removed and
    every run of whitespace collapsed to a single underscore.

    Returns a ``defaultdict(dict)`` keyed by the original name; each value is
    a dict holding the slug under ``'url_friendly_name'``.
    """
    slugs = defaultdict(dict)

    for name in my_list:
        # split() with no argument drops leading/trailing whitespace and
        # collapses internal runs, so the pieces can be joined directly.
        words = name.lower().replace('-', '').replace('.', '').split()
        slugs[name]['url_friendly_name'] = '_'.join(words)

    return slugs

## Function to pass web page to main loop

In [None]:
def get_soup(subject, **kwargs):
    """Download a subject's Rotten Tomatoes page and return it parsed.

    Parameters
    ----------
    subject : dict
        Must contain ``'url_friendly_name'``, the slug appended to the URL.
    **kwargs
        type : str
            ``'director'`` (looked up under ``director_search``) or
            ``'franchise'`` (looked up under ``franchise_search``).
        timeout : float, optional
            Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response body parsed with ``'html.parser'``.

    Raises
    ------
    ValueError
        If ``type`` is missing or is not one of the supported values.
    """
    uri = subject['url_friendly_name']

    page_type = kwargs.get('type')

    if page_type == 'director':

        url = f'{director_search}{uri}'

    elif page_type == 'franchise':

        url = f'{franchise_search}{uri}'

    else:

        # Previously an unknown type fell through and failed later with an
        # unhelpful "url referenced before assignment" error.
        raise ValueError(
            f"type must be 'director' or 'franchise', got {page_type!r}"
        )

    # Without a timeout a stalled connection would hang the notebook forever.
    # The HTTP status is deliberately not checked here: whatever body comes
    # back is parsed and handed to the caller.
    page = requests.get(url, timeout=kwargs.get('timeout', 30))

    return BeautifulSoup(page.content, 'html.parser')

# Directors

## Setup

In [None]:
# Directors to scrape; edit this list as needed. Order is preserved in the
# lookup dictionary built from it below.

directors = [
    'George Lucas',
    'M. Night Shyamalan',
    'Lana Wachowski',
    'Cameron Crowe',
    'James Cameron',
    'Steven Spielberg',
    'Christopher Nolan',
    'Kathryn Bigelow',
    'Paul Verhoeven',
    'James L. Brooks',
    'Brad Bird',
    'Genndy Tartakovsky',
    'Ang Lee',
    'Bradley Cooper',
    'Nancy Meyers',
    'Tim Burton',
    'Michael Mann',
    'Hayao Miyazaki',
    'Jonathan Demme',
    'George Miller',
    'Nora Ephron',
    'Gina Prince-Bythewood',
    'Robert Zemeckis',
    'John Musker',
    'Elaine May',
    'Joseph Gordon-Levitt',
    'John Singleton',
    'John Carpenter',
    'Jane Campion',
    'Sam Raimi',
    'Bob Fosse',
    'Stanley Kubrick',
    'Henry Selick',
    'Danny Boyle',
    'Buster Keaton',
    'Park Chan Wook',
    'David Fincher',
    'Barbra Streisand',
    'John McTiernan',
]


# Name -> {'url_friendly_name': slug} lookup used to build each page URL.

director_dict = urlify_list(directors)


# For some very weird reason, RT files John Carpenter under his actor alias
# "Rip Haight", so the generated slug has to be overridden by hand.

director_dict['John Carpenter']['url_friendly_name'] = 'rip_haight'


# Display names for miniseries that are better known by something other than
# the director's own name. Add entries here as needed.

aliases = {
    'Lana Wachowski': 'The Wachowskis',
    'John Musker': 'Musker/Clements',
}


# First/last release year the pod covered for a director, where it did not
# cover the whole filmography. Directors not listed have no bound on that side.

director_start_dates = {
    'George Lucas': 1977,
    'Steven Spielberg': 1997,
    'Paul Verhoeven': 1985,
    'Buster Keaton': 1923,
}

director_end_dates = {
    'Buster Keaton': 1929,
}


# Results dictionary we're about to use

def dd():
    """Return a new, empty ``defaultdict(dict)``.

    Used as the default factory of the results dictionaries so that a second
    level of keys is created automatically on first access.
    """
    
    return defaultdict(dict)

# Two-level auto-creating mapping: director_results[subject][title] is a plain
# dict that springs into existence on first access, so fields such as 'Year'
# can be assigned without any prior setup.
director_results = defaultdict(dd)

## Scraping

In [None]:
# Scrape every director's filmography table and keep the films that
#   (a) have a Tomatometer score,
#   (b) credit the person as Director, and
#   (c) fall inside the year window configured for that director.
for key, val in director_dict.items():

    # Skip directors that already have stored results (presumably so the cell
    # can be re-run without downloading finished pages again).
    if key in director_results:
        continue

    soup = get_soup(val, type='director')

    # Inclusive release-year window; unbounded sides default to 0 / this year.
    first_year = director_start_dates.get(key, 0)
    last_year = director_end_dates.get(key, current_year)

    movie_rows = soup.find_all('tr', {'data-qa': 'celebrity-filmography-movies-trow'})

    for row in movie_rows:

        this_movie_title = row['data-title']

        this_movie_year = int(row['data-year'])

        # A row without a credits cell is treated as "not a directing credit"
        # instead of crashing on None.text.
        credit_cell = row.find('td', {'class': 'celebrity-filmography__credits'})
        this_movie_credit = credit_cell.text if credit_cell is not None else ''

        this_movie_rt = row.find('span', attrs={'class': 'label', 'data-tomatometer': True})

        if this_movie_rt is None:
            continue

        if 'Director' not in this_movie_credit:
            continue

        if not first_year <= this_movie_year <= last_year:
            continue

        this_movie_score = this_movie_rt['data-tomatometer']

        director_results[key][this_movie_title]['Year'] = this_movie_year
        director_results[key][this_movie_title]['RT'] = this_movie_score
        print(this_movie_title, this_movie_year, this_movie_score)

## Function to convert scraped dictionary to dataframe

In [None]:
def nested_dict_to_df(my_dict):
    """Stack a two-level results dictionary into a single dataframe.

    ``my_dict`` maps an outer key to an inner mapping of
    ``{row_label: {column: value}}``. Each inner mapping becomes a dataframe
    with one row per row label; the frames are then concatenated row-wise
    with the outer key as the first index level, giving a two-level row index
    of (outer key, row label).
    """
    frames = {}

    for outer_key, rows in my_dict.items():
        frames[outer_key] = pd.DataFrame.from_dict(rows, orient='index')

    return pd.concat(frames, axis=0)

## Make director dataframe

In [None]:
# Flatten the scraped results into one row per (director, film).

directors_df = nested_dict_to_df(director_results).rename_axis(['Subject', 'Title']).reset_index()


# List of movies that were skipped; modify as needed.
# NOTE: matching is by title alone, so an entry here removes that title for
# every director it appears under.

director_manual_drops = [
    'Pearl Jam Twenty',
    'Quay',
    'Black Book',
    "Tim Burton's Corpse Bride",
    'Frankenweenie',
    'Swimming to Cambodia',
    'Enzo Avitabile Music Life',
    'Neil Young Journeys',
    'Neil Young Trunk Show',
    'Jimmy Carter: Man From Plains',
    'Neil Young: Heart of Gold',
    'The Agronomist',
    'Storefront Hitchcock',
    'Disappearing Acts',
    "Someone's Watching Me!",
    'Elvis',
    'Body Bags',
    'Three... Extremes',
    'Full Frontal',
]

# Drop skipped movies. The explicit .copy() makes the filtered frame an
# independent object, so the column assignments below write to it directly
# instead of to a slice of the unfiltered frame (SettingWithCopyWarning).

directors_df = directors_df[~directors_df['Title'].isin(director_manual_drops)].copy()


directors_df['Feed'] = 'Main Feed'

# Swap in the miniseries display name where one is defined.

directors_df['Subject'] = [aliases.get(name, name) for name in directors_df['Subject']]


directors_df.tail()

# Franchises

## Setup

In [None]:
# Franchises to scrape; edit as needed. RT's franchise pages are a fairly
# random selection (e.g. no James Bond or Star Wars??), so check that a page
# exists before adding it here. The pages are also often inaccessible for no
# apparent reason: watch the scraping cell when it runs, since it stops at the
# first franchise page it cannot read and may need re-running until it goes.

franchises = [
    'Marvel Cinematic Universe',
    'Mission Impossible',
    'Alien',
    'Star Trek',
    'Ghostbusters',
    'Men in Black',
    'Planet of the Apes',
    'Oceans',
    'Austin Powers',
    'Terminator',
]


# Name -> {'url_friendly_name': slug} lookup used to build each page URL.

franchise_dict = urlify_list(franchises)


# First/last release year the pod covered, if applicable. Franchises that are
# not listed have no bound on that side.

franchise_start_dates = {}

franchise_end_dates = {
    'Marvel Cinematic Universe': 2019,
    'Mission Impossible': 2018,
    'Star Trek': 1991,
    'Planet of the Apes': 1973,
}


# Results dictionary we're about to use

franchise_results = defaultdict(dd)


franchise_dict

## Scraping

In [None]:
# Scrape every franchise page and keep the movies that have a Tomatometer
# score and fall inside the year window configured for that franchise.
for key, val in franchise_dict.items():

    # Skip franchises that already have stored results, so re-running the cell
    # after a failed page only fetches what is still missing.
    if key in franchise_results:
        continue

    soup = get_soup(val, type='franchise')

    movie_list_items = soup.find_all('li', {'data-franchise-type': 'Movie'})

    if not movie_list_items:

        # No movie entries at all: pause so the message is seen, then stop.
        input('ERROR! RT franchise page being weird! Re-run this cell until it works!')

        break

    # Inclusive release-year window; unbounded sides default to 0 / this year.
    first_year = franchise_start_dates.get(key, 0)
    last_year = franchise_end_dates.get(key, current_year)

    for item in movie_list_items:

        # The year text carries one wrapping character on each side, which is
        # sliced off before converting to int.
        this_movie_year = int(item.find('span', {'data-qa': True}).text[1:-1])

        this_movie_title = item.find('a', {'data-qa': True}).text.strip()

        # Same title in same franchise! Append the year to tell them apart.
        if this_movie_title in ("Ocean's Eleven", 'Ghostbusters'):

            this_movie_title = f'{this_movie_title} ({this_movie_year})'

        this_movie_rt = item.find('strong', {'data-qa': 'franchise-media-tomatometer'})

        if this_movie_rt is None:
            continue

        if not first_year <= this_movie_year <= last_year:
            continue

        # Drop the final character of the score text (the trailing symbol).
        this_movie_score = this_movie_rt.text[:-1]

        franchise_results[key][this_movie_title]['Year'] = this_movie_year

        franchise_results[key][this_movie_title]['RT'] = this_movie_score

        print(this_movie_title, this_movie_year, this_movie_score)

## Make franchise dataframe

In [None]:
# Flatten the scraped results into one row per (franchise, film).

franchises_df = nested_dict_to_df(franchise_results).rename_axis(['Subject', 'Title']).reset_index()


# List of movies that were skipped; modify as needed.
# Matching is by title alone, across all franchises.

franchise_manual_drops = []


# Drop skipped movies. The explicit .copy() makes the filtered frame an
# independent object, so adding the 'Feed' column below writes to it directly
# instead of to a slice of the unfiltered frame (SettingWithCopyWarning).

franchises_df = franchises_df[~franchises_df['Title'].isin(franchise_manual_drops)].copy()


franchises_df['Feed'] = 'Patreon'


franchises_df.tail()

# Other movies covered

## Setup

In [None]:
# Read in spreadsheet. reset_index() adds an 'index' column holding each row's
# position, which later identifies the spreadsheet row a result belongs to.

da_moviesh = pd.read_excel('blank_check_addl.ods', sheet_name = 'Sheet1', dtype = {'Year' : 'Int64'}).reset_index()


# Unfortunately, I don't believe RT has "advanced" search options.
# It's possible that the movie you want won't be on the first page of results.
# There are also a couple of examples of movies in the RT database that have both the same year
# and the same title. Rather than doing a whole bunch more programming to handle a handful of cases,
# it's easier to simply add an actor name to the search text for the movies that have these issues.
# If you see such issues, edit the dictionary below accordingly.

narrow_result = {
    'Justice League': 'Justice League Affleck',
    'Whisper of the Heart': 'Whisper of the Heart Snow',
    'Babe': 'Babe Cromwell',
    'Aladdin and the King of Thieves': 'Aladdin and the King of Thieves Bettin',
    'The Lego Batman Movie': 'The Lego Batman Movie Arnett',
}

# Search text defaults to the title itself unless an override exists above.

da_moviesh['url_friendly_name'] = [narrow_result.get(x, x) for x in da_moviesh['Title']]


# "The Mummy" is particularly annoying because there are two covered movies named that,
# and only the lousy 2017 version shows up on the first page of results.

da_moviesh.loc[(da_moviesh['Title'] == 'The Mummy') & (da_moviesh['Year'] == 1999), 'url_friendly_name'] = 'The Mummy Fraser'


# Because it's possible a title can look like an integer (e.g. "2010"), we have to make sure Pandas knows the column
# we just made is a string. The text is then lower-cased and percent-encoded in full, so that every
# character with a special meaning in a URL ('&', '#', '+', '%', ...) reaches the search as literal text.
# safe='' means no character is exempt from encoding (by default '/' would be left alone).

da_moviesh['url_friendly_name'] = (
    da_moviesh['url_friendly_name']
    .astype(str)
    .str.lower()
    .map(lambda search_text: quote(search_text, safe=''))
)


# Results list we're about to use

moviesh_results = []


da_moviesh.tail()

## Scraping

In [None]:
# For every spreadsheet row, run an RT search and keep the movie results whose
# title (case-insensitive) and, when the spreadsheet gives one, release year
# match the row. Each kept result is stored together with the spreadsheet
# row's 'index' value, so a result can always be traced back to its row even
# when some row produces zero or several matches.

# Spreadsheet rows that already have stored results; they are skipped so the
# cell can be re-run without repeating finished searches.
already_scraped = {record.get('index') for record in moviesh_results}

for this_index, this_title, this_year, this_search_name in zip(
    da_moviesh['index'], da_moviesh['Title'],
    da_moviesh['Year'], da_moviesh['url_friendly_name']):

    if this_index in already_scraped:
        continue

    url = f'{general_search}{this_search_name}'

    print(f'{this_title}, Year Input: {this_year},', end = ' ')

    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get(url, timeout = 30)

    soup = BeautifulSoup(page.content, 'html.parser')

    movies_only = soup.find('search-page-result', type = 'movie')

    # A page with no movie section at all is treated as "no results" rather
    # than crashing on None.find_all.
    movie_rows = movies_only.find_all('search-page-media-row') if movies_only is not None else []

    wanted_title = str(this_title).upper()

    # None means the spreadsheet has no year, so any year is accepted.
    wanted_year = None if pd.isna(this_year) else str(this_year)

    matches = []

    for row in movie_rows:

        score = str(row.get('tomatometerscore', ''))

        if not score: # Skip rows without RT scores
            continue

        poster = row.find('img')

        if poster is None: # No image means no title text to compare against
            continue

        result_title = poster.get('alt', '')

        result_year = row.get('releaseyear', '')

        if result_title.upper() != wanted_title:
            continue

        if wanted_year is not None and result_year != wanted_year:
            continue

        matches.append({'index': int(this_index),
                        'result_title': result_title,
                        'result_year': result_year,
                        'RT': score})


    # If adding to spreadsheet, be sure to check the output to see if there are match problems.

    if not matches:

        print('WARNING: No matches!')

    elif len(matches) > 1:

        print(f'WARNING: {len(matches)} matches!')

    else:

        print(f"Year Found: {matches[0]['result_year']}, RT: {matches[0]['RT']}")


    # All matches are kept (as before); ambiguous rows are flagged above.
    moviesh_results.extend(matches)

## Make other movie dataframe

In [None]:
# Turn the list of result records into a dataframe. reset_index() moves the
# row labels into a regular column.

moviesh_results_df = pd.DataFrame(moviesh_results).reset_index()


# Attach each result to its spreadsheet row through the shared 'index' column.
# This is an inner join, so rows without a partner on the other side drop out.

moviesh_df = da_moviesh.merge(moviesh_results_df, on = 'index')

# Use the year RT reported (stored as text) as the final integer 'Year'.

moviesh_df['Year'] = moviesh_df['result_year'].astype('int64')

# Remove the helper columns that were only needed for searching and joining.

moviesh_df = moviesh_df.drop(columns = ['index', 'result_title', 'url_friendly_name', 'result_year'])


moviesh_df.tail()

# Concatenate and output

In [None]:
# Stack the three tables, keep only the published columns in a fixed order,
# and sort by feed, then subject, then release year.

output = (
    pd.concat([directors_df, franchises_df, moviesh_df], ignore_index = True)
    [['Feed', 'Subject', 'Title', 'Year', 'RT']]
    .sort_values(by = ['Feed', 'Subject', 'Year'])
)

# Write the result next to the notebook, without the dataframe's row labels.

output.to_csv('blank_check_rt.csv', index = False)

In [None]:
# Display the final combined table (the same data that was written to the CSV).
output