# Rotten Tomatoes Data Scraping

In [11]:
import os
while os.path.basename(os.getcwd()) != "ada-project-private":
    os.chdir('..')

import pandas as pd
import json

DATA_FOLDER = './MovieSummaries/'

# Load character.metadata.tsv
character_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID",
                           "Movie release date", "Character name",
                           "Actor date of birth", "Actor gender", "Actor height (in meters)", "Actor ethnicity (Freebase ID)",
                           "Actor name", "Actor age at movie release",
                           "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID"]
character_metadata_df = pd.read_csv(DATA_FOLDER + 'character.metadata.tsv', sep='\t', header=None, names=character_metadata_cols)

# Load movie.metadata.tsv
movie_metadata_cols = ["Wikipedia movie ID", "Freebase movie ID", "Movie name", "Movie release date", "Movie box office revenue", "Movie runtime",
                       "Movie languages (Freebase ID:name tuples)", "Movie countries (Freebase ID:name tuples)",
                       "Movie genres (Freebase ID:name tuples)"]
movie_metadata_df = pd.read_csv(DATA_FOLDER + 'movie.metadata.tsv', sep='\t', header=None, names=movie_metadata_cols)

# Load name.clusters.txt
name_clusters_cols = ["Character name", "Freebase movie ID"]
name_clusters_df = pd.read_csv(DATA_FOLDER + 'name.clusters.txt', sep='\t', header=None, names=name_clusters_cols)

# Load plot_summaries.txt
plot_summaries_cols = ["Wikipedia movie ID", "Wikipedia plot"]
plot_summaries_df = pd.read_csv(DATA_FOLDER + 'plot_summaries.txt', sep='\t', header=None, names=plot_summaries_cols)

# Load tvtropes.clusters.txt
tvtropes_clusters_cols = ["Character types", "details"]
tvtropes_clusters_df = pd.read_csv(DATA_FOLDER + 'tvtropes.clusters.txt', sep='\t', header=None, names=tvtropes_clusters_cols)

tvtropes_clusters_df["details_dict"] = tvtropes_clusters_df["details"].apply(json.loads)
tvtropes_clusters_df["Character name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('char'))
tvtropes_clusters_df["Movie name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('movie'))
tvtropes_clusters_df["Freebase character/actor map ID"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('id'))
tvtropes_clusters_df["Actor name"] = tvtropes_clusters_df["details_dict"].apply(lambda x: x.get('actor'))

tvtropes_clusters_df.drop(columns=["details", "details_dict"], inplace=True)



CURRENT_PATH = "DataScraping/RottenTomatoes/"

In [12]:
import numpy as np

def set_counter(n, name = "default"):
	np.save(CURRENT_PATH + "counters/counter_" + name + ".npy", np.array(int(n), dtype=int))

def get_counter(name = "default"):
	return np.load(CURRENT_PATH + "counters/counter_" + name + ".npy")

def add_counter(n, name = "default"):
	set_counter(get_counter(name) + n, name)

def safe_int_conversion(value, default):
    try:
        return int(value)
    except ValueError:
        return default

In [3]:
from fuzzywuzzy import fuzz

def names_similarity(str1, str2):
    return fuzz.ratio(str1.lower().replace(' ','').replace('\'',''), str2.lower().replace(' ','').replace('\'',''))/100.

def time_to_minutes(time_string):
    hours = 0
    minutes = 0
    
    time_parts = time_string.split()

    for part in time_parts:
        if 'h' in part:
            hours = int(part.replace('h', ''))
        if 'm' in part:
            minutes = int(part.replace('m', ''))

    total_minutes = hours * 60 + minutes
    return total_minutes

def safe_int_conversion(value, default):
    try:
        return int(value)
    except ValueError:
        return default

In [18]:
import requests
from bs4 import BeautifulSoup

def get_rotten_url_from_movie_name(movie_name, year = None, toll = 0.8):
    r = requests.get("https://www.rottentomatoes.com/search?search=" + requests.utils.quote(movie_name, safe=''))
    soup = BeautifulSoup(r.content, 'html.parser')
    movies = soup.find_all('search-page-media-row')
    movie_list = []
    
    for movie in movies:
        title_tag = movie.find('a', {'slot': 'title'})
        title = title_tag.get_text(strip=True) if title_tag else ''
        
        release_year = safe_int_conversion(movie.get('releaseyear', ''), None)
        
        link = title_tag.get('href') if title_tag else ''
        
        movie_list.append((title, release_year, link))

    best_match = dict()
    best_match['toll'] = toll
    best_match['movie'] = (None, None, None)
    for movie in movie_list:
        similarity = names_similarity(movie[0], movie_name)
        if similarity > best_match['toll'] and (year is None or movie[1] is None or abs(movie[1] - year) <= 1):
            best_match['toll'] = similarity
            best_match['movie'] = movie
    return best_match['movie'][2]

def get_rotten_movie_metadata(url, toll = 0.95):
    if url is None:
        return None
    
    movie_metadata = dict()
    
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    
    movie_metadata["title"] = soup.find('rt-text', {'slot': 'title'}).get_text(strip=True)
    
    critics_score_string = soup.find('rt-button', {'slot': 'criticsScore'}).find('rt-text').get_text(strip=True).replace("%","")
    movie_metadata["critics_score"] = safe_int_conversion(critics_score_string, None)
    audience_score_string = soup.find('rt-button', {'slot': 'audienceScore'}).find('rt-text').get_text(strip=True).replace("%","")
    movie_metadata["audience_score"] = safe_int_conversion(audience_score_string, None)
    
    category_wraps = soup.find_all('div', {'class': 'category-wrap', 'data-qa': 'item'})
    for wrap in category_wraps:
        wrap_title = wrap.find('rt-text', {'class': 'key'}).get_text(strip=True)
        
        if names_similarity(wrap_title, "Rating") > toll:
            movie_metadata["rating"] = wrap.find(['rt-text', 'rt-link'], {'data-qa': 'item-value'}).get_text(strip=True)
        if names_similarity(wrap_title, "Genre") > toll:
            movie_metadata["genre"] = wrap.find(['rt-text', 'rt-link'], {'data-qa': 'item-value'}).get_text(strip=True)
        if names_similarity(wrap_title, "Original Language") > toll:
            movie_metadata["language"] = wrap.find(['rt-text', 'rt-link'], {'data-qa': 'item-value'}).get_text(strip=True)
        if names_similarity(wrap_title, "Runtime") > toll:
            runtime_string = wrap.find(['rt-text', 'rt-link'], {'data-qa': 'item-value'}).get_text(strip=True)
            movie_metadata["runtime"] = time_to_minutes(runtime_string)
    
    return movie_metadata

def scrape_row(i):
    row = movie_metadata_df.iloc[i]

    movie_title = row["Movie name"]
    year = None if pd.isna(row["Movie release date"]) else safe_int_conversion(row["Movie release date"].split('-')[0], None)

    scraped_dict = get_rotten_movie_metadata(get_rotten_url_from_movie_name(movie_title, year))

    if scraped_dict:
        scraped_dict["freebaseID"] = row["Freebase movie ID"]
        scraped_dict["original_title"] = row["Movie name"]
        scraped_dict["year"] = year

    return scraped_dict

In [21]:
set_counter(0, name = "low")

In [22]:
scraped_data_list = []

n = len(movie_metadata_df)
step = 100

low = get_counter(name = "low")
while low < n:
    up = min(low + step, n)
    set_counter(up, name = "low")

    print(f"starting new batch {low} - {up}")
    
    scraped_data_list = []
    
    for i in range(low, up):
        scraped_dict = scrape_row(i)

        if scraped_dict is not None:
            scraped_data_list.append(scraped_dict)

        print(f"processed {i} - {i-low+1}/{step}")
    
    c = int(low // step + 1)
    pd.DataFrame(scraped_data_list).to_csv(CURRENT_PATH + f"Data/tempData/rotten_movie_data_{c:05d}.csv", index=False)

    low = get_counter(name = "low")

starting new batch 0 - 100
processed 0 - 0/100
processed 1 - 1/100
processed 2 - 2/100
processed 3 - 3/100
processed 4 - 4/100
processed 5 - 5/100
processed 6 - 6/100
processed 7 - 7/100
processed 8 - 8/100
processed 9 - 9/100
processed 10 - 10/100
processed 11 - 11/100
processed 12 - 12/100
processed 13 - 13/100
processed 14 - 14/100
processed 15 - 15/100
processed 16 - 16/100
processed 17 - 17/100
processed 18 - 18/100
processed 19 - 19/100
processed 20 - 20/100
processed 21 - 21/100
processed 22 - 22/100
processed 23 - 23/100
processed 24 - 24/100
processed 25 - 25/100
processed 26 - 26/100
processed 27 - 27/100
processed 28 - 28/100
processed 29 - 29/100
processed 30 - 30/100
processed 31 - 31/100
processed 32 - 32/100
processed 33 - 33/100
processed 34 - 34/100
processed 35 - 35/100
processed 36 - 36/100
processed 37 - 37/100
processed 38 - 38/100
processed 39 - 39/100
processed 40 - 40/100
processed 41 - 41/100
processed 42 - 42/100
processed 43 - 43/100
processed 44 - 44/100
pro

OSError: Cannot save file into a non-existent directory: 'DataScraping\RottenTomatoes\Data\tempData'

In [23]:
scraped_data_list

[{'title': 'White of the Eye',
  'critics_score': 50,
  'audience_score': 57,
  'rating': 'R',
  'genre': 'Mystery & Thriller',
  'language': 'English',
  'runtime': 110,
  'freebaseID': '/m/0285_cd',
  'original_title': 'White Of The Eye',
  'year': 1987},
 {'title': 'A Woman in Flames',
  'critics_score': None,
  'audience_score': 62,
  'genre': 'Drama',
  'language': 'German',
  'runtime': 107,
  'freebaseID': '/m/01mrr1',
  'original_title': 'A Woman in Flames',
  'year': 1983},
 {'title': "The Sorcerer's Apprentice",
  'critics_score': None,
  'audience_score': 32,
  'genre': 'Fantasy',
  'language': 'English',
  'runtime': 89,
  'freebaseID': '/m/04jcqvw',
  'original_title': "The Sorcerer's Apprentice",
  'year': 2002},
 {'title': "Alexander's Ragtime Band",
  'critics_score': 83,
  'audience_score': 46,
  'genre': 'Musical',
  'language': 'English',
  'runtime': 105,
  'freebaseID': '/m/02qc0j7',
  'original_title': "Alexander's Ragtime Band",
  'year': 1938},
 {'title': 'Sarah

In [24]:
pd.DataFrame(scraped_data_list).to_csv(CURRENT_PATH + f"Data/tempData/rotten_movie_data_{c:05d}.csv", index=False)