### webscraper

In [None]:
import os
# NOTE(review): the WDM_* variables are presumably read by webdriver_manager
# (imported below) — confirm names and semantics against its documentation.
# '0' appears to switch off SSL certificate verification for its driver
# downloads; avoid relying on this on untrusted networks.
os.environ['WDM_SSL_VERIFY'] = '0'
# Presumably a timeout in seconds for those downloads — TODO confirm.
os.environ['WDM_TIMEOUT'] = '300'

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import csv
import time
import re

def make_driver():
    """Create and return a headless Chrome WebDriver for the scraper.

    The browser is started with a fixed set of command-line flags (headless
    mode, no sandbox, a 1920x1080 window and a desktop Chrome user-agent
    string). The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.
    """
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )

    opts = Options()
    for flag in chrome_flags:
        opts.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=opts)


def get_rt_movie_info(driver, main_url, critic_reviews_url):
    """Collect the movie title and its two headline scores.

    Args:
        driver: Selenium WebDriver used to load both pages.
        main_url: URL of the movie's main page; the first ``<h1>`` found
            there is used as the title.
        critic_reviews_url: URL of the critic reviews page; the scores are
            pulled out of JSON embedded in that page's source.

    Returns:
        A dict with keys ``"title"``, ``"tomatometer"`` and
        ``"popcornmeter"``. Scores are strings such as ``"99%"``; any value
        that cannot be found is ``"N/A"``.
    """
    print("Scraping RT movie info...")

    # Title: first <h1> on the main page.
    driver.get(main_url)
    time.sleep(8)  # fixed pause so the page can finish rendering
    heading = BeautifulSoup(driver.page_source, "html.parser").find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = "N/A"

    # Scores: embedded JSON on the critic reviews page, e.g.
    #   "tomatometerScore":{"state":"certified-fresh","value":99}
    #   "audienceScore":{"value":90}
    print("Getting scores from reviews page JSON...")
    driver.get(critic_reviews_url)
    time.sleep(8)
    html = driver.page_source

    score_patterns = {
        "tomatometer": r'"tomatometerScore"\s*:\s*\{[^}]*"value"\s*:\s*(\d+)',
        "popcornmeter": r'"audienceScore"\s*:\s*\{[^}]*"value"\s*:\s*(\d+)',
    }
    scores = {}
    for field, pattern in score_patterns.items():
        hit = re.search(pattern, html)
        scores[field] = hit.group(1) + "%" if hit else "N/A"

    tomatometer = scores["tomatometer"]
    popcornmeter = scores["popcornmeter"]
    print(f"Title: {title} | Tomatometer: {tomatometer} | Popcornmeter: {popcornmeter}")
    return {"title": title, **scores}


def _slot_text(card, slot):
    """Return the stripped text of the element with ``slot=<slot>``, or "N/A"."""
    tag = card.find(attrs={"slot": slot})
    return tag.get_text(strip=True) if tag is not None else "N/A"


def _extract_score(rating_slot):
    """Return the score string for a card's ``slot="rating"`` element.

    Two layouts are handled:

    * Star ratings: the slot element either *is* a ``<rating-stars-group>``
      or contains one. Its ``score`` attribute is reported as ``"<score>/5"``.
    * Text ratings: the slot contains a ``<span>`` whose inline style
      mentions ``margin-top``; its text is used verbatim.

    Anything else, including an empty span or a missing ``score``
    attribute, yields ``"N/A"``.
    """
    if rating_slot is None:
        return "N/A"

    # find() only searches descendants, so check the slot element itself
    # first; otherwise a <rating-stars-group slot="rating"> is never matched.
    if getattr(rating_slot, "name", None) == "rating-stars-group":
        stars = rating_slot
    else:
        stars = rating_slot.find("rating-stars-group")

    if stars is not None:
        value = stars.get("score")
        # Avoid producing the nonsensical "N/A/5" when the attribute is absent.
        return f"{value}/5" if value else "N/A"

    inner_span = rating_slot.find("span", style=lambda s: s and "margin-top" in s)
    score_text = inner_span.get_text(strip=True) if inner_span is not None else ""
    return score_text or "N/A"


def parse_review_cards(soup, review_type):
    """Extract one dict per ``<review-card>`` element found in *soup*.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.
        review_type: Label copied into each result's ``"review_type"`` field.

    Returns:
        A list of dicts with keys ``review_type``, ``reviewer``,
        ``publication``, ``score``, ``review_text`` and ``date``. A field
        whose element is missing is ``"N/A"``. Cards that have neither a
        reviewer nor a review-text element are skipped.
    """
    reviews = []

    for card in soup.find_all("review-card"):
        review = {
            "review_type": review_type,
            "reviewer": _slot_text(card, "name"),
            "publication": _slot_text(card, "publication"),
            "score": _extract_score(card.find(attrs={"slot": "rating"})),
            "review_text": _slot_text(card, "review"),
            "date": _slot_text(card, "timestamp"),
        }

        if review["reviewer"] != "N/A" or review["review_text"] != "N/A":
            reviews.append(review)

    return reviews


def _click_load_more(driver):
    """Try to click the page's "Load More" control; return True on success.

    Several CSS selectors are tried first (each waited on for up to five
    seconds), then an XPath text match as a last resort. After a successful
    click the function sleeps three seconds to let new cards render.
    """
    # Function-scope import: selenium is already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    css_selectors = (
        "[data-qa='load-more-btn']",
        "rt-button[data-qa='load-more']",
        "button.load-more-btn",
    )
    for selector in css_selectors:
        try:
            button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            driver.execute_script("arguments[0].click();", button)
        except WebDriverException:
            # Timeout / element missing / click failed: try the next selector.
            continue
        time.sleep(3)
        return True

    try:
        button = driver.find_element(
            By.XPATH,
            "//rt-button[contains(., 'Load More')] | //button[contains(., 'Load More')]",
        )
        driver.execute_script("arguments[0].click();", button)
    except WebDriverException:
        return False
    time.sleep(3)
    return True


def scrape_rt_reviews(driver, reviews_url, review_type, max_reviews=50):
    """Load a reviews page and collect up to *max_reviews* parsed reviews.

    The page is re-parsed after every "Load More" click until enough reviews
    are present, no load-more control can be clicked, or repeated clicks
    stop producing new review cards.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        reviews_url: URL of the reviews listing.
        review_type: Label passed through to ``parse_review_cards`` and used
            in progress messages.
        max_reviews: Maximum number of reviews to return.

    Returns:
        A list of at most *max_reviews* review dicts as produced by
        ``parse_review_cards``.
    """
    print(f"\nScraping {review_type} reviews...")
    driver.get(reviews_url)
    time.sleep(8)  # fixed pause so the initial cards can render

    # Give up after this many consecutive clicks that add no new cards;
    # without this a permanently-clickable button would loop forever.
    max_stalled_rounds = 3
    stalled_rounds = 0
    previous_count = -1
    all_reviews = []

    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        all_reviews = parse_review_cards(soup, review_type)
        print(f"  Collected {len(all_reviews)} {review_type} reviews so far...")

        if len(all_reviews) >= max_reviews:
            break

        if len(all_reviews) > previous_count:
            stalled_rounds = 0
        else:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f"  No more {review_type} reviews to load.")
                break
        previous_count = len(all_reviews)

        if not _click_load_more(driver):
            print(f"  No more {review_type} reviews to load.")
            break

    return all_reviews[:max_reviews]


# ---- MAIN ----
# Ask for the movie page and derive the two review-listing URLs from it.
MAIN_URL = input("Enter the Rotten Tomatoes movie URL (e.g. https://www.rottentomatoes.com/m/parasite_2019): ").strip()

base_url = MAIN_URL.rstrip("/")
CRITIC_REVIEWS_URL = f"{base_url}/reviews"
AUDIENCE_REVIEWS_URL = f"{base_url}/reviews/verified-audience"

print(f"\nMain URL:             {MAIN_URL}")
print(f"Critic reviews URL:   {CRITIC_REVIEWS_URL}")
print(f"Audience reviews URL: {AUDIENCE_REVIEWS_URL}\n")

driver = make_driver()
try:
    movie_info = get_rt_movie_info(driver, MAIN_URL, CRITIC_REVIEWS_URL)
    critic_reviews = scrape_rt_reviews(driver, CRITIC_REVIEWS_URL, "critic", max_reviews=50)
    audience_reviews = scrape_rt_reviews(driver, AUDIENCE_REVIEWS_URL, "audience", max_reviews=50)
finally:
    # Always shut the browser down, even if one of the scraping steps raises.
    driver.quit()

all_reviews = critic_reviews + audience_reviews
print(f"\nTotal: {len(critic_reviews)} critic + {len(audience_reviews)} audience = {len(all_reviews)} reviews")

# Stamp every review row with the movie-level fields.
for review in all_reviews:
    review["movie_title"] = movie_info["title"]
    review["tomatometer"] = movie_info["tomatometer"]
    review["popcornmeter"] = movie_info["popcornmeter"]

# Build a filesystem-friendly file name from the title.
safe_title = re.sub(r'[^\w\s-]', '', movie_info['title']).strip().replace(' ', '_')
filename = f"reviews_{safe_title}_RT.csv"
filepath = os.path.join("/Users/Diane/Desktop/PSYCH 186B/project", filename)

with open(filepath, "w", newline="", encoding="utf-8") as f:
    # Every key present in the row dicts must be listed here: DictWriter
    # raises ValueError on unknown keys by default, so the movie-level score
    # columns added above have to be included.
    fieldnames = ["movie_title", "tomatometer", "popcornmeter",
                  "review_type", "reviewer", "publication", "score", "date", "review_text"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_reviews)

print(f"Saved to: {filepath}")


Main URL:             https://www.rottentomatoes.com/m/parasite_2019
Critic reviews URL:   https://www.rottentomatoes.com/m/parasite_2019/reviews
Audience reviews URL: https://www.rottentomatoes.com/m/parasite_2019/reviews/verified-audience





Scraping RT movie info...
Getting scores from reviews page JSON...
Title: Parasite | Tomatometer: 99% | Popcornmeter: N/A

Scraping critic reviews...
  Collected 20 critic reviews so far...
  Collected 40 critic reviews so far...
  Collected 60 critic reviews so far...

Scraping audience reviews...
  Collected 10 audience reviews so far...
  Collected 30 audience reviews so far...
  Collected 50 audience reviews so far...

Total: 50 critic + 50 audience = 100 reviews
Saved to: /Users/Diane/Desktop/PSYCH 186B/project/reviews_Parasite_RT.csv


### debug code

In [7]:
import os
# NOTE(review): the WDM_* variables are presumably read by webdriver_manager
# (imported below) — confirm names and semantics against its documentation.
# '0' appears to switch off SSL certificate verification for its driver
# downloads; avoid relying on this on untrusted networks.
os.environ['WDM_SSL_VERIFY'] = '0'
# Presumably a timeout in seconds for those downloads — TODO confirm.
os.environ['WDM_TIMEOUT'] = '300'

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

# Debug helper: print the raw slot="rating" HTML of the first three review
# cards on the critic and audience pages, to see how scores are marked up.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# (label, url) for each page to inspect, in output order.
debug_pages = [
    ("critic", "https://www.rottentomatoes.com/m/parasite_2019/reviews"),
    ("audience", "https://www.rottentomatoes.com/m/parasite_2019/reviews/verified-audience"),
]

try:
    for page_index, (label, url) in enumerate(debug_pages):
        driver.get(url)
        time.sleep(8)  # fixed pause so the cards can render
        soup = BeautifulSoup(driver.page_source, "html.parser")
        cards = soup.find_all("review-card")

        # Blank line between sections, but not before the first one.
        separator = "\n" if page_index else ""
        print(f"{separator}{label.upper()} PAGE - Found {len(cards)} cards")
        if cards:
            print(f"First {label} card slot='rating' raw HTML:")
            for card in cards[:3]:
                rating = card.find(attrs={"slot": "rating"})
                print(rating)
                print("---")
finally:
    # Close the browser even if a page load or parse step fails.
    driver.quit()



CRITIC PAGE - Found 20 cards
First critic card slot='rating' raw HTML:
<span slot="rating">
<score-icon-critics sentiment="positive" size="0.875" style="width: 0.875rem; height: 0.875rem;"></score-icon-critics>
<span style="margin-top: 1.4px;"></span>
</span>
---
<span slot="rating">
<score-icon-critics sentiment="positive" size="0.875" style="width: 0.875rem; height: 0.875rem;"></score-icon-critics>
<span style="margin-top: 1.4px;">5/5</span>
</span>
---
<span slot="rating">
<score-icon-critics sentiment="positive" size="0.875" style="width: 0.875rem; height: 0.875rem;"></score-icon-critics>
<span style="margin-top: 1.4px;">5/5</span>
</span>
---

AUDIENCE PAGE - Found 10 cards
First audience card slot='rating' raw HTML:
<rating-stars-group score="5" slot="rating"></rating-stars-group>
---
<rating-stars-group score="5" slot="rating"></rating-stars-group>
---
<rating-stars-group score="5" slot="rating"></rating-stars-group>
---
