### Preliminary version that worked

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import csv
import time
import re
import os

def _testid_text(soup, testid, *labels):
    """Return the text of the first element whose ``data-testid`` is *testid*.

    Every string in *labels* is removed from the extracted text; this is used
    to drop field captions such as "Budget". Returns "N/A" when the page has
    no such element.
    """
    tag = soup.find(attrs={"data-testid": testid})
    if tag is None:
        return "N/A"
    text = tag.get_text(strip=True)
    for label in labels:
        text = text.replace(label, "")
    return text.strip()


def _release_year(soup, title):
    """Return the first 19xx/20xx year in the hero section, or "N/A".

    The text nodes are joined with spaces so the regex word boundaries work
    even when the markup places the year directly next to other text, and the
    first text node equal to *title* is skipped so a title such as "1917" is
    not mistaken for the release year.
    """
    hero = soup.find(attrs={"data-testid": "hero-parent"})
    if hero is None:
        return "N/A"

    pieces = list(hero.stripped_strings)
    # Remove only the first occurrence: a later identical string may be the
    # actual year (e.g. a film whose title equals its release year).
    if title in pieces:
        pieces.remove(title)

    match = re.search(r"\b(?:19|20)\d{2}\b", " ".join(pieces))
    return match.group(0) if match else "N/A"


def get_movie_info(driver, main_url, wait_seconds=5):
    """Load a title page and scrape its headline facts.

    Args:
        driver: Selenium WebDriver used to load the page.
        main_url: URL of the title's main page.
        wait_seconds: Seconds to sleep after navigation so the page can
            render before its source is parsed.

    Returns:
        dict with the string values "title", "imdb_rating", "year", "budget"
        and "gross_us". A field whose element is not found is "N/A".
    """
    print("Scraping movie info...")
    driver.get(main_url)
    time.sleep(wait_seconds)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    title = _testid_text(soup, "hero__primary-text")
    info = {
        "title": title,
        "imdb_rating": _testid_text(
            soup, "hero-rating-bar__aggregate-rating__score", "/10"
        ),
        "year": _release_year(soup, title),
        "budget": _testid_text(
            soup, "title-boxoffice-budget", "Budget", "(estimated)"
        ),
        "gross_us": _testid_text(
            soup, "title-boxoffice-grossdomestic", "Gross US & Canada"
        ),
    }

    print(f"Movie info: {info}")
    return info


def _first_text(node, name, **criteria):
    """Return the stripped text of the first matching descendant, or "N/A"."""
    found = node.find(name, **criteria)
    return found.get_text(strip=True) if found is not None else "N/A"


def _parse_review(article):
    """Convert one review ``<article>`` element into a dict of strings."""
    # Review text: prefer the dedicated container, otherwise fall back to any
    # div whose class contains "content".
    body = article.find("div", attrs={"data-testid": "review-overflow"})
    if body is None:
        body = article.find("div", class_=lambda c: c and "content" in c.lower())

    return {
        # Rating: only present if the user gave one.
        "user_rating": _first_text(article, "span", class_="ipc-rating-star--rating"),
        "review_title": _first_text(article, "h3", class_="ipc-title__text"),
        "review_text": body.get_text(strip=True) if body is not None else "N/A",
        "author": _first_text(article, "a", attrs={"data-testid": "author-link"}),
        "date": _first_text(article, "li", attrs={"data-testid": "review-date"}),
    }


def scrape_reviews(driver, reviews_url):
    """Load a reviews page, expand it fully, and parse every review.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        reviews_url: URL of the title's user-reviews page.

    Returns:
        list of dicts, one per ``article.user-review-item`` element, with the
        keys "user_rating", "review_title", "review_text", "author" and
        "date". A field whose element is not found is "N/A".
    """
    # Imported here so this function carries its own dependency; selenium is
    # already required by the rest of the file.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    print("Scraping reviews...")
    driver.get(reviews_url)
    time.sleep(5)

    # Keep clicking "Load More" until the button stops becoming clickable.
    button_locator = (By.CSS_SELECTOR, "button[data-testid='tturv-load-more-button']")
    while True:
        try:
            load_more = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(button_locator)
            )
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", load_more)
        except TimeoutException:
            # No clickable button within the wait: nothing left to load.
            print("All reviews loaded.")
            break
        except WebDriverException as exc:
            # Best effort: keep what is already on the page, but report the
            # real reason instead of claiming everything was loaded.
            print(f"Stopped loading more reviews early: {exc}")
            break
        time.sleep(2)
        print("Loaded more reviews...")

    soup = BeautifulSoup(driver.page_source, "html.parser")
    articles = soup.find_all("article", class_="user-review-item")
    print(f"Found {len(articles)} reviews, parsing...")

    return [_parse_review(article) for article in articles]


# ---- MAIN ----
# Scrapes one title's info and user reviews, then writes the reviews to a CSV
# file named after the title.
MAIN_URL = "https://www.imdb.com/title/tt6751668/?ref_=tturv_ov_bk"
REVIEWS_URL = "https://www.imdb.com/title/tt6751668/reviews/?ref_=tt_ov_ururv"
OUTPUT_DIR = "/Users/Diane/Desktop/PSYCH 186B/project"

# Set up headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Scrape both pages; always shut the browser down, even if scraping fails,
# so no Chrome process is left running.
try:
    movie_info = get_movie_info(driver, MAIN_URL)
    reviews = scrape_reviews(driver, REVIEWS_URL)
finally:
    driver.quit()

print(f"\nScraped {len(reviews)} reviews total.")

# Save to CSV
# Build a file-system-friendly name: drop everything except word characters,
# whitespace and hyphens, then turn spaces into underscores.
safe_title = re.sub(r'[^\w\s-]', '', movie_info['title']).strip().replace(' ', '_')
if not safe_title:
    # The title consisted only of stripped characters; avoid "reviews_.csv".
    safe_title = "untitled"
filename = f"reviews_{safe_title}.csv"
filepath = os.path.join(OUTPUT_DIR, filename)

# Create the target directory if needed so the scrape is not lost to a
# missing folder at write time.
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(filepath, "w", newline="", encoding="utf-8") as f:
    fieldnames = ["author", "date", "user_rating", "review_title", "review_text"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(reviews)

print(f"Saved to: {filepath}")