In [122]:
# Charity Smith
# Scraping Rotten Tomatoes for Movie Reviews for Batman and The Dark Knight

In [1]:
# Importing necessary libraries
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

The Rotten Tomatoes site loads reviews dynamically using JavaScript, which requires additional handling with tools like Selenium to scrape content that isn't initially available in the HTML source.

I also need to scrape all the reviews that are hidden behind a "Load More" button, which means simulating clicks on that button until every review has been loaded.

In [123]:
# Filesystem path to the ChromeDriver executable; scrape_reviews hands this to
# selenium's Service when starting Chrome. Update it to match your machine.
CHROME_DRIVER_PATH = "/usr/local/bin/chromedriver"

In [128]:
def scrape_reviews(movie_url):
    """Collect the text of every review shown on a reviews page.

    Opens the page in headless Chrome, clicks the "Load More" button until the
    button carries a 'hide' class or can no longer be found, then reads the
    text of every element with the ``review-text`` class.

    Args:
        movie_url: URL of the reviews page to scrape.

    Returns:
        A list of review strings, stripped of surrounding whitespace, in the
        order Selenium returns the matching elements.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    service = Service(CHROME_DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)

    # try/finally guarantees the browser is shut down even when scraping fails
    # part-way; otherwise every failed run would leave a Chrome process behind.
    try:
        driver.get(movie_url)
        time.sleep(3)  # Wait for the page to load

        # Load all reviews by clicking the "Load More" button
        while True:
            try:
                load_more_button = driver.find_element(
                    By.CSS_SELECTOR,
                    'div.load-more-container rt-button[data-loadmoremanager="btnLoadMore:click"]',
                )
            except NoSuchElementException:
                print("No more 'Load More' button found. All reviews loaded.")
                break

            # get_attribute returns None when the element has no class
            # attribute; fall back to '' so the membership test cannot raise.
            button_classes = load_more_button.get_attribute('class') or ''
            if 'hide' in button_classes:
                print("No more reviews to load. All reviews loaded.")
                break

            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", load_more_button)
            print("Clicked 'Load More' button")
            time.sleep(2)  # Wait for the reviews to load

        review_elements = driver.find_elements(By.CLASS_NAME, 'review-text')
        review_texts = [review.text.strip() for review in review_elements]
    finally:
        driver.quit()

    # Log the number of reviews found
    print(f"Found {len(review_texts)} reviews for URL: {movie_url}")
    return review_texts

In [129]:
# Reviews-page URL for each movie, keyed by the movie's display name
urls = dict(
    [
        ("Batman", "https://www.rottentomatoes.com/m/1001781-batman/reviews"),
        ("The Dark Knight", "https://www.rottentomatoes.com/m/the_dark_knight/reviews"),
    ]
)

In [130]:
# Scraped reviews are collected here, one list of review strings per movie name
reviews_data = dict()
# Timestamp (seconds since the epoch) taken just before scraping begins
start_time = time.time()

In [131]:
# Run the scraper once per movie and file each result under the movie's name
for movie in urls:
    url = urls[movie]
    print(f"Scraping reviews for {movie}...")
    reviews_data[movie] = scrape_reviews(url)
    # Short pause between movies so requests to the site are spaced out
    time.sleep(1)

Scraping reviews for Batman...
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
No more reviews to load. All reviews loaded.
Found 140 reviews for URL: https://www.rottentomatoes.com/m/1001781-batman/reviews
Scraping reviews for The Dark Knight...
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
Clicked 'Load More' button
No more reviews to load. All reviews loaded.
Found 347 reviews for URL: https://www.rottentomatoes.com/m/the_dark_knight/reviews
Scraping reviews for The

In [132]:
# Flatten the per-movie review lists into (movie, review) pairs, then split the
# pairs into the two columns of the table
movie_review_pairs = [
    (title, text)
    for title, texts in reviews_data.items()
    for text in texts
]
reviews_df = pd.DataFrame({
    "Movie": [title for title, _ in movie_review_pairs],
    "Review": [text for _, text in movie_review_pairs],
})

In [133]:
# Preview the top five rows of the review table
reviews_df.head(5)

Unnamed: 0,Movie,Review
0,Batman,Tim Burton's Batman is a Gothic superhero movi...
1,Batman,"Set to Danny Elfman’s triumphant score, Burton..."
2,Batman,The result is a film that may please adults mo...
3,Batman,The Gotham City created in Batman is one of th...
4,Batman,"It is both a technical and aesthetic triumph, ..."


In [134]:
# Write the review table to disk; index=False leaves the row index out of the file
reviews_df.to_csv(path_or_buf='batman_movie_reviews.csv', index=False)

print("Scraped reviews have been saved to 'batman_movie_reviews.csv'")

Scraped reviews have been saved to 'batman_movie_reviews.csv'
