In [7]:
pip install selenium webdriver-manager beautifulsoup4 requests

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading P

In [1]:
# -*- coding: utf-8 -*-
import json
import os
import random
import re
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

def initialize_driver():
    """Build a headless Chrome WebDriver configured for scraping.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance created with the options below.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
    )
    launch_flags = (
        "--headless",  # no GUI
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        f"--user-agent={user_agent}",
    )

    browser_options = Options()
    for flag in launch_flags:
        browser_options.add_argument(flag)

    driver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=driver_service, options=browser_options)

def _collect_title_results(items, link_selector, year_selector=None,
                           error_label="Error parsing result"):
    """Convert result containers into ``{'title', 'year', 'imdb_id'}`` dicts.

    Args:
        items: Iterable of BeautifulSoup tags, one per search result.
        link_selector: CSS selector (relative to each item) for the title link.
        year_selector: Optional CSS selector for a dedicated year element.
            When omitted, the year is looked up in the item's own text.
        error_label: Prefix for the message printed when one item fails.

    Returns:
        A list of dicts; items without a title link or IMDb id are skipped.
    """
    results = []
    for item in items:
        try:
            link = item.select_one(link_selector)
            if not link:
                continue

            title = link.text.strip()
            id_match = re.search(r'/title/(tt\d+)/', link.get('href', ''))
            if not id_match:
                continue

            if year_selector is not None:
                year_node = item.select_one(year_selector)
                year_text = year_node.text if year_node else ""
            else:
                # Remove the title first so four-digit numbers inside it
                # (e.g. "Blade Runner 2049") are not reported as the year.
                year_text = item.text.replace(title, "", 1) if title else item.text

            year_match = re.search(r'(\d{4})', year_text)
            results.append({
                'title': title,
                'year': year_match.group(1) if year_match else "Unknown",
                'imdb_id': id_match.group(1)
            })
        except Exception as e:
            # Best effort: one malformed entry must not discard the rest.
            print(f"{error_label}: {e}")
    return results


def search_movie_by_title(movie_title, driver):
    """Search IMDb for a title and return the matching entries.

    The "find" page is tried first. If it yields nothing usable, the
    "search/title" page is loaded once and parsed with two selector sets.

    Args:
        movie_title: Free-text title typed by the user.
        driver: Selenium WebDriver used for navigation.

    Returns:
        A list of ``{'title', 'year', 'imdb_id'}`` dicts, where ``year`` is
        ``"Unknown"`` when no four-digit number is found. Returns ``[]`` when
        nothing matched or an error occurred; the error is printed, not raised.
    """
    # quote_plus encodes spaces as '+' (as before) and also escapes characters
    # such as '&', '#', '?' and non-ASCII text that would corrupt the query.
    query = quote_plus(movie_title)
    search_url = f"https://www.imdb.com/find/?q={query}"
    fallback_url = f"https://www.imdb.com/search/title/?title={query}"

    print(f"Searching with URL: {search_url}")

    try:
        # Visit the IMDb homepage first to get cookies
        driver.get("https://www.imdb.com/")
        time.sleep(2)  # Let the page load

        driver.get(search_url)
        time.sleep(3)  # Wait for search results to load

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".ipc-metadata-list-summary-item"))
            )
        except TimeoutException:
            # No result cards appeared. Fall through so the fallback search
            # below still gets a chance instead of returning [] immediately.
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        result_items = soup.select('.ipc-metadata-list-summary-item')

        on_fallback_page = False
        if not result_items:
            # Try alternative search
            driver.get(fallback_url)
            time.sleep(3)
            on_fallback_page = True

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            result_items = soup.select('.lister-item-content')

        search_results = _collect_title_results(result_items, 'a[href*="/title/tt"]')

        if not search_results:
            # Last resort: the fallback page with the alternative selector set.
            if not on_fallback_page:
                driver.get(fallback_url)
                time.sleep(3)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

            search_results = _collect_title_results(
                soup.select('.lister-item'),
                'h3 a',
                year_selector='.lister-item-year',
                error_label="Error parsing direct search result",
            )

        return search_results

    except Exception as e:
        print(f"Error during search: {e}")
        return []

def _node_text(node):
    """Return the stripped text of ``node``, or ``""`` when ``node`` is None.

    ``get_text`` is used instead of ``.string`` because ``.string`` is ``None``
    for any tag with more than one child (for example review bodies containing
    ``<br>`` tags), which previously caused such fields to come back empty.
    """
    if node is None:
        return ""
    return node.get_text(separator=" ", strip=True)


def scrape_reviews_page(page_source, imdb_id):
    """Extract every user review found in a page's HTML.

    Args:
        page_source: HTML text of a reviews page.
        imdb_id: IMDb title id, copied into the result unchanged.

    Returns:
        ``{'ImdbId': imdb_id, 'reviews': [...]}`` where each review is a dict
        with the keys ``reviewer_name``, ``reviewer_url``, ``data-review-id``,
        ``short_review``, ``full_review``, ``review_date`` and
        ``rating_value``. A field that is absent from the markup is ``""``.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # NOTE(review): the run recorded in this notebook printed no
    # "Found ... reviews" line, so these class names may no longer match
    # the live IMDb markup. Confirm against the current page.
    review_nodes = soup.find_all('div', {'class': 'imdb-user-review'})

    reviews_text = []
    for review in review_nodes:
        name_span = review.find('span', {'class': 'display-name-link'})
        profile_link = name_span.find('a') if name_span else None

        rating_span = review.find('span', {'class': 'rating-other-user-rating'})
        rating_node = rating_span.find('span') if rating_span else None

        reviews_text.append({
            'reviewer_name': _node_text(profile_link),
            'reviewer_url': profile_link.get('href', "") if profile_link else "",
            'data-review-id': review.get('data-review-id', ""),
            'short_review': _node_text(review.find('a', {'class': 'title'})),
            'full_review': _node_text(review.find('div', {'class': 'show-more__control'})),
            'review_date': _node_text(review.find('span', {'class': 'review-date'})),
            'rating_value': _node_text(rating_node),
        })

    return {'ImdbId': imdb_id, 'reviews': reviews_text}

def scrape_all_reviews(imdb_id, driver, max_pages=None):
    """Scrape a title's reviews, expanding the list with "Load More".

    Args:
        imdb_id: IMDb title id (e.g. ``"tt0499549"``).
        driver: Selenium WebDriver used to load and expand the page.
        max_pages: Maximum number of pages to collect, counting the initially
            loaded page as page 1. ``None`` keeps clicking until no button is
            available or no new reviews appear.

    Returns:
        ``{'ImdbId': imdb_id, 'reviews': pages}`` where ``pages`` is a list of
        ``{'ImdbId', 'reviews'}`` dicts, one per page, each holding only the
        reviews that were new on that page.
    """
    reviews_url = f"https://www.imdb.com/title/{imdb_id}/reviews"

    print(f"Opening reviews page: {reviews_url}")
    driver.get(reviews_url)
    time.sleep(3)  # Wait for page to load

    pages = []
    # Number of reviews already recorded. "Load More" appends to the same
    # document, so each later parse sees the earlier reviews again.
    recorded = 0

    first_page = scrape_reviews_page(driver.page_source, imdb_id)
    if first_page['reviews']:
        pages.append(first_page)
        recorded = len(first_page['reviews'])
        print(f"Found {recorded} reviews on page 1")

    # The initial load counts as page 1, so max_pages=1 means "no clicks".
    page_count = 1

    while max_pages is None or page_count < max_pages:
        try:
            # Wait for the "Load More" button to be clickable
            load_more = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "ipl-load-more__button"))
            )

            # Scroll to the button to make sure it's visible
            driver.execute_script("arguments[0].scrollIntoView();", load_more)
            time.sleep(1)

            load_more.click()
            time.sleep(3)  # Wait for new content to load

            page_count += 1
            print(f"Loading more reviews (page {page_count})...")

            snapshot = scrape_reviews_page(driver.page_source, imdb_id)
            fresh = snapshot['reviews'][recorded:]
            if not fresh:
                # The click added nothing; stop rather than loop forever on a
                # button that stays clickable.
                print("No more reviews to load.")
                break

            pages.append({'ImdbId': imdb_id, 'reviews': fresh})
            recorded += len(fresh)
            print(f"Found {len(fresh)} additional reviews")

        except TimeoutException:
            # The button never became clickable: the normal end of the list.
            print("No more reviews to load.")
            break
        except Exception as e:
            # Best effort: keep what was collected so far.
            print(f"Stopped loading reviews: {e}")
            break

    return {'ImdbId': imdb_id, 'reviews': pages}

def get_movie_reviews_by_title(movie_title, max_pages=None):
    """Search for a title, let the user pick a match, then scrape and save its reviews.

    Prompts on stdin for which search result to use. The scraped data is
    written as JSON to ``reviews/reviews_<imdb_id>_<title>.json``.

    Args:
        movie_title: Title text to search for.
        max_pages: Forwarded to ``scrape_all_reviews``; ``None`` means no limit.

    Returns:
        The dict returned by ``scrape_all_reviews``, or ``None`` when the
        search produced no results.
    """
    print(f"\nSearching for movie: {movie_title}")

    driver = initialize_driver()

    try:
        search_results = search_movie_by_title(movie_title, driver)

        if not search_results:
            print("No movies found matching that title.")
            # The finally clause below closes the driver; quitting here as
            # well would shut the same session down twice.
            return None

        print("\nFound the following movies:")
        for i, movie in enumerate(search_results, 1):
            print(f"{i}. {movie['title']} ({movie['year']}) - {movie['imdb_id']}")

        # Keep asking until the answer is a number within the listed range.
        choice = 0
        while choice < 1 or choice > len(search_results):
            try:
                choice = int(input(f"\nSelect a movie (1-{len(search_results)}): "))
            except ValueError:
                print("Please enter a valid number.")

        selected_movie = search_results[choice - 1]
        imdb_id = selected_movie['imdb_id']
        movie_title = selected_movie['title']

        print(f"\nScraping reviews for: {movie_title} ({imdb_id})")

        data = scrape_all_reviews(imdb_id, driver, max_pages)

        # data['reviews'] is a list of pages, each with its own 'reviews' list.
        total_reviews = sum(len(page['reviews']) for page in data['reviews'])

        print(f"\nFound {total_reviews} reviews for {movie_title}")

        os.makedirs("reviews", exist_ok=True)

        # Drop characters that are not allowed in file names on common systems.
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", movie_title.replace(' ', '_'))
        filename = f"reviews/reviews_{imdb_id}_{sanitized_title}.json"
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

        print(f"\nReviews saved to {filename}")
        return data

    finally:
        # Always close the driver when done
        driver.quit()

if __name__ == "__main__":
    # Interactive entry point: keep prompting for titles until the user quits.
    print("IMDb Movie Review Scraper (Selenium Version)")
    print("-------------------------------------------")
    print("Note: This script requires Chrome and chromedriver to be installed.")
    print("It will automatically download chromedriver if not already installed.")
    print("Initial setup may take a moment.")
    print("-------------------------------------------")

    while True:
        # Strip so that " quit" or a trailing space is still recognised.
        movie_title = input("\nEnter movie title (or 'quit' to exit): ").strip()
        if movie_title.lower() == 'quit':
            break
        if not movie_title:
            # Avoid launching a browser just to search for nothing.
            print("Please enter a movie title.")
            continue

        max_pages = None
        page_limit = input("Enter maximum number of pages to scrape (or press Enter for all): ")
        if page_limit.strip():
            try:
                max_pages = int(page_limit)
            except ValueError:
                print("Invalid number, scraping all pages.")

        get_movie_reviews_by_title(movie_title, max_pages)

IMDb Movie Review Scraper (Selenium Version)
-------------------------------------------
Note: This script requires Chrome and chromedriver to be installed.
It will automatically download chromedriver if not already installed.
Initial setup may take a moment.
-------------------------------------------

Searching for movie: Avatar
Searching with URL: https://www.imdb.com/find/?q=Avatar

Found the following movies:
1. Avatar (2009) - tt0499549
2. Avatar: The Last Airbender (2005) - tt0417299
3. Avatar: Fire and Ash (2025) - tt1757678
4. Avatar: The Way of Water (2022) - tt1630029
5. Avatar: The Last Airbender (2024) - tt9018736

Scraping reviews for: Avatar (tt0499549)
Opening reviews page: https://www.imdb.com/title/tt0499549/reviews
No more reviews to load: Message: 
Stacktrace:
	GetHandleVerifier [0x00337FD3+60707]
	GetHandleVerifier [0x00338014+60772]
	(No symbol) [0x00160683]
	(No symbol) [0x001A8660]
	(No symbol) [0x001A89FB]
	(No symbol) [0x001F1022]
	(No symbol) [0x001CD094]
	(N