In [None]:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import logging

# Configure root logging once for the whole script: INFO and above, written to
# the console (stderr) with a timestamp and level prefix.
_console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[_console_handler],
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def initialize_driver():
    """Create a headless Chrome WebDriver configured to look less like automation.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``.
    A user agent is picked at random on every call, and a CDP script is registered
    so that ``navigator.webdriver`` reads as ``undefined`` on each new document.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Exception: Any error raised while starting Chrome or registering the CDP
            script is logged and re-raised. If the browser had already been
            started at that point it is shut down first, so a failed
            initialization does not leave a Chrome process behind.
    """
    chrome_options = Options()
    for flag in (
        "--headless",  # Run in headless mode (no GUI)
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(flag)

    # Randomize user agent to avoid detection.
    # NOTE(review): one entry is a Firefox UA although the browser is Chrome;
    # kept as-is to preserve behavior, but confirm this mismatch is intended.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    ]
    chrome_options.add_argument(f"--user-agent={random.choice(user_agents)}")

    # Additional options to avoid detection
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = None
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
        # Execute CDP command to hide the navigator.webdriver automation flag
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })
        return driver
    except Exception as e:
        logger.error(f"Failed to initialize driver: {e}")
        # The browser may already be running (e.g. the CDP call failed after
        # Chrome started); close it so the process is not orphaned.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logger.debug(f"Error closing driver after failed initialization: {quit_error}")
        raise

def random_delay(min_seconds=2, max_seconds=5):
    """Sleep for a random duration and report how long the pause was.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    Returns:
        The number of seconds slept, drawn uniformly from the given range.
    """
    pause = random.uniform(min_seconds, max_seconds)
    time.sleep(pause)
    return pause

def search_movie_by_title(movie_title, driver):
    """Search IMDb for titles matching ``movie_title`` using a Selenium driver.

    The function loads the IMDb homepage, then the ``/find`` search page, waits
    for a result element to appear and parses the rendered HTML with
    BeautifulSoup. If no result items are found it falls back to the
    ``/search/title/`` listing.

    Args:
        movie_title: Free-text title entered by the user. It is URL-encoded
            before being placed in the query string.
        driver: A Selenium WebDriver used to load the pages.

    Returns:
        A list of dicts with the keys ``'title'``, ``'year'`` (a four-digit
        string or ``"Unknown"``) and ``'imdb_id'``. An empty list is returned
        when nothing is found or when an unexpected error occurs (the error is
        logged, not raised).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' like the previous str.replace did, but
    # also escapes reserved characters (&, #, ?, %, ...) that would otherwise
    # truncate or corrupt the query string.
    encoded_title = quote_plus(movie_title)
    search_url = f"https://www.imdb.com/find/?q={encoded_title}&s=tt&exact=true"

    logger.info(f"Searching with URL: {search_url}")

    try:
        # Visit the IMDb homepage first to get cookies
        driver.get("https://www.imdb.com/")
        random_delay(2, 4)

        # Now navigate to the search URL
        driver.get(search_url)
        random_delay(3, 5)

        # Wait for search results to appear. A timeout on both selectors is not
        # fatal: parsing below simply finds nothing and the alternative search
        # URL is tried instead of aborting the whole search.
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".find-result-item"))
            )
        except TimeoutException:
            logger.warning("Timeout waiting for search results. Trying alternative selector.")
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".ipc-metadata-list-summary-item"))
                )
            except TimeoutException:
                logger.warning("Timeout waiting for alternative search results selector.")

        # Parse the page using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        search_results = []

        # Try multiple possible selectors for results; the first that yields
        # any items wins.
        result_selectors = [
            '.find-result-item',
            '.ipc-metadata-list-summary-item',
            '.findResult',
        ]

        result_items = []
        for selector in result_selectors:
            result_items = soup.select(selector)
            if result_items:
                logger.info(f"Found {len(result_items)} results using selector: {selector}")
                break

        if not result_items:
            # Try alternative search format
            alternative_url = f"https://www.imdb.com/search/title/?title={encoded_title}"
            logger.info(f"No results found. Trying alternative search: {alternative_url}")

            driver.get(alternative_url)
            random_delay(3, 5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            result_items = soup.select('.lister-item')

        # Process search results; a failure on one item must not drop the rest.
        for item in result_items:
            try:
                # Find the title link - try multiple possible selectors
                link = (
                    item.select_one('a[href*="/title/tt"]') or
                    item.select_one('.ipc-metadata-list-summary-item__t') or
                    item.select_one('.result_text a')
                )
                if not link:
                    continue

                title = link.text.strip()
                href = link.get('href', '')

                # Extract IMDb ID; items without a /title/tt... link are skipped
                imdb_id_match = re.search(r'/title/(tt\d+)/?', href)
                if not imdb_id_match:
                    continue
                imdb_id = imdb_id_match.group(1)

                # Prefer a dedicated year element; only when none exists fall
                # back to the first four-digit run in the item's text.
                year = "Unknown"
                year_element = (
                    item.select_one('.year_type') or
                    item.select_one('.lister-item-year') or
                    None
                )
                year_source = year_element.text if year_element else item.text
                year_match = re.search(r'(\d{4})', year_source)
                if year_match:
                    year = year_match.group(1)

                search_results.append({
                    'title': title,
                    'year': year,
                    'imdb_id': imdb_id
                })
            except Exception as e:
                logger.error(f"Error parsing result: {e}")

        return search_results

    except Exception as e:
        logger.error(f"Error during search: {e}")
        return []

def _select_first(node, selectors):
    """Return the first element under ``node`` matched by any selector, else None."""
    for selector in selectors:
        found = node.select_one(selector)
        if found is not None:
            return found
    return None


def _text_of_first(node, selectors):
    """Return the stripped text of the first matching element, or "" if none match."""
    element = _select_first(node, selectors)
    return element.text.strip() if element is not None else ""


def _href_of_first(node, selectors):
    """Return the ``href`` of the first matching element, or "" if absent."""
    element = _select_first(node, selectors)
    if element is not None and element.has_attr('href'):
        return element['href']
    return ""


def _review_id_of(node):
    """Return the node's ``data-review-id`` attribute, else its ``id``, else ""."""
    if node.has_attr('data-review-id'):
        return node['data-review-id']
    if node.has_attr('id'):
        return node['id']
    return ""


def _rating_of_first(node, selectors):
    """Return the leading number of the first matching rating element's text, or ""."""
    element = _select_first(node, selectors)
    if element is None:
        return ""
    rating_text = element.text.strip()
    # Extract just the number if there's a pattern like "8/10"
    rating_match = re.search(r'(\d+)(?:/\d+)?', rating_text)
    return rating_match.group(1) if rating_match else rating_text


def _safe_extract(label, extractor, *args):
    """Run ``extractor(*args)``; on any error log at debug level and return ""."""
    try:
        return extractor(*args)
    except Exception as e:
        logger.debug(f"Error extracting {label}: {e}")
        return ""


def scrape_reviews_page(page_source, imdb_id):
    """Extract reviews from a reviews page's HTML.

    Side effect: the prettified HTML is written to ``debug_imdb_<imdb_id>.html``
    in the current working directory on every call (overwriting any previous
    dump) so the page structure can be inspected.

    Args:
        page_source: HTML text of the reviews page.
        imdb_id: IMDb title id; stored in the result and used in the debug
            file name.

    Returns:
        A dict ``{'ImdbId': imdb_id, 'reviews': [...]}``. Each review is a dict
        with the keys ``reviewer_name``, ``reviewer_url``, ``data-review-id``,
        ``short_review``, ``full_review``, ``review_date`` and ``rating_value``;
        a field that cannot be extracted is the empty string.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # Debug: Save the HTML to examine the structure
    with open(f"debug_imdb_{imdb_id}.html", "w", encoding="utf-8") as f:
        f.write(soup.prettify())

    logger.info(f"Saved debug HTML to debug_imdb_{imdb_id}.html")

    # Candidate container selectors, tried in order; the first one that
    # matches anything is used for the whole page.
    review_containers = []
    review_selectors = [
        'div.review-container',
        '.imdb-user-review',
        '.lister-item.review-item',
        '.review-container',
        '.ipl-content-list__item',
        'div[data-testid="review"]',
        'article.review',
        '.review-list-item',
    ]

    for selector in review_selectors:
        review_containers = soup.select(selector)
        if review_containers:
            logger.info(f"Found {len(review_containers)} reviews using selector: {selector}")
            break

    # If no reviews found with standard selectors, try a more generic approach
    if not review_containers:
        logger.warning("No reviews found with standard selectors. Trying alternative approach.")

        # Look for any elements containing common review text patterns
        potential_reviews = []
        for elem in soup.find_all(['div', 'article', 'section']):
            text = elem.text.lower()
            if ('rated this' in text or 'out of 10' in text or 'user review' in text) and len(text) > 100:
                potential_reviews.append(elem)

        if potential_reviews:
            logger.info(f"Found {len(potential_reviews)} potential reviews using text pattern matching")
            review_containers = potential_reviews

    # Per-field selector lists, each tried in order until one matches.
    name_selectors = (
        '.display-name-link a',
        '.review-container__author a',
        '.review-container__name',
        '.author a',
    )
    author_link_selectors = (
        '.display-name-link a',
        '.review-container__author a',
        '.author a',
    )
    title_selectors = ('a.title', '.review-title', '.title', 'h3')
    content_selectors = (
        '.show-more__control',
        '.content',
        '.text',
        '.review-content',
        '.review-text',
    )
    date_selectors = ('.review-date', '.review-container__date', '.date')
    rating_selectors = (
        '.rating-other-user-rating span',
        '.ipl-ratings-bar span',
        '.rating span',
        '.rating',
    )

    reviews_text = []
    for review in review_containers:
        # Key order is kept stable because it determines the JSON layout.
        reviews_text.append({
            'reviewer_name': _safe_extract("reviewer name", _text_of_first, review, name_selectors),
            'reviewer_url': _safe_extract("reviewer URL", _href_of_first, review, author_link_selectors),
            'data-review-id': _safe_extract("review ID", _review_id_of, review),
            'short_review': _safe_extract("short review", _text_of_first, review, title_selectors),
            'full_review': _safe_extract("full review", _text_of_first, review, content_selectors),
            'review_date': _safe_extract("review date", _text_of_first, review, date_selectors),
            'rating_value': _safe_extract("rating", _rating_of_first, review, rating_selectors),
        })

    data = {}
    data['ImdbId'] = imdb_id
    data['reviews'] = reviews_text
    return data

def _review_key(review):
    """Return a hashable identity for a review dict, used to detect repeats."""
    return (
        review.get('data-review-id', ''),
        review.get('reviewer_name', ''),
        review.get('full_review', ''),
    )


def scrape_all_reviews(imdb_id, driver, max_pages=None):
    """Scrape the reviews of a title, following "load more" / next-page buttons.

    The reviews page is opened, a cookie dialog is accepted if one is found,
    and the page is scrolled to trigger lazy loading. The page is then parsed
    with ``scrape_reviews_page``; after every successful "load more" click it is
    parsed again and only reviews not seen before are added. Comparing by
    review identity instead of by raw counts works both when the page appends
    new reviews to the existing ones and when it replaces them.

    Args:
        imdb_id: IMDb title id (as used in ``/title/<imdb_id>/reviews``).
        driver: Selenium WebDriver used to load and interact with the page.
        max_pages: Maximum number of successful "load more" steps, or ``None``
            for no limit. The initial page is not counted.

    Returns:
        A dict with ``'ImdbId'``, ``'total_reviews'`` (number of unique
        reviews) and ``'reviews'`` (the unique review dicts in the order they
        were first seen).
    """
    reviews_url = f"https://www.imdb.com/title/{imdb_id}/reviews"

    logger.info(f"Opening reviews page: {reviews_url}")
    driver.get(reviews_url)
    random_delay(3, 5)

    # Accept cookies if the dialog appears
    try:
        cookie_selectors = [
            "button[id*='accept']",
            "button[data-testid='accept']",
            ".ipc-button--accept-cookies",
            ".accept-cookies"
        ]

        for selector in cookie_selectors:
            try:
                accept_button = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                accept_button.click()
                logger.info(f"Accepted cookies using selector: {selector}")
                random_delay(1, 2)
                break
            except TimeoutException:
                continue
    except Exception:
        logger.info("No cookie prompt found or already accepted")

    logger.info("Waiting for review page to fully load...")
    try:
        # First wait for the page structure to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body"))
        )

        # Then wait for potential review elements
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".ipc-list-card, article, .review-container"))
            )
        except TimeoutException:
            logger.info("No review elements found with standard selectors, continuing anyway")

        # Scroll down several times to trigger lazy loading
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(2)
    except Exception as e:
        logger.warning(f"Error waiting for page load: {e}")

    # Unique reviews in first-seen order, plus the identities already collected.
    unique_reviews = []
    seen_keys = set()

    def collect(reviews):
        """Append reviews not seen before; return how many were added."""
        added = 0
        for review in reviews:
            key = _review_key(review)
            if key not in seen_keys:
                seen_keys.add(key)
                unique_reviews.append(review)
                added += 1
        return added

    # First parse the initial page
    data = scrape_reviews_page(driver.page_source, imdb_id)
    if data['reviews']:
        collect(data['reviews'])
        logger.info(f"Found {len(data['reviews'])} reviews on initial page")
    else:
        logger.warning("No reviews found on initial page. Page structure may have changed.")

        # Try navigating to an alternative review URL
        alt_reviews_url = f"https://www.imdb.com/title/{imdb_id}/reviews/_ajax"
        logger.info(f"Trying alternative reviews URL: {alt_reviews_url}")
        driver.get(alt_reviews_url)
        random_delay(3, 5)

        # Try again with the alternative URL
        data = scrape_reviews_page(driver.page_source, imdb_id)
        if data['reviews']:
            collect(data['reviews'])
            logger.info(f"Found {len(data['reviews'])} reviews on alternative page")

    # List of possible load more button selectors
    load_more_selectors = [
        ".load-more-data",
        "#load-more-trigger",
        "button.ipc-see-more__button",
        ".ipc-pagination__next-button",
        "button.more-reviews",
        ".see-more a",
        ".ipl-load-more__button",
        "button[data-testid='load-more']"
    ]

    page_count = 0
    has_more = True

    # Click "Load More" button until no more results or reached max pages
    while has_more and (max_pages is None or page_count < max_pages):
        try:
            # Try each possible selector for load more button
            load_more = None
            used_selector = None

            for selector in load_more_selectors:
                try:
                    load_more = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    used_selector = selector
                    break
                except TimeoutException:
                    continue

            if not load_more:
                logger.info("No 'Load More' button found. All reviews loaded or button selector changed.")
                break

            logger.info(f"Found 'Load More' button using selector: {used_selector}")

            # Scroll to the button
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", load_more)
            random_delay(1, 2)

            # Try to click the button
            try:
                load_more.click()
                logger.info("Clicked 'Load More' button")
            except ElementClickInterceptedException:
                # If normal click fails, try JavaScript click
                driver.execute_script("arguments[0].click();", load_more)
                logger.info("Used JavaScript to click 'Load More' button")

            # Wait for new content to load with random delay
            random_delay(3, 6)

            # Scroll a bit after loading to ensure new content is rendered
            driver.execute_script("window.scrollBy(0, 300);")
            time.sleep(2)

            # Re-parse the page. The snapshot may repeat earlier reviews, so
            # progress is measured by how many previously unseen reviews it
            # contributes rather than by comparing raw counts.
            new_data = scrape_reviews_page(driver.page_source, imdb_id)
            if new_data['reviews']:
                added = collect(new_data['reviews'])
                if added:
                    page_count += 1
                    logger.info(f"Loaded page {page_count + 1}: Found {added} new reviews (Total now: {len(unique_reviews)})")
                else:
                    logger.info("No new reviews loaded, reached end of reviews")
                    has_more = False
            else:
                logger.info("No new reviews found after clicking load more")
                has_more = False

        except Exception as e:
            logger.error(f"Error loading more reviews: {e}")
            has_more = False

    reviews = {
        'ImdbId': imdb_id,
        'total_reviews': len(unique_reviews),
        'reviews': unique_reviews
    }

    return reviews

def get_movie_reviews_by_title(movie_title, max_pages=None):
    """Search for a movie, let the user pick a match, scrape and save its reviews.

    A new WebDriver is created for the call and always closed at the end. When
    the search yields exactly one match it is selected automatically; otherwise
    the user is prompted on stdin for a 1-based choice. The scraped reviews are
    written as JSON to ``reviews/reviews_<imdb_id>_<sanitized title>.json``.

    Args:
        movie_title: Title text to search for.
        max_pages: Passed through to ``scrape_all_reviews``; ``None`` means no
            page limit.

    Returns:
        The dict returned by ``scrape_all_reviews``, or ``None`` when no movie
        is found, the selection is invalid, or an error occurs (errors are
        logged, not raised).
    """
    logger.info(f"\nSearching for movie: {movie_title}")

    driver = initialize_driver()

    try:
        # Search for the movie
        matches = search_movie_by_title(movie_title, driver)
        if not matches:
            logger.warning("No movies found matching that title.")
            return None

        # Display search results
        print("\nFound the following movies:")
        for position, candidate in enumerate(matches, start=1):
            print(f"{position}. {candidate['title']} ({candidate['year']}) - {candidate['imdb_id']}")

        # A single match needs no prompt; otherwise ask the user to choose.
        match_count = len(matches)
        if match_count == 1:
            picked = matches[0]
            print(f"Auto-selecting the only result: {picked['title']}")
        else:
            try:
                choice = int(input(f"\nSelect a movie (1-{match_count}): "))
            except ValueError:
                logger.error("Please enter a valid number")
                return None
            if not 1 <= choice <= match_count:
                logger.error("Invalid selection")
                return None
            picked = matches[choice - 1]

        imdb_id = picked['imdb_id']
        chosen_title = picked['title']

        logger.info(f"\nScraping reviews for: {chosen_title} ({imdb_id})")

        # Scrape reviews for the selected movie
        data = scrape_all_reviews(imdb_id, driver, max_pages)

        review_total = len(data['reviews'])
        logger.info(f"\nFound {review_total} reviews for {chosen_title}")

        # Create the output directory if it doesn't exist
        os.makedirs("reviews", exist_ok=True)

        # Build a file name from the title: spaces become underscores and
        # characters matched by the pattern below are removed.
        underscored_title = chosen_title.replace(' ', '_')
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", underscored_title)
        filename = f"reviews/reviews_{imdb_id}_{sanitized_title}.json"

        # Save to JSON file
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

        logger.info(f"\nReviews saved to {filename}")
        return data

    except Exception as e:
        logger.error(f"Error: {e}")
        return None
    finally:
        # Always close the driver when done
        driver.quit()

def main():
    """Run the interactive prompt loop until the user enters 'quit'.

    Each round asks for a movie title and an optional page limit, then calls
    ``get_movie_reviews_by_title``. A page limit that is not an integer is
    reported and treated as "no limit".
    """
    banner_lines = (
        "IMDb Movie Review Scraper (Selenium Version)",
        "-------------------------------------------",
        "Note: This script requires Chrome and chromedriver to be installed.",
        "It will automatically download chromedriver if not already installed.",
        "Initial setup may take a moment.",
        "-------------------------------------------",
    )
    for line in banner_lines:
        print(line)

    while True:
        title = input("\nEnter movie title (or 'quit' to exit): ")
        if title.lower() == 'quit':
            return

        raw_limit = input("Enter maximum number of pages to scrape (or press Enter for all): ")
        limit = None
        if raw_limit.strip():
            try:
                limit = int(raw_limit)
            except ValueError:
                # Keep limit as None so every page is scraped.
                print("Invalid number, scraping all pages.")

        get_movie_reviews_by_title(title, limit)

# Start the interactive prompt only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

IMDb Movie Review Scraper (Selenium Version)
-------------------------------------------
Note: This script requires Chrome and chromedriver to be installed.
It will automatically download chromedriver if not already installed.
Initial setup may take a moment.
-------------------------------------------


2025-04-24 12:00:22,949 - INFO - 
Searching for movie: avatar
2025-04-24 12:00:24,241 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-24 12:00:24,385 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-24 12:00:24,476 - INFO - Driver [C:\Users\LEGION\.wdm\drivers\chromedriver\win64\135.0.7049.114\chromedriver-win32/chromedriver.exe] found in cache
2025-04-24 12:00:25,653 - INFO - Searching with URL: https://www.imdb.com/find/?q=avatar&s=tt&exact=true
2025-04-24 12:00:34,836 - INFO - Found 25 results using selector: .find-result-item



Found the following movies:
1. Avatar (2009) - tt0499549
2. Avatar: The Last Airbender (2005) - tt0417299
3. Tomorrow Never Dies (1997) - tt0120347
4. Avatar (2016) - tt5863892
5. Avatar (2022) - tt27931855
6. Avatar (2011) - tt1775309
7. Avatar (1916) - tt0278325
8. Avatar (1941) - tt0154182
9. Avatar (2003) - tt0375570
10. Avatar (2006) - tt1622577
11. Chrysalis (2007) - tt0884335
12. Avatar (1964) - tt0959431
13. Avatar (2005) - tt0497595
14. Rifftrax: Avatar (2010) - tt16492516
15. Cyber Wars (2004) - tt0270841
16. Avatar (2008) - tt1378189
17. Avatar (2021) - tt21833600
18. Avatar (1979) - tt2136754
19. Avatar (2004) - tt0709042
20. Avatar (2022) - tt32623861
21. Avatar (1996) - tt0751080
22. Avatar (2006) - tt0860057
23. Avatar (2007) - tt1015442
24. Avatar (2000) - tt0703664
25. Avatar (2009) - tt10932508


2025-04-24 12:00:37,670 - INFO - 
Scraping reviews for: Avatar (tt0499549)
2025-04-24 12:00:37,671 - INFO - Waiting for review page to fully load...
2025-04-24 12:00:44,864 - INFO - Saved debug HTML to debug_imdb_tt0499549.html
2025-04-24 12:01:10,637 - INFO - No 'Load More' button found. All reviews loaded or button selector changed.
2025-04-24 12:01:10,638 - INFO - 
Found 0 reviews for Avatar
2025-04-24 12:01:10,640 - INFO - 
Reviews saved to reviews/reviews_tt0499549_Avatar.json
