In [None]:
import csv
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Build the Chrome configuration (Chrome is used here rather than Edge).
chrome_options = webdriver.ChromeOptions()

# Headless mode is intentionally left off for now; add "--headless" to the
# tuple below to turn it on.
for chrome_flag in (
    "--window-size=1920,1080",
    "--start-maximized",
    "--disable-gpu",
    # Skip image downloads to save bandwidth and speed up page loading.
    "--blink-settings=imagesEnabled=false",
    # Incognito is optional, but can improve performance.
    "--incognito",
):
    chrome_options.add_argument(chrome_flag)

# Start Chrome through a driver binary resolved by webdriver_manager.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# IMDb search URL (English), assembled from its individual query parameters.
url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature"
    "&release_date=2001-01-01,2023-12-31"
    "&num_votes=200,"
    "&genres=!documentary,!short"
    "&sort=num_votes,desc"
)
driver.get(url)

# Dismiss the cookie consent pop-up. This is best-effort: any failure is
# reported and the script carries on.
try:
    consent_wait = WebDriverWait(driver, 10)
    decline_locator = (By.XPATH, "//button[contains(text(), 'Decline')]")
    decline_button = consent_wait.until(EC.element_to_be_clickable(decline_locator))
    # A JavaScript click is used so that it also works in headless mode.
    driver.execute_script("arguments[0].click();", decline_button)
    print("Clicked 'Decline' button on cookie consent pop-up using JavaScript.")
except Exception as e:
    print(f"Error handling cookie consent pop-up: {e}")
time.sleep(2)

def calculate_load_more_clicks(target_num_movies):
    """Return the number of '50 more' clicks for the requested movie count.

    One click is counted for every full batch of 50 movies, plus one more
    when a partial batch remains.
    """
    batch_size = 50
    full_batches, leftover = divmod(target_num_movies, batch_size)
    return full_batches + 1 if leftover > 0 else full_batches

def _find_load_more_button():
    """Return the visible, enabled '50 more' element, or None if there is none."""
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    # The button label is tried in several languages.
    for button_text in ['50 más', '50 en plus', '50 more']:
        try:
            candidate = driver.find_element(By.XPATH, f"//span[contains(text(), '{button_text}')]")
            if candidate.is_displayed() and candidate.is_enabled():
                return candidate
        except (NoSuchElementException, StaleElementReferenceException):
            # This label is not on the page (or the node was re-rendered);
            # fall through to the next label.
            continue
    return None


def _read_card_details(title_link):
    """Return (rating, release_year, duration) for the movie owning title_link.

    The values are read from the list item that encloses the title link, so
    they always belong to that movie. Any value that cannot be found is
    reported as "N/A".
    """
    # NOTE(review): assumes every result card is rendered as an <li> that
    # contains the title link, the rating and the metadata spans -- confirm
    # against the live page markup.
    cards = title_link.find_elements(By.XPATH, "./ancestor::li[1]")
    if not cards:
        return "N/A", "N/A", "N/A"
    card = cards[0]

    rating_elements = card.find_elements(By.CSS_SELECTOR, "span.ipc-rating-star--rating")
    rating = rating_elements[0].text if rating_elements else "N/A"

    # Only the semantic class is matched; the previous selector also required
    # generated class names (sc-300a8231-7.eaXxft).
    # NOTE(review): assumes the first metadata item is the release year and
    # the second is the duration, as the previous selectors did.
    metadata = card.find_elements(By.CSS_SELECTOR, "span.dli-title-metadata-item")
    release_year = metadata[0].text if len(metadata) > 0 else "N/A"
    duration = metadata[1].text if len(metadata) > 1 else "N/A"
    return rating, release_year, duration


def scrape_movies(num_movies):
    """Scrape up to ``num_movies`` movies from the page loaded in ``driver``.

    The '50 more' button is clicked as many times as
    ``calculate_load_more_clicks`` requests (stopping early if the button
    disappears or a click fails), then every title link on the page is read.

    Args:
        num_movies: Maximum number of movies to return.

    Returns:
        A list of dicts with the keys "Title", "ID", "Rating",
        "Release Year" and "Duration". Fields that are missing for a movie
        hold "N/A"; movies whose title or link cannot be read are skipped.
    """
    movies_data = []
    clicks_needed = calculate_load_more_clicks(num_movies)
    print(f"Will click '50 more' button {clicks_needed} times to load {num_movies} movies.")

    # Click the '50 more' button the required number of times
    for click in range(clicks_needed):
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.7)  # Wait for the page to load completely

            driver.execute_script("window.scrollBy(0, -100);")
            time.sleep(0.05)  # Wait a little for the scroll action to take effect

            load_more_button = _find_load_more_button()
            if load_more_button is None:
                print("No more '50 more' button found.")
                break

            load_more_button.click()
            print(f"Clicked '50 more' button ({click + 1}/{clicks_needed}).")
            time.sleep(2)  # Wait for the new movies to load
        except Exception as e:
            # Best-effort loading: report the problem and scrape what is there.
            print(f"Error clicking '50 more' button: {e}")
            break

    # Now, we scrape the data from the current page
    print("Scraping movie data from the last loaded page.")
    movie_elements = driver.find_elements(By.CSS_SELECTOR, "a.ipc-title-link-wrapper")

    for movie in movie_elements:
        if len(movies_data) >= num_movies:
            break
        try:
            link = movie.get_attribute("href")  # Movie URL
            title_id = link.split("/")[4]  # Extract 'ttXXXXXXX'
            title_name = movie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text  # Movie title
            # Rating, year and duration are looked up inside this movie's own
            # card; page-wide lists indexed by position drift out of step
            # because each movie contributes a varying number of elements.
            rating, release_year, duration = _read_card_details(movie)

            movies_data.append({
                "Title": title_name,
                "ID": title_id,
                "Rating": rating,
                "Release Year": release_year,
                "Duration": duration
            })
            print(f"Scraped movie: {title_name}, ID: {title_id}, Rating: {rating}, Release Year: {release_year}, Duration: {duration}")
        except Exception as e:
            # One unreadable entry must not abort the whole scrape.
            print(f"Error with a movie: {e}")

    return movies_data

# How many movies to collect (adjust to your needs).
num_movies = 5000
movies = scrape_movies(num_movies)

# A timestamped name gives each run its own output file.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"movies_data_{timestamp}.csv"

# Write the header row followed by one row per scraped movie.
csv_columns = ["Title", "ID", "Rating", "Release Year", "Duration"]
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(movies)

print(f"Data has been written to {filename}")

# The browser is deliberately left open to allow for manual inspection.
# driver.quit()


Clicked 'Decline' button on cookie consent pop-up using JavaScript.
Will click '50 more' button 400 times to load 20000 movies.
Clicked '50 more' button (1/400).
Clicked '50 more' button (2/400).
Clicked '50 more' button (3/400).
Clicked '50 more' button (4/400).
Clicked '50 more' button (5/400).
Clicked '50 more' button (6/400).
Clicked '50 more' button (7/400).
Clicked '50 more' button (8/400).
Clicked '50 more' button (9/400).
Clicked '50 more' button (10/400).
Clicked '50 more' button (11/400).
Clicked '50 more' button (12/400).
Clicked '50 more' button (13/400).
Clicked '50 more' button (14/400).
Clicked '50 more' button (15/400).
Clicked '50 more' button (16/400).
Clicked '50 more' button (17/400).
Clicked '50 more' button (18/400).
Clicked '50 more' button (19/400).
Clicked '50 more' button (20/400).
Clicked '50 more' button (21/400).
Clicked '50 more' button (22/400).
Clicked '50 more' button (23/400).
Clicked '50 more' button (24/400).
Clicked '50 more' button (25/400).
Clicke