In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import pandas as pd
import time

In [5]:

# Scrape per-movie details from Metacritic's paginated movie browser and save
# them to a CSV file.
#
# For every listing page in [FIRST_PAGE, LAST_PAGE] the script collects the
# movie links inside the "c-productListings" container, visits each one, and
# records title, user rating, number of user ratings, release year, duration
# and genres.
from selenium.common.exceptions import TimeoutException, WebDriverException

FIRST_PAGE = 601
LAST_PAGE = 685  # inclusive
LISTING_URL = (
    "https://www.metacritic.com/browse/movie/"
    "?releaseYearMin=1910&releaseYearMax=2024&page={page}"
)
LISTING_TIMEOUT = 10  # seconds to wait for the listing container to appear
MOVIE_PAGE_DELAY = 2  # seconds to let a movie page finish rendering
HERO_ITEM_CLASS = "c-heroMetadata_item u-inline"
OUTPUT_COLUMNS = ["Title", "Rating", "Number of User Reviews", "Year", "Duration", "Genres"]
OUTPUT_PATH = "movies_data.csv"


def _first_span_text(tag):
    """Return the stripped text of the first <span> inside *tag*, or None.

    *tag* may be None (element not found); None is returned in that case too.
    """
    if tag is None:
        return None
    span = tag.find("span")
    return span.text.strip() if span is not None else None


def _scrape_movie_page(driver, link):
    """Open *link* in *driver* and return one movie record as a dict.

    Fields that cannot be located are left as None (or an empty list for
    genres). Only WebDriver errors are treated as "field not found", so
    KeyboardInterrupt and programming errors still propagate.
    """
    driver.get(link)
    time.sleep(MOVIE_PAGE_DELAY)

    title = None
    try:
        title = driver.find_element(
            By.XPATH, '//div[@data-testid="hero-title"]/h1'
        ).text.strip()
    except WebDriverException:
        print("Title not found for this movie.")

    reviews_number = None
    try:
        review_span = driver.find_element(
            By.XPATH, ".//span[contains(text(), 'User Ratings')]"
        )
        # Keep only the digits of the label, dropping separators and words.
        reviews_number = ''.join(filter(str.isdigit, review_span.text.strip()))
    except WebDriverException:
        print("User reviews not found for this movie.")

    genres = []
    try:
        genre_elements = driver.find_elements(
            By.XPATH,
            '//ul[@class="c-genreList u-flexbox g-inner-spacing-top-medium"]/li',
        )
        genre_texts = (element.text.strip() for element in genre_elements)
        genres = [text for text in genre_texts if text]
    except WebDriverException:
        print("Genres not found for this movie.")

    rating = None
    release_year = None
    duration = None
    try:
        # Parse the page exactly once per movie so a failure here can never
        # leave the year/duration lookups reading the previous movie's HTML.
        soup = BeautifulSoup(driver.page_source, "html.parser")
    except WebDriverException:
        print("Rating not found for this movie.")
        print("Release year not found for this movie.")
        print("Duration not found for this movie.")
    else:
        rating = _first_span_text(soup.find('div', class_='c-siteReviewScore_user'))
        hero_items = soup.find_all('li', class_=HERO_ITEM_CLASS)
        # The first hero metadata item is read as the release year and the
        # fourth as the duration.
        if hero_items:
            release_year = _first_span_text(hero_items[0])
        if len(hero_items) >= 4:
            duration = _first_span_text(hero_items[3])

    return {
        "Title": title,
        "Rating": rating,
        "Number of User Reviews": reviews_number,
        "Year": release_year,
        "Duration": duration,
        "Genres": genres,
    }


driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

movie_data = []
scrape_completed = False
try:
    for i in range(FIRST_PAGE, LAST_PAGE + 1):
        driver.get(LISTING_URL.format(page=i))
        try:
            # Wait for the main container holding all movies instead of
            # assuming it exists the moment navigation returns.
            container = WebDriverWait(driver, LISTING_TIMEOUT).until(
                EC.presence_of_element_located((By.CLASS_NAME, "c-productListings"))
            )
        except TimeoutException:
            print(f"Listing container not found on page {i}; skipping.")
            continue

        movies = container.find_elements(By.XPATH, ".//a[contains(@href, '/movie/')]")
        hrefs = (movie.get_attribute("href") for movie in movies)
        # Drop empty hrefs and repeated links while preserving page order.
        movie_links = list(dict.fromkeys(href for href in hrefs if href))

        for link in movie_links:
            record = _scrape_movie_page(driver, link)
            movie_data.append(record)
            print(f"{record['Title']} data extract End")
            # No driver.back() here: the next step is always an absolute
            # driver.get(), so navigating back only cost an extra page load.
        print(f"\n Page{i}")
    scrape_completed = True
finally:
    try:
        # Close the browser even when the scrape is interrupted or fails.
        driver.quit()
    finally:
        # Create a DataFrame from whatever was collected.
        df = pd.DataFrame(movie_data, columns=OUTPUT_COLUMNS)
        # Save partial results on failure, but do not overwrite an existing
        # CSV with an empty one when the run failed before scraping anything.
        if scrape_completed or movie_data:
            df.to_csv(OUTPUT_PATH, index=False)

Sleepwalkers data extract End
All Eyez on Me data extract End
Balls of Fury data extract End
Never Die Alone data extract End
User reviews not found for this movie.
King of the Gypsies data extract End
The Vatican Tapes data extract End
Just Visiting data extract End
Radio data extract End
The Life Before Her Eyes data extract End
Abandoned data extract End
Ouija data extract End
No Escape data extract End
Gigantic data extract End
User reviews not found for this movie.
The Last Time data extract End
Confessions of a Shopaholic data extract End
The November Man data extract End
Goats data extract End
Lay the Favorite data extract End
Ghosts of War data extract End
The Rite data extract End
Paparazzi data extract End
Raising Helen data extract End
Nutty Professor II: The Klumps data extract End
User reviews not found for this movie.
The Other Side of Heaven data extract End

 Page601
Laws of Attraction data extract End
The Seeker: The Dark Is Rising data extract End
Seeking Justice data