##### BAIS 3250 Project Webscrape 
##### IMDb Movie Data Web Scraping
###### May 9, 2025
###### Ella Solie

###### Import libraries

In [1]:
import html
import random
import re
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

###### Open IMDb movie dataset URL

In [4]:
# Configure Chrome to run without a visible browser window.
# `options.headless = True` was deprecated in Selenium 4.8 and removed in 4.13;
# on current releases that assignment is silently ignored, so the headless
# switch is passed to Chrome as a command-line argument instead.
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL for IMDb movie list
url = "https://www.imdb.com/list/ls026212430/"

# Open the URL
driver.get(url)
time.sleep(random.uniform(5, 10))  # Wait a random 5-10 s for the page to load

###### Create lists to store extracted data. Extract data from movie dataset.

In [6]:
# Parallel lists: every list receives exactly one entry per movie element,
# with None standing in for any field that could not be read.
imdb_ids, titles, ratings, years, votes, durations, movie_ratings = [], [], [], [], [], [], []


def _parse_vote_count(raw_text):
    """Convert a vote label such as "(2.4M)", "(981K)" or "1,234" to an int.

    Parentheses and thousands separators are stripped first. Raises
    ValueError when the remaining text is not numeric.
    """
    cleaned = raw_text.replace("(", "").replace(")", "").replace(",", "").strip()
    for suffix, multiplier in (("M", 1_000_000), ("K", 1_000)):
        if suffix in cleaned:
            # round() instead of int(): a float product can land just below
            # the intended whole number, and int() would truncate it.
            return round(float(cleaned.replace(suffix, "").strip()) * multiplier)
    return int(cleaned)


# `except Exception` (not a bare `except`) is used throughout so that
# KeyboardInterrupt can still stop the cell; the `finally` guarantees the
# browser is closed even when the loop is interrupted or fails.
try:
    while True:  # Loop through pages
        # Extract movie elements
        movies = driver.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item")

        for movie in movies:
            # Extract IMDb ID from href
            try:
                link = movie.find_element(By.TAG_NAME, 'a').get_attribute('href')
                imdb_ids.append(link.split('/')[4])  # e.g., /title/tt1234567/
            except Exception:
                imdb_ids.append(None)

            # Extract title
            try:
                titles.append(movie.find_element(By.CLASS_NAME, "ipc-title__text").text)
            except Exception:
                titles.append(None)

            # Extract IMDb star rating
            try:
                rating = movie.find_element(By.CLASS_NAME, "ipc-rating-star--rating").text
                ratings.append(float(rating))
            except Exception:
                ratings.append(None)

            # Extract vote count
            try:
                vote_text = movie.find_element(By.XPATH, ".//span[contains(@class, 'voteCount')]").text
                votes.append(_parse_vote_count(vote_text))
            except Exception:
                votes.append(None)

            # Extract year
            try:
                years.append(movie.find_element(By.XPATH, ".//span[contains(@class, 'dli-title')]").text)
            except Exception:
                years.append(None)

            # Extract duration
            try:
                durations.append(movie.find_element(By.XPATH, ".//span[contains(@class, 'metadata-item') and contains(text(), 'm')]").text)
            except Exception:
                durations.append(None)

            # Extract movie rating
            try:
                movie_ratings.append(movie.find_element(By.XPATH, ".//span[contains(@class, 'metadata-item') and (contains(text(), 'G') or contains(text(), 'PG') or contains(text(), 'PG-13') or contains(text(), 'R') or contains(text(), 'Approved') or contains(text(), 'Not Rated') or contains(text(), 'NC-17'))]").text)
            except Exception:
                movie_ratings.append(None)

        # Find the "Next" button
        try:
            next_buttons = driver.find_elements(By.XPATH, "//button/span[contains(text(), 'Next')]")
            if not next_buttons:
                print("No more pages to scrape.")
                break  # Exit loop if there is no "Next" button

            next_button = next_buttons[0]  # Select first match
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(random.uniform(2, 5))  # Allow time
            next_button.click()
            time.sleep(random.uniform(5, 10))  # Wait for new page to load
        except Exception as exc:
            # Report the real cause instead of claiming the list simply ended.
            print(f"Stopping pagination after an error: {exc!r}")
            break
finally:
    # Close the Chrome browser
    driver.quit()

# Assemble the scraped columns into a single DataFrame.
# The (column name, values) pairs are listed in the order the columns appear.
scraped_columns = [
    ("imdb_id", imdb_ids),
    ("movie_title", titles),
    ("average_star_rating_movie", ratings),
    ("release_year_movie", years),
    ("vote_count_movie", votes),
    ("duration_movie", durations),
    ("movie_rating", movie_ratings),
]
df = pd.DataFrame(dict(scraped_columns))

print(df)

No more pages to scrape.
       imdb_id                                        movie_title  \
0    tt0109830                                    1. Forrest Gump   
1    tt0068646                                   2. The Godfather   
2    tt0120737  3. The Lord of the Rings: The Fellowship of th...   
3    tt0167260   4. The Lord of the Rings: The Return of the King   
4    tt0167261           5. The Lord of the Rings: The Two Towers   
..         ...                                                ...   
245  tt0092005                                   246. Stand by Me   
246  tt0061452                                 247. Casino Royale   
247  tt0381061                                 248. Casino Royale   
248  tt0360556                                249. Fahrenheit 451   
249  tt1302006                                  250. The Irishman   

     average_star_rating_movie release_year_movie  vote_count_movie  \
0                          8.8               1994         2400000.0   
1   

In [9]:
# Function to scrape the top review from IMDb for a given movie
def scrape_top_review(imdb_id, timeout=15):
    """Return the text of the first review found on a title's IMDb reviews page.

    Parameters
    ----------
    imdb_id : str or None
        IMDb title identifier, e.g. "tt0109830".
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 15).

    Returns
    -------
    str or None
        The first review's text with markup removed and HTML entities
        decoded, or None when the id is missing, the request fails, or no
        review block is found in the page.
    """
    # A missing id (None/NaN) would otherwise be formatted into a bogus
    # ".../title/None/reviews" URL and waste a request.
    if not isinstance(imdb_id, str) or not imdb_id.strip():
        return None

    url = f"https://www.imdb.com/title/{imdb_id}/reviews"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept-Language': 'en-US,en;q=0.9'
    }

    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request for {imdb_id} failed: {exc}")
        return None

    page_source = response.text
    match = re.search(r'<div class="ipc-html-content-inner-div" role="presentation">(.+?)</div>', page_source, re.DOTALL)
    if match is None:
        return None

    # Turn line breaks into spaces first so adjacent sentences are not glued together.
    review_markup = re.sub(r'<br\s*/?>', ' ', match.group(1))
    review_text = re.sub(r'<.*?>', '', review_markup)
    # Decode entities only after the tags are stripped, so literal "&lt;...&gt;"
    # text in a review is not mistaken for markup.
    return html.unescape(review_text).strip()

# Scrape top reviews for all IMDb IDs collected.
# top_reviews stays aligned with df's rows: exactly one entry is appended per id.
top_reviews = []

for imdb_id in df['imdb_id']:
    # Rows whose IMDb id could not be extracted hold a missing value;
    # there is no reviews page to request for them.
    if pd.isna(imdb_id):
        top_reviews.append(None)
        continue

    print(f"Scraping first review for {imdb_id}.")
    try:
        review = scrape_top_review(imdb_id)
    except requests.RequestException as exc:
        # One failed request must not discard the reviews already collected.
        print(f"Could not fetch review for {imdb_id}: {exc}")
        review = None
    top_reviews.append(review)
    time.sleep(1.5)  # Be respectful

# Add review column to DataFrame
df['review_text'] = top_reviews

print(df)

Scraping top review for tt0109830...
Scraping top review for tt0068646...
Scraping top review for tt0120737...
Scraping top review for tt0167260...
Scraping top review for tt0167261...
Scraping top review for tt0914798...
Scraping top review for tt0414387...
Scraping top review for tt0332280...
Scraping top review for tt0421715...
Scraping top review for tt0281358...
Scraping top review for tt0480249...
Scraping top review for tt0396171...
Scraping top review for tt0458352...
Scraping top review for tt0253474...
Scraping top review for tt0398808...
Scraping top review for tt0431308...
Scraping top review for tt1078588...
Scraping top review for tt0382625...
Scraping top review for tt0808151...
Scraping top review for tt3062096...
Scraping top review for tt0822832...
Scraping top review for tt0343818...
Scraping top review for tt0110148...
Scraping top review for tt0107290...
Scraping top review for tt0108052...
Scraping top review for tt1010048...
Scraping top review for tt0243155...
S

###### Save the dataframe to CSV file

In [11]:
# Write the combined movie + review table to a UTF-8 CSV without the index column
df.to_csv(
    "esolie_bais3250_project_webscrape_imdb.csv",
    encoding='utf-8',
    index=False,
)