In [12]:
import pandas as pd
import time
import re
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# --- Browser setup ---------------------------------------------------------
# Headless Chrome with an en-US language flag and a desktop Chrome user agent.
# NOTE(review): presumably chosen so the site serves its English desktop
# markup, which the CSS class selectors used later depend on -- confirm.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
)
SCROLL_PAUSE_SECONDS = 2   # wait after each scroll before re-measuring the page
MAX_SCROLL_ROUNDS = 50     # safety cap so a page that keeps growing cannot hang the cell

options = Options()
options.add_argument("--headless")
options.add_argument("--lang=en-US")
options.add_argument(f"user-agent={USER_AGENT}")

driver = webdriver.Chrome(options=options)

BASE_URL = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

# --- Load the chart page ---------------------------------------------------
driver.get(BASE_URL)
# Selenium only accepts cookies for the domain that is currently loaded, so
# the cookie is added after get(); it applies to the page loads that follow.
driver.add_cookie({'name': 'lc-main', 'value': 'en-US'})

# Scroll to the bottom repeatedly until the document height stops changing,
# i.e. no further content is appended after a scroll.
last_height = driver.execute_script("return document.body.scrollHeight")
for scroll_round in range(MAX_SCROLL_ROUNDS):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_SECONDS)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
else:
    # Only reached when the loop ran out of rounds without the height settling.
    print(f"Stopped scrolling after {MAX_SCROLL_ROUNDS} rounds; page height was still changing")

# --- Parse the fully scrolled page -----------------------------------------
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
print(f"Found {len(movie_grid)} ul.ipc-metadata-list elements")

Found 1 ul.ipc-metadata-list elements


In [13]:
def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or 'N/A' when the tag is missing."""
    return tag.text.strip() if tag else 'N/A'


def scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES):
    """Open one movie's detail page and collect its summary fields.

    Args:
        movie: Parsed element for one chart entry. It is searched for the
            title link (``a.ipc-title-link-wrapper``) and the title heading
            (``h3.ipc-title__text``).
        driver: Selenium WebDriver used to load the detail page.
        MOVIE_URL: Prefix prepended to the link's href (query string removed).
        VALID_GENRES: Collection of genre labels to keep; any other chip text
            found on the page is ignored.

    Returns:
        A dict with the keys 'Title', 'Genre', 'Year', 'Rating' and
        'Director' (each 'N/A' when the element is not found), or None when
        the entry has no usable link or the detail page's <h1> does not
        appear within 10 seconds.
    """
    a_tag = movie.find('a', class_='ipc-title-link-wrapper')
    if not a_tag:
        return None

    relative_url = a_tag.get('href')
    if not relative_url:
        # An anchor with a missing/empty href cannot be followed: None would
        # crash on .split() and '' would silently load the site root instead.
        return None

    # Local import keeps this function self-contained without touching the
    # notebook's import cell.
    from selenium.common.exceptions import TimeoutException

    full_url = MOVIE_URL + relative_url.split('?')[0]
    driver.get(full_url)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
    except TimeoutException:
        # Only the wait timing out is treated as "skip this movie"; anything
        # else (including Ctrl-C) propagates instead of being swallowed.
        print(f"⚠️ Title not found in time: {full_url}")
        return None

    detail_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The title comes from the chart entry itself, with the leading
    # "<rank>. " prefix stripped.
    h3_tag = movie.find('h3', class_='ipc-title__text')
    if h3_tag:
        title = re.sub(r'^\d+\.\s*', '', h3_tag.get_text(strip=True))
    else:
        title = 'N/A'

    # Keep only chips from the first chip scroller whose text is a known genre.
    genres = []
    genre_div = detail_soup.find('div', class_='ipc-chip-list__scroller')
    if genre_div:
        chip_texts = (
            span.get_text(strip=True)
            for span in genre_div.find_all('span', class_='ipc-chip__text')
        )
        genres = [text for text in chip_texts if text in VALID_GENRES]
    genre_str = ", ".join(genres) if genres else 'N/A'

    # First link whose href contains '/releaseinfo'; its text is used as the year.
    year = _text_or_na(
        detail_soup.find('a', href=lambda x: x and '/releaseinfo' in x)
    )

    # NOTE(review): this class string looks auto-generated and may change
    # between site builds, in which case Rating silently becomes 'N/A' -- confirm.
    rating = _text_or_na(
        detail_soup.find('span', class_='sc-d541859f-1 imUuxf')
    )

    # First link whose href carries the '/?ref_=tt_ov_dr_' marker.
    director = _text_or_na(
        detail_soup.find('a', href=lambda x: x and '/?ref_=tt_ov_dr_' in x)
    )

    return {
        'Title': title,
        'Genre': genre_str,
        'Year': year,
        'Rating': rating,
        'Director': director
    }


In [14]:
MOVIE_URL = "https://www.imdb.com"
VALID_GENRES = {
    "Comedy", "Drama", "Action", "Romance", "Horror",
    "Thriller", "Sci-Fi", "Fantasy", "Animation",
    "Adventure", "Biography"
}
REQUEST_DELAY_SECONDS = 1  # pause between detail-page loads

movie_details_list = []

# The browser is only needed for the scraping loop, so it is shut down in a
# `finally` block: the headless Chrome process is released even when a page
# load fails or the cell is interrupted. Re-run the driver setup cell before
# re-running this one.
try:
    movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
    if not movie_grid:
        # Fail with a readable message instead of an IndexError on movie_grid[0].
        raise RuntimeError("No ul.ipc-metadata-list element found in the chart page HTML")
    movies = movie_grid[0].find_all("li", attrs={"class": "ipc-metadata-list-summary-item"})

    for movie in movies:
        details = scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES)
        if details:
            movie_details_list.append(details)
            print(f"Scraped: {details['Title']}")
        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    driver.quit()

# One row per successfully scraped movie.
df = pd.DataFrame(movie_details_list)
print(df.head())


Scraped: The Shawshank Redemption
Scraped: The Godfather
Scraped: The Dark Knight
Scraped: The Godfather Part II
Scraped: 12 Angry Men
Scraped: The Lord of the Rings: The Return of the King
Scraped: Schindler's List
Scraped: Pulp Fiction
Scraped: The Lord of the Rings: The Fellowship of the Ring
Scraped: The Good, the Bad and the Ugly
Scraped: Forrest Gump
Scraped: The Lord of the Rings: The Two Towers
Scraped: Fight Club
Scraped: Inception
Scraped: Star Wars: Episode V - The Empire Strikes Back
Scraped: The Matrix
Scraped: Goodfellas
Scraped: Interstellar
Scraped: One Flew Over the Cuckoo's Nest
Scraped: Se7en
Scraped: It's a Wonderful Life
Scraped: The Silence of the Lambs
Scraped: Seven Samurai
Scraped: Saving Private Ryan
Scraped: City of God
Scraped: The Green Mile
Scraped: Life Is Beautiful
Scraped: Terminator 2: Judgment Day
Scraped: Star Wars: Episode IV - A New Hope
Scraped: Back to the Future
Scraped: Spirited Away
Scraped: The Pianist
Scraped: Gladiator
Scraped: Parasite
Scr

In [15]:
# Show the scraped DataFrame as this cell's output.
df

Unnamed: 0,Title,Genre,Year,Rating,Director
0,The Shawshank Redemption,Drama,1994,9.3,Frank Darabont
1,The Godfather,Drama,1972,9.2,Francis Ford Coppola
2,The Dark Knight,"Action, Drama, Thriller",2008,9.0,Christopher Nolan
3,The Godfather Part II,Drama,1974,9.0,Francis Ford Coppola
4,12 Angry Men,Drama,1957,9.0,Sidney Lumet
...,...,...,...,...,...
245,Groundhog Day,"Comedy, Drama, Fantasy, Romance",1993,8.0,Harold Ramis
246,The Help,Drama,2011,8.1,Tate Taylor
247,Amores Perros,"Drama, Thriller",2000,8.0,Alejandro G. Iñárritu
248,Drishyam,"Drama, Thriller",2015,8.2,Nishikant Kamat


In [None]:
# to_csv() without a path only returns the CSV text and writes nothing, so
# give it a file name. index=False leaves out the positional row index,
# which carries no information here.
OUTPUT_CSV = "imdb_top_250.csv"
df.to_csv(OUTPUT_CSV, index=False)