In [5]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Headless Chrome configured to request English pages ---
options = Options()
options.add_argument("--headless")
options.add_argument("--lang=en-US")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)

BASE_URL = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
SCROLL_PAUSE_SECONDS = 2  # wait after each scroll before re-measuring the page height
MAX_SCROLLS = 50          # upper bound so a page whose height keeps changing cannot hang the cell

driver.get(BASE_URL)
# The cookie is added after the page load and the page is not reloaded here,
# so it can only influence later requests made with this driver.
driver.add_cookie({'name': 'lc-main', 'value': 'en-US'})

# Scroll to the bottom repeatedly until the document height stops growing,
# i.e. no further content was appended by the last scroll.
last_height = driver.execute_script("return document.body.scrollHeight")
for _ in range(MAX_SCROLLS):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_SECONDS)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
else:
    # Only reached when the loop ran out of attempts without the height settling.
    print(f"⚠️ Stopped scrolling after {MAX_SCROLLS} attempts; the page may not be fully loaded")

# Parse the fully scrolled page once; later cells reuse `soup` and `driver`.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
print(f"Found {len(movie_grid)} ul.ipc-metadata-list elements")

Found 1 ul.ipc-metadata-list elements


In [38]:
def _text_or_na(tag):
    """Return the stripped text of a BeautifulSoup tag, or 'N/A' when the tag is missing."""
    return tag.text.strip() if tag else 'N/A'


def _fetch_tmdb_details(imdb_id, timeout=10):
    """Query the TMDB movie endpoint for ``imdb_id`` and return the decoded JSON dict.

    The bearer token is read from the ``TMDB_BEARER_TOKEN`` environment variable
    so that no credential is stored in the notebook.

    Returns:
        The decoded JSON object, or an empty dict when the request fails or the
        response is not a JSON object (the caller then falls back to 'N/A').

    Raises:
        RuntimeError: if ``TMDB_BEARER_TOKEN`` is not set.
    """
    token = os.environ.get("TMDB_BEARER_TOKEN")
    if not token:
        raise RuntimeError(
            "TMDB_BEARER_TOKEN is not set; export your TMDB API read access token before scraping."
        )

    request_headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    full_api_url = f"https://api.themoviedb.org/3/movie/{imdb_id}?language=en-US"
    try:
        # A timeout keeps one stalled API call from blocking the whole scrape.
        response = requests.get(full_api_url, headers=request_headers, timeout=timeout)
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"⚠️ TMDB lookup failed for {imdb_id}: {exc}")
        return {}
    return payload if isinstance(payload, dict) else {}


def scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES):
    """Scrape one movie's detail page and enrich it with TMDB data.

    Args:
        movie: BeautifulSoup element for one chart list item; must contain an
            ``a.ipc-title-link-wrapper`` link and, optionally, an
            ``h3.ipc-title__text`` title.
        driver: Selenium WebDriver used to load the detail page.
        MOVIE_URL: Site root that the item's relative href is appended to.
        VALID_GENRES: Collection of genre names to keep; other chips are ignored.

    Returns:
        A dict with keys 'Title', 'Director', 'Genre', 'Year', 'Release date',
        'Country', 'Rating', 'Votes', 'Budget', 'Revenue' and 'IMDB id'
        (missing values are 'N/A'), or None when the item has no usable link
        or the detail page did not render an <h1> within 10 seconds.

    Raises:
        RuntimeError: if the TMDB token environment variable is not set.
    """
    a_tag = movie.find('a', class_='ipc-title-link-wrapper')
    if not a_tag:
        return None

    relative_url = a_tag.get('href')
    if not relative_url:
        return None

    # Pull the title id (tt followed by digits) out of the href up front so a
    # malformed link is skipped before any page is loaded.
    id_match = re.search(r'/title/(tt\d+)', relative_url)
    if not id_match:
        print(f"⚠️ No IMDb title id in link: {relative_url}")
        return None
    imdb_id = id_match.group(1)

    # Drop the tracking query string before building the absolute URL.
    full_url = MOVIE_URL + relative_url.split('?')[0]
    driver.get(full_url)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
    except TimeoutException:
        print(f"⚠️ Title not found in time: {full_url}")
        return None

    detail_html = driver.page_source
    detail_soup = BeautifulSoup(detail_html, 'html.parser')

    # The title comes from the chart list item, not the detail page; the
    # leading "<rank>. " prefix is stripped.
    h3_tag = movie.find('h3', class_='ipc-title__text')
    if h3_tag:
        raw_title = h3_tag.get_text(strip=True)
        title = re.sub(r'^\d+\.\s*', '', raw_title)
    else:
        title = 'N/A'

    genres = []
    genre_div = detail_soup.find('div', class_='ipc-chip-list__scroller')
    if genre_div:
        chip_texts = (
            span.get_text(strip=True)
            for span in genre_div.find_all('span', class_='ipc-chip__text')
        )
        genres = [text for text in chip_texts if text in VALID_GENRES]
    genre_str = ", ".join(genres) if genres else 'N/A'

    year = _text_or_na(detail_soup.find('a', href=lambda x: x and '/releaseinfo' in x))

    # NOTE(review): these "sc-..." class names look auto-generated and may
    # change between site builds — verify the selectors if values come back 'N/A'.
    rating = _text_or_na(detail_soup.find('span', class_='sc-d541859f-1 imUuxf'))
    votes = _text_or_na(detail_soup.find('div', class_='sc-d541859f-3 dwhNqC'))

    director = _text_or_na(
        detail_soup.find('a', href=lambda x: x and '/?ref_=tt_ov_dr_' in x)
    )

    data = _fetch_tmdb_details(imdb_id)

    # `or 'N/A'` also maps falsy API values (0, '', empty list) to 'N/A'.
    budget = data.get('budget', 'N/A') or 'N/A'
    revenue = data.get('revenue', 'N/A') or 'N/A'
    release_date = data.get('release_date', 'N/A') or 'N/A'
    origin_country = data.get('origin_country', 'N/A') or 'N/A'

    return {
        'Title': title,
        'Director': director,
        'Genre': genre_str,
        'Year': year,
        'Release date': release_date,
        'Country': origin_country,
        'Rating': rating,
        'Votes': votes,
        'Budget': budget,
        'Revenue': revenue,
        'IMDB id': imdb_id
    }


In [41]:
MOVIE_URL = "https://www.imdb.com"
# Only these genre chips are kept; any other chip text on a detail page is ignored.
VALID_GENRES = {
    "Comedy", "Drama", "Action", "Romance", "Horror",
    "Thriller", "Sci-Fi", "Fantasy", "Animation",
    "Adventure", "Biography"
}

# Re-query the parsed chart page so this cell only depends on `soup` and `driver`.
movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
if not movie_grid:
    # Fail with a clear message instead of an IndexError on movie_grid[0].
    raise RuntimeError(
        "No ul.ipc-metadata-list element found in the chart page; "
        "the page may not have loaded or its markup has changed."
    )
movies = movie_grid[0].find_all("li", attrs={"class": "ipc-metadata-list-summary-item"})

movie_details_list = []

for movie in movies:
    details = scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES)
    if details:
        movie_details_list.append(details)
        print(f"Scraped: {details['Title']}")
    time.sleep(1)  # pause between detail-page requests

# NOTE(review): `driver` is never quit in the visible cells — call driver.quit()
# once no later cell needs the browser.
df = pd.DataFrame(movie_details_list)
df.head()


Scraped: The Shawshank Redemption
Scraped: The Godfather
Scraped: The Dark Knight
Scraped: The Godfather Part II
Scraped: 12 Angry Men
Scraped: The Lord of the Rings: The Return of the King
Scraped: Schindler's List
Scraped: Pulp Fiction
Scraped: The Lord of the Rings: The Fellowship of the Ring
Scraped: The Good, the Bad and the Ugly
Scraped: Forrest Gump
Scraped: The Lord of the Rings: The Two Towers
Scraped: Fight Club
Scraped: Inception
Scraped: Star Wars: Episode V - The Empire Strikes Back
Scraped: The Matrix
Scraped: Goodfellas
Scraped: Interstellar
Scraped: One Flew Over the Cuckoo's Nest
Scraped: Se7en
Scraped: It's a Wonderful Life
Scraped: The Silence of the Lambs
Scraped: Seven Samurai
Scraped: Saving Private Ryan
Scraped: City of God
Scraped: The Green Mile
Scraped: Life Is Beautiful
Scraped: Terminator 2: Judgment Day
Scraped: Star Wars: Episode IV - A New Hope
Scraped: Back to the Future
Scraped: Spirited Away
Scraped: The Pianist
Scraped: Gladiator
Scraped: Parasite
Scr

In [None]:
# Maps each theme label to the list of keywords associated with that theme.
# The code that matches these keywords against movie text is not visible in
# this cell, so the notes below are review hints rather than confirmed behavior:
# - Some entries are word stems ("paranoi", "decapitat"), which presumably
#   means matching is substring-based — TODO confirm.
# - NOTE(review): if matching is substring-based, short tokens such as "dc",
#   "tech", "trans", "law", "gay" or "kill" will also hit unrelated words
#   (e.g. "technique", "transport"); consider word-boundary matching.
# - NOTE(review): "low IQ" contains uppercase letters; it will never match if
#   the text is lowercased before comparison — verify against the matcher.
# - "autism" is listed under both "Disability" and "Mental Health", so a
#   single occurrence can tag a movie with both themes.
# - The key "Hero’s Journey" uses a typographic apostrophe while its keyword
#   "hero's journey" uses a straight one; the key is kept as-is because it
#   appears verbatim in downstream output.
themes_dictionary = {
    "Addiction": [
        "addiction", "alcohol", "drugs", "narcotic", "overdose", "substance abuse"],
    "Aliens": [
        "alien", "extraterrestrial", "first contact", "space", "space invasion"],
    "Apocalypse": [
        "apocalypse", "catastrophe", "end of the world", "post-apocalyptic"],
    "Betrayal": [
        "backstab", "betrayal", "traitor"],
    "Comedy": [
        "comedy", "funny", "hilarious", "humor", "laugh"],
    "Coming of Age": [
        "adolescence", "coming of age", "growing up", "maturity", "teenager"],
    "Crime and Justice": [
        "crime", "detective", "investigation", "jury", "juror", "law", "mafia", "justice", "outlaw", "trial"],
    "Destiny": [
        "call to action", "chosen one", "destiny", "fate", "foreseen", "inevitable", "karma", "kismet", "oracle", "predetermined", "prophecy"],
    "Disability": [
        "amputee", "autism", "blind", "cerebral palsy", "deaf", "disability", "disabled", "handicap", "hearing loss",
        "impairment", "mute", "paralysis", "prosthetic", "speech disorder", "visual impairment", "wheelchair"],
    "Dystopia": [
        "authoritarian", "dystopia", "totalitarian"],
    "Environmental Disaster": [
        "climate", "disaster", "ecological", "environment", "pollution"],
    "Family": [
        "adoption", "family bond", "family conflict", "family drama", "family feud", "family loyalty", "family sacrifice",
        "family secret", "family support", "family ties", "family tradition", "family values", "foster care",
        "generational", "household", "inheritance", "motherhood", "parental", "parental love", "reunion", "stepfamily", "fatherhood"],
    "Friendship": [
        "allies", "companionship", "fellowship", "friend", "friendship"],
    "Happy Ending": [
        "feel good", "happy ending", "hopeful"],
    "Hero’s Journey": [
        "hero's journey", "heroic", "mission", "quest"],
    "Identity Crisis": [
        "crisis of identity", "existential", "feeling lost", "inner conflict", "introspection", "lost sense of self",
        "personal awakening", "personal truth", "purpose", "questioning reality", "redefine oneself", "search for meaning",
        "self-discovery", "self-exploration", "sense of self", "what am i"],
    "LGBTI+": [
        "bisexual", "gay", "gender identity", "lesbian", "lgbt", "queer", "sex change", "sex gender", "trans"],
    "Mental Health": [
        "low IQ", "anxiety", "autism", "bipolar", "depression", "down syndrome", "mental illness",
        "paranoi", "psychiatric", "psychological", "schizophrenia"],
    "Moral Dilemma": [
        "ethics", "moral dilemma", "right and wrong"],
    "Music": [
        "choir", "composer", "concert", "conductor", "drum", "guitar", "hip hop", "jazz", "musical", "musician",
        "opera", "orchestra", "performance", "pianist", "piano", "singer", "songwriter", "symphony", "violin"],
    "Overcoming Adversity": [
        "adversity", "overcome", "overcoming", "resilience", "struggle"],
    "Political Corruption": [
        "conspiracy", "corruption", "cover-up", "politician"],
    "Psychological": [
        "agonizing", "disturbing", "mind-bending", "nail-biting", "twisted"],
    "Psychological Abuse": [
        "bullying", "gaslight", "mentally abusive", "psychological abuse", "psychological harassment", "psychological manipulation", "verbal abuse"],
    "Redemption": [
        "atonement", "forgiveness", "redemption"],
    "Revenge": [
        "payback", "revenge", "vengeance"],
    "Social Inequality": [
        "capitalism", "upper class", "inequality", "injustice", "oppression", "poverty"],
    "Super-Hero": [
        "avengers", "batman", "dark knight", "dc", "marvel", "spider-man", "spiderman", "superman"],
    "Survival": [
        "danger", "escape", "stranded", "survival", "survivor", "wilderness"],
    "Technology and AI": [
        "android", "artificial intelligence", "biotechnology", "cybernetic", "interface",
        "machine uprising", "nanotechnology", "robot", "synthetic", "tech", "virtual reality"],
    "Violence": [
        "assault", "behead", "brutal", "decapitat", "gore", "homicide", "kill", "murder", "stabbing", "violence", "violent"],
    "War": [
        "battle", "combat", "military", "soldier", "world war", "world-war"],
    "Zombie": [
        "undead", "zombie"]
}

themes
[Friendship]                                                         15
[Overcoming Adversity]                                                8
[Survival]                                                            7
[Violence]                                                            7
[War]                                                                 6
                                                                     ..
[Family, Identity Crisis]                                             1
[Crime and Justice, Destiny, Super-Hero]                              1
[Comedy, Revenge, Violence, War]                                      1
[Crime and Justice, Overcoming Adversity, Social Inequality, War]     1
[Betrayal, Crime and Justice, Revenge, Survival, Violence, War]       1
Name: count, Length: 164, dtype: int64

In [6]:
# The first CSV column is the saved row index; without index_col=0 it is
# loaded as a stray "Unnamed: 0" data column.
summary = pd.read_csv('themes_summary.csv', index_col=0)
summary

Unnamed: 0,themes_clean,movie_count,total_profit,average_rating
0,Crime and Justice,57,6011680000.0,8.329825
1,Violence,50,7096710000.0,8.336
2,Friendship,49,12646410000.0,8.302041
3,Survival,36,8707576000.0,8.322222
4,Overcoming Adversity,33,5516913000.0,8.387879
5,Betrayal,30,2338507000.0,8.353333
6,Destiny,29,12410630000.0,8.403448
7,Hero’s Journey,26,9017728000.0,8.376923
8,Mental Health,24,3807240000.0,8.304167
9,War,21,4664872000.0,8.3
