In [18]:
import pandas as pd
import time
import re
from bs4 import BeautifulSoup
import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("--headless")
options.add_argument("--lang=en-US")
options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)

BASE_URL = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

driver.get(BASE_URL)
driver.add_cookie({'name': 'lc-main', 'value': 'en-US'})

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
print(f"Found {len(movie_grid)} ul.ipc-metadata-list elements")

Found 1 ul.ipc-metadata-list elements


In [38]:
def scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES):
    a_tag = movie.find('a', class_='ipc-title-link-wrapper')
    if not a_tag:
        return None
    
    relative_url = a_tag.get('href')
    full_url = MOVIE_URL + relative_url.split('?')[0]
    driver.get(full_url)

    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
    except:
        print(f"⚠️ Title not found in time: {full_url}")
        return None

    detail_html = driver.page_source
    detail_soup = BeautifulSoup(detail_html, 'html.parser')

    h3_tag = movie.find('h3', class_='ipc-title__text')
    if h3_tag:
        raw_title = h3_tag.get_text(strip=True)
        title = re.sub(r'^\d+\.\s*', '', raw_title)
    else:
        title = 'N/A'

    genres = []
    genre_div = detail_soup.find('div', class_='ipc-chip-list__scroller')
    if genre_div:
        genre_spans = genre_div.find_all('span', class_='ipc-chip__text')
        for span in genre_spans:
            genre_text = span.get_text(strip=True)
            if genre_text in VALID_GENRES:
                genres.append(genre_text)
    genre_str = ", ".join(genres) if genres else 'N/A'

    year_tag = detail_soup.find('a', href=lambda x: x and '/releaseinfo' in x)
    year = year_tag.text.strip() if year_tag else 'N/A'

    rating_tag = detail_soup.find('span', class_='sc-d541859f-1 imUuxf')
    rating = rating_tag.text.strip() if rating_tag else 'N/A'

    votes_tag = detail_soup.find('div', class_='sc-d541859f-3 dwhNqC')
    votes = votes_tag.text.strip() if votes_tag else 'N/A'

    director_tag = detail_soup.find('a', href=lambda x: x and '/?ref_=tt_ov_dr_' in x)
    director = director_tag.text.strip() if director_tag else 'N/A'

    id = relative_url.split('/')[2]

    headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI5ZmY4MmIzMzFlODllZjZkZDNjMmI1ODc5N2JjYTIzZSIsIm5iZiI6MTc0Nzc0OTM2Mi40MjksInN1YiI6IjY4MmM4OWYyNzUyNzQ4MjRjMmUyNTFlMiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.UKbsLozO_r8xvD28bYK8F5uz_YLzrWDq3ElTgrOLhuo"
    }
    full_api_url = f"https://api.themoviedb.org/3/movie/{id}?language=en-US"
    driver.get(full_api_url)
    response = requests.get(full_api_url, headers=headers)
    data = response.json()

    budget = data.get('budget', 'N/A') or 'N/A'
    revenue = data.get('revenue', 'N/A') or 'N/A'
    release_date = data.get('release_date', 'N/A') or 'N/A'
    origin_country = data.get('origin_country', 'N/A') or 'N/A'

    return {
        'Title': title,
        'Director': director,
        'Genre': genre_str,
        'Year': year,
        'Release date': release_date,
        'Country': origin_country,
        'Rating': rating,
        'Votes': votes,
        'Budget': budget,
        'Revenue': revenue,
        'IMDB id': id
    }


In [41]:
MOVIE_URL = "https://www.imdb.com"
VALID_GENRES = {
    "Comedy", "Drama", "Action", "Romance", "Horror", 
    "Thriller", "Sci-Fi", "Fantasy", "Animation", 
    "Adventure", "Biography"
}
movie_grid = soup.find_all("ul", attrs={"class": "ipc-metadata-list"})
movies = movie_grid[0].find_all("li", attrs={"class": "ipc-metadata-list-summary-item"})

movie_details_list = []

for movie in movies:
    details = scrape_movie_details(movie, driver, MOVIE_URL, VALID_GENRES)
    if details:
        movie_details_list.append(details)
        print(f"Scraped: {details['Title']}")
    time.sleep(1) 

df = pd.DataFrame(movie_details_list)
print(df.head())


Scraped: The Shawshank Redemption
Scraped: The Godfather
Scraped: The Dark Knight
Scraped: The Godfather Part II
Scraped: 12 Angry Men
Scraped: The Lord of the Rings: The Return of the King
Scraped: Schindler's List
Scraped: Pulp Fiction
Scraped: The Lord of the Rings: The Fellowship of the Ring
Scraped: The Good, the Bad and the Ugly
Scraped: Forrest Gump
Scraped: The Lord of the Rings: The Two Towers
Scraped: Fight Club
Scraped: Inception
Scraped: Star Wars: Episode V - The Empire Strikes Back
Scraped: The Matrix
Scraped: Goodfellas
Scraped: Interstellar
Scraped: One Flew Over the Cuckoo's Nest
Scraped: Se7en
Scraped: It's a Wonderful Life
Scraped: The Silence of the Lambs
Scraped: Seven Samurai
Scraped: Saving Private Ryan
Scraped: City of God
Scraped: The Green Mile
Scraped: Life Is Beautiful
Scraped: Terminator 2: Judgment Day
Scraped: Star Wars: Episode IV - A New Hope
Scraped: Back to the Future
Scraped: Spirited Away
Scraped: The Pianist
Scraped: Gladiator
Scraped: Parasite
Scr

In [66]:
import requests
import json

url = "https://api.themoviedb.org/3/movie/tt0111161?language=en-US"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI5ZmY4MmIzMzFlODllZjZkZDNjMmI1ODc5N2JjYTIzZSIsIm5iZiI6MTc0Nzc0OTM2Mi40MjksInN1YiI6IjY4MmM4OWYyNzUyNzQ4MjRjMmUyNTFlMiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.UKbsLozO_r8xvD28bYK8F5uz_YLzrWDq3ElTgrOLhuo"
}

response = requests.get(url, headers=headers)
data = response.json()

print(json.dumps(data, indent=4))

{
    "adult": false,
    "backdrop_path": "/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg",
    "belongs_to_collection": null,
    "budget": 25000000,
    "genres": [
        {
            "id": 18,
            "name": "Drama"
        },
        {
            "id": 80,
            "name": "Crime"
        }
    ],
    "homepage": "",
    "id": 278,
    "imdb_id": "tt0111161",
    "origin_country": [
        "US"
    ],
    "original_language": "en",
    "original_title": "The Shawshank Redemption",
    "overview": "Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.",
    "popularity": 33.2316,
    "poster_path": "/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg",
    "production_

In [45]:
df = pd.read_csv('C:/Users/hsdc/OneDrive/Documentos/IRONHACK/Week 3/imdb/top_250.csv')
df

Unnamed: 0,Title,Director,Genre,Year,Release date,Country,Rating,Votes,Budget,Revenue,IMDB id
0,The Shawshank Redemption,Frank Darabont,Drama,1994,1994-09-23,['US'],9.3,3M,25000000.0,2.834147e+07,tt0111161
1,The Godfather,Francis Ford Coppola,Drama,1972,1972-03-14,['US'],9.2,2.1M,6000000.0,2.450664e+08,tt0068646
2,The Dark Knight,Christopher Nolan,"Action, Drama, Thriller",2008,2008-07-16,['US'],9.0,3M,185000000.0,1.004558e+09,tt0468569
3,The Godfather Part II,Francis Ford Coppola,Drama,1974,1974-12-20,['US'],9.0,1.4M,13000000.0,1.026000e+08,tt0071562
4,12 Angry Men,Sidney Lumet,Drama,1957,1957-04-10,['US'],9.0,926K,397751.0,4.360000e+06,tt0050083
...,...,...,...,...,...,...,...,...,...,...,...
245,Groundhog Day,Harold Ramis,"Comedy, Drama, Fantasy, Romance",1993,1993-02-11,['US'],8.0,717K,14600000.0,7.110878e+07,tt0107048
246,The Help,Tate Taylor,Drama,2011,2011-08-09,['US'],8.1,513K,25000000.0,2.166000e+08,tt1454029
247,Amores Perros,Alejandro G. Iñárritu,"Drama, Thriller",2000,2000-06-16,['MX'],8.0,268K,2000000.0,2.090847e+07,tt0245712
248,Drishyam,Nishikant Kamat,"Drama, Thriller",2015,2015-07-30,['IN'],8.2,101K,4600000.0,1.800000e+07,tt4430212


In [49]:
df.to_csv('C:/Users/hsdc/OneDrive/Documentos/IRONHACK/Week 3/imdb/top_250.csv', index=False)

In [50]:
df.dtypes

Title            object
Director         object
Genre            object
Year              int64
Release date     object
Country          object
Rating          float64
Votes            object
Budget           object
Revenue          object
IMDB id          object
dtype: object

In [53]:
pip install sentence-transformers scikit-learn pandas

Collecting sentence-transformersNote: you may need to restart the kernel to use updated packages.

  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.1-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.0-cp312-cp312-win_amd64.whl.metadata (29 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.31.4-py3-none-any.whl.metadata (13 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_a

In [58]:
df2 = df

def get_description(imdb_id):
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI5ZmY4MmIzMzFlODllZjZkZDNjMmI1ODc5N2JjYTIzZSIsIm5iZiI6MTc0Nzc0OTM2Mi40MjksInN1YiI6IjY4MmM4OWYyNzUyNzQ4MjRjMmUyNTFlMiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.UKbsLozO_r8xvD28bYK8F5uz_YLzrWDq3ElTgrOLhuo"
    }
    url = f"https://api.themoviedb.org/3/movie/{imdb_id}?language=en-US"
    response = requests.get(url, headers=headers)
    data = response.json()
    return data.get('overview', 'N/A') or 'N/A'

df2['description'] = df2['IMDB id'].apply(get_description)
df2

Unnamed: 0,Title,Director,Genre,Year,Release date,Country,Rating,Votes,Budget,Revenue,IMDB id,description
0,The Shawshank Redemption,Frank Darabont,Drama,1994,1994-09-23,['US'],9.3,3M,25000000.0,28341469.0,tt0111161,Imprisoned in the 1940s for the double murder ...
1,The Godfather,Francis Ford Coppola,Drama,1972,1972-03-14,['US'],9.2,2.1M,6000000.0,245066411.0,tt0068646,"Spanning the years 1945 to 1955, a chronicle o..."
2,The Dark Knight,Christopher Nolan,"Action, Drama, Thriller",2008,2008-07-16,['US'],9.0,3M,185000000.0,1004558444.0,tt0468569,Batman raises the stakes in his war on crime. ...
3,The Godfather Part II,Francis Ford Coppola,Drama,1974,1974-12-20,['US'],9.0,1.4M,13000000.0,102600000.0,tt0071562,In the continuing saga of the Corleone crime f...
4,12 Angry Men,Sidney Lumet,Drama,1957,1957-04-10,['US'],9.0,926K,397751.0,4360000.0,tt0050083,The defense and the prosecution have rested an...
...,...,...,...,...,...,...,...,...,...,...,...,...
245,Groundhog Day,Harold Ramis,"Comedy, Drama, Fantasy, Romance",1993,1993-02-11,['US'],8.0,717K,14600000.0,71108778.0,tt0107048,"A narcissistic TV weatherman, along with his a..."
246,The Help,Tate Taylor,Drama,2011,2011-08-09,['US'],8.1,513K,25000000.0,216600000.0,tt1454029,Aibileen Clark is a middle-aged African-Americ...
247,Amores Perros,Alejandro G. Iñárritu,"Drama, Thriller",2000,2000-06-16,['MX'],8.0,268K,2000000.0,20908467.0,tt0245712,A fatalistic car crash in Mexico city sets off...
248,Drishyam,Nishikant Kamat,"Drama, Thriller",2015,2015-07-30,['IN'],8.2,101K,4600000.0,18000000.0,tt4430212,A simple street-smart man tries to protect his...


In [61]:
df2.loc[df['Title'] == 'Gangs of Wasseypur', ['description']] = ['Spanning generations in the coal town of Wasseypur, the film follows the rise and fall of rival crime families locked in a violent feud over power, revenge, and survival. As one generation gives way to the next, the stakes grow deadlier, with sons inheriting the blood debts of their fathers and turning Wasseypur into a battlefield of ambition and betrayal.']
df.drop('Description', axis=1, inplace=True)
df2

Unnamed: 0,Title,Director,Genre,Year,Release date,Country,Rating,Votes,Budget,Revenue,IMDB id,description
0,The Shawshank Redemption,Frank Darabont,Drama,1994,1994-09-23,['US'],9.3,3M,25000000.0,28341469.0,tt0111161,Imprisoned in the 1940s for the double murder ...
1,The Godfather,Francis Ford Coppola,Drama,1972,1972-03-14,['US'],9.2,2.1M,6000000.0,245066411.0,tt0068646,"Spanning the years 1945 to 1955, a chronicle o..."
2,The Dark Knight,Christopher Nolan,"Action, Drama, Thriller",2008,2008-07-16,['US'],9.0,3M,185000000.0,1004558444.0,tt0468569,Batman raises the stakes in his war on crime. ...
3,The Godfather Part II,Francis Ford Coppola,Drama,1974,1974-12-20,['US'],9.0,1.4M,13000000.0,102600000.0,tt0071562,In the continuing saga of the Corleone crime f...
4,12 Angry Men,Sidney Lumet,Drama,1957,1957-04-10,['US'],9.0,926K,397751.0,4360000.0,tt0050083,The defense and the prosecution have rested an...
...,...,...,...,...,...,...,...,...,...,...,...,...
245,Groundhog Day,Harold Ramis,"Comedy, Drama, Fantasy, Romance",1993,1993-02-11,['US'],8.0,717K,14600000.0,71108778.0,tt0107048,"A narcissistic TV weatherman, along with his a..."
246,The Help,Tate Taylor,Drama,2011,2011-08-09,['US'],8.1,513K,25000000.0,216600000.0,tt1454029,Aibileen Clark is a middle-aged African-Americ...
247,Amores Perros,Alejandro G. Iñárritu,"Drama, Thriller",2000,2000-06-16,['MX'],8.0,268K,2000000.0,20908467.0,tt0245712,A fatalistic car crash in Mexico city sets off...
248,Drishyam,Nishikant Kamat,"Drama, Thriller",2015,2015-07-30,['IN'],8.2,101K,4600000.0,18000000.0,tt4430212,A simple street-smart man tries to protect his...


In [180]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load model once
model = SentenceTransformer('all-MiniLM-L6-v2')

themes_dictionary = {
    "Addiction": [
        "addiction", "alcohol", "drugs", "narcotic", "overdose", "substance abuse"],
    "Aliens": [
        "alien", "extraterrestrial", "first contact", "space", "space invasion"],
    "Apocalypse": [
        "apocalypse", "catastrophe", "end of the world", "post-apocalyptic"],
    "Betrayal": [
        "backstab", "betrayal", "traitor"],
    "Comedy": [
        "comedy", "funny", "hilarious", "humor", "laugh"],
    "Coming of Age": [
        "adolescence", "coming of age", "growing up", "maturity", "teenager"],
    "Crime and Justice": [
        "crime", "detective", "investigation", "jury", "juror", "law", "mafia", "justice", "outlaw", "trial"],
    "Destiny": [
        "call to action", "chosen one", "destiny", "fate", "foreseen", "inevitable", "karma", "kismet", "oracle", "predetermined", "prophecy"],
    "Disability": [
        "amputee", "autism", "blind", "cerebral palsy", "deaf", "disability", "disabled", "handicap", "hearing loss",
        "impairment", "mute", "paralysis", "prosthetic", "speech disorder", "visual impairment", "wheelchair"],
    "Dystopia": [
        "authoritarian", "dystopia", "totalitarian"],
    "Environmental Disaster": [
        "climate", "disaster", "ecological", "environment", "pollution"],
    "Family": [
        "adoption", "family bond", "family conflict", "family drama", "family feud", "family loyalty", "family sacrifice",
        "family secret", "family support", "family ties", "family tradition", "family values", "foster care",
        "generational", "household", "inheritance", "motherhood", "parental", "parental love", "reunion", "stepfamily", "fatherhood"],
    "Friendship": [
        "allies", "companionship", "fellowship", "friend", "friendship"],
    "Happy Ending": [
        "feel good", "happy ending", "hopeful"],
    "Hero’s Journey": [
        "hero's journey", "heroic", "mission", "quest"],
    "Identity Crisis": [
        "crisis of identity", "existential", "feeling lost", "inner conflict", "introspection", "lost sense of self",
        "personal awakening", "personal truth", "purpose", "questioning reality", "redefine oneself", "search for meaning",
        "self-discovery", "self-exploration", "sense of self", "what am i"],
    "LGBTI+": [
        "bisexual", "gay", "gender identity", "lesbian", "lgbt", "queer", "sex change", "sex gender", "trans"],
    "Mental Health": [
        "low IQ", "anxiety", "autism", "bipolar", "depression", "down syndrome", "mental illness",
        "paranoi", "psychiatric", "psychological", "schizophrenia"],
    "Moral Dilemma": [
        "ethics", "moral dilemma", "right and wrong"],
    "Music": [
        "choir", "composer", "concert", "conductor", "drum", "guitar", "hip hop", "jazz", "musical", "musician",
        "opera", "orchestra", "performance", "pianist", "piano", "singer", "songwriter", "symphony", "violin"],
    "Overcoming Adversity": [
        "adversity", "overcome", "overcoming", "resilience", "struggle"],
    "Political Corruption": [
        "conspiracy", "corruption", "cover-up", "politician"],
    "Psychological": [
        "agonizing", "disturbing", "mind-bending", "nail-biting", "twisted"],
    "Psychological Abuse": [
        "bullying", "gaslight", "mentally abusive", "psychological abuse", "psychological harassment", "psychological manipulation", "verbal abuse"],
    "Redemption": [
        "atonement", "forgiveness", "redemption"],
    "Revenge": [
        "payback", "revenge", "vengeance"],
    "Social Inequality": [
        "capitalism", "upper class", "inequality", "injustice", "oppression", "poverty"],
    "Super-Hero": [
        "avengers", "batman", "dark knight", "dc", "marvel", "spider-man", "spiderman", "superman"],
    "Survival": [
        "danger", "escape", "stranded", "survival", "survivor", "wilderness"],
    "Technology and AI": [
        "android", "artificial intelligence", "biotechnology", "cybernetic", "interface",
        "machine uprising", "nanotechnology", "robot", "synthetic", "tech", "virtual reality"],
    "Violence": [
        "assault", "behead", "blood", "brutal", "decapitat", "gore", "homicide", "kill", "murder", "stabbing", "violence", "violent"],
    "War": [
        "battle", "combat", "military", "soldier", "world war", "world-war"],
    "Zombie": [
        "undead", "zombie"]
}


def assign_themes(description, themes_dict):
    matched_themes = []
    text = description.lower()
    for theme, keywords in themes_dict.items():
        if any(kw in text for kw in keywords):
            matched_themes.append(theme)
    return matched_themes if matched_themes else ['Uncategorized']

# Apply to your dataframe
df2['themes'] = df2['description'].fillna("").apply(lambda x: assign_themes(x, themes_dictionary))

df2['themes'].value_counts()

themes
[Friendship]                                                         15
[Overcoming Adversity]                                                8
[Survival]                                                            7
[Violence]                                                            7
[War]                                                                 6
                                                                     ..
[Family, Identity Crisis]                                             1
[Crime and Justice, Destiny, Super-Hero]                              1
[Comedy, Revenge, Violence, War]                                      1
[Crime and Justice, Overcoming Adversity, Social Inequality, War]     1
[Betrayal, Crime and Justice, Revenge, Survival, Violence, War]       1
Name: count, Length: 164, dtype: int64

In [181]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load model (only if not already loaded)
model = SentenceTransformer('all-MiniLM-L6-v2')

semantic_labels = list(themes_dictionary.keys())
semantic_embeddings = model.encode(semantic_labels, convert_to_tensor=True)

def semantic_theme_match(description, model, semantic_labels, semantic_embeddings, threshold=0.45):
    if not description.strip():
        return ['Uncategorized']
    
    desc_embedding = model.encode(description, convert_to_tensor=True)
    similarities = util.cos_sim(desc_embedding, semantic_embeddings)[0]
    best_score, best_index = torch.max(similarities, dim=0)
    
    if best_score.item() > threshold:
        return [semantic_labels[best_index.item()]]
    else:
        return ['Uncategorized']

# Find rows that are still 'Uncategorized'
uncategorized_mask = df2['themes'].apply(lambda x: 'Uncategorized' in x)

# Apply semantic fallback only to those
for idx, row in df2[uncategorized_mask].iterrows():
    new_theme = semantic_theme_match(row['description'], model, semantic_labels, semantic_embeddings)

    if new_theme != ['Uncategorized']:
        print(f"✅ Updated: '{row['Title']}' → {new_theme}")
        df2.at[idx, 'themes'] = new_theme

# Print all still uncategorized
still_uncategorized = df2[df2['themes'].apply(lambda x: 'Uncategorized' in x)]
print(f"\n❗ Remaining Uncategorized Movies: {len(still_uncategorized)}")
print(still_uncategorized['Title'].tolist())


✅ Updated: 'Inside Out' → ['Identity Crisis']
✅ Updated: 'Mad Max: Fury Road' → ['Apocalypse']

❗ Remaining Uncategorized Movies: 0
[]


In [191]:
df3 = df2.copy()

# Convert Budget and Revenue to numeric (coerce errors)
df3['Budget'] = pd.to_numeric(df3['Budget'], errors='coerce')
df3['Revenue'] = pd.to_numeric(df3['Revenue'], errors='coerce')

# Handle Themes: ensure lists or empty list
df3['themes_clean'] = df3['Themes'].apply(
    lambda x: x if isinstance(x, list) else []
)

# Explode themes into separate rows
df_exploded = df3.explode('themes_clean')

# Calculate profit column
df_exploded['profit'] = df_exploded['Revenue'] - df_exploded['Budget']

# Group by theme and aggregate
summary = df_exploded.groupby('themes_clean').agg(
    movie_count=('themes_clean', 'count'),
    total_profit=('profit', 'sum'),
    average_rating=('Rating', 'mean')
).sort_values(by='movie_count', ascending=False)

summary.to_csv('themes_summary.csv')
summary


Unnamed: 0_level_0,movie_count,total_profit,average_rating
themes_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Crime and Justice,57,6011680000.0,8.329825
Violence,50,7096710000.0,8.336
Friendship,49,12646410000.0,8.302041
Survival,36,8707576000.0,8.322222
Overcoming Adversity,33,5516913000.0,8.387879
Betrayal,30,2338507000.0,8.353333
Destiny,29,12410630000.0,8.403448
Hero’s Journey,26,9017728000.0,8.376923
Mental Health,24,3807240000.0,8.304167
War,21,4664872000.0,8.3
