# Setting up

In [3]:
# STEP 1: Install Required Libraries
# Run this cell ONCE, then follow the instructions below

print("üì¶ Installing dependencies (this takes ~30 seconds)...")
print("=" * 60)

# Uninstall conflicting packages
!pip uninstall -y numpy pandas scipy

# Install compatible versions together
!pip install -q numpy==1.26.4 pandas==2.2.2 scipy==1.13.1

# Install gensim and nltk
!pip install -q gensim==4.3.3 nltk

print("\n" + "=" * 60)
print("‚úÖ Installation complete!")
print("=" * 60)
print("\nüîÑ üîÑ üîÑ STOP! MANDATORY NEXT STEP üîÑ üîÑ üîÑ")
print("\nYou MUST restart the runtime before continuing:")
print("   1. Click 'Runtime' in the menu bar above")
print("   2. Select 'Restart runtime'")
print("   3. When prompted, click 'Yes' to confirm")
print("   4. Then run the NEXT cell to import libraries")
print("\n‚ö†Ô∏è  Do NOT skip this step or you will get errors!")
print("=" * 60)

üì¶ Installing dependencies (this takes ~30 seconds)...
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompa

In [4]:
# STEP 2: Import Libraries
# Run this cell ONLY AFTER restarting runtime

print("üìö Importing libraries...")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter

import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

print("=" * 60)
print("‚úÖ All libraries loaded and ready!")
print("=" * 60)
print(f"NumPy version: {np.__version__}")
print(f"Gensim version: {gensim.__version__}")
print("\nüéâ You're ready to proceed with the assignment!")

üìö Importing libraries...
‚úÖ All libraries loaded and ready!
NumPy version: 1.26.4
Gensim version: 4.3.3

üéâ You're ready to proceed with the assignment!


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load your CSV cleaned data
df = pd.read_csv('/content/drive/MyDrive/saltburn.csv')  # Replace with your cleaned CSV filename

print(f"‚úÖ Dataset loaded successfully!")
print(f"Dataset contains {len(df)} items")
print(f"\nColumns available: {df.columns.tolist()}")

‚úÖ Dataset loaded successfully!
Dataset contains 280 items

Columns available: ['review-data href', 'audience-reviews__name', 'audience-reviews__name href', 'audience-reviews__duration', 'audience-reviews__review']


# TOPIC MODELING

In [6]:
# Enhanced stopwords list for topic modeling
stopwords = [
    # Basic English stopwords
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is",
    "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
    "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now", "ve", "ll", "amp",
    "also", "would", "could","may", "said", "say", "new", "first", "last", "long", "little", "much",
    "well", "still", "even", "back", "good", "many", "make", "made", "us", "really"
]

# ADD YOUR OWN DOMAIN-SPECIFIC STOPWORDS HERE
# Examples: for restaurant reviews, add "restaurant", "food", "place"
#           for book reviews, add "book", "story", "read"
custom_stopwords = ['it']  # Fill in words specific to your dataset

stopwords.extend(custom_stopwords)

print(f"‚úÖ Stopwords list loaded: {len(stopwords)} words to filter out")
print(f"Custom stopwords added: {custom_stopwords}")

‚úÖ Stopwords list loaded: 152 words to filter out
Custom stopwords added: ['it']


In [7]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_for_topics(text):
    """
    Aggressive text preprocessing for topic modeling:
    - Lowercase
    - Remove punctuation
    - Remove stopwords
    - Lemmatize (reduce to base form)
    """
    if pd.isna(text):
        return []

    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation and split into words
    words = re.findall(r'\b[a-z]+\b', text)

    # Remove stopwords and short words (< 3 characters)
    words = [word for word in words if word not in stopwords and len(word) >= 3]

    # Lemmatize words (reduce to base form)
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

print("‚úÖ Preprocessing function ready")

‚úÖ Preprocessing function ready


In [8]:
# Test preprocessing on one text
text_column = 'audience-reviews__review'

sample_text = df[text_column].iloc[0]
processed = preprocess_for_topics(sample_text)

print("Text Preprocessing Test:")
print(f"Original: {sample_text[:150]}...")
print(f"\nProcessed words: {processed}")
print(f"\nNotice: lowercase, no punctuation, lemmatized, stopwords removed")

Text Preprocessing Test:
Original: A well-made, well acted melodrama that is stylish, provocative and deceptive....

Processed words: ['acted', 'melodrama', 'stylish', 'provocative', 'deceptive']

Notice: lowercase, no punctuation, lemmatized, stopwords removed


In [9]:
# Apply preprocessing to entire dataset
df['processed_for_topics'] = df[text_column].apply(preprocess_for_topics)

print("‚úÖ Preprocessing complete!")
print(f"\nProcessed {len(df)} documents")
print(f"\nExample processed documents:")
for i in range(3):
    print(f"{i+1}. {df['processed_for_topics'].iloc[i][:10]}...")

‚úÖ Preprocessing complete!

Processed 280 documents

Example processed documents:
1. ['acted', 'melodrama', 'stylish', 'provocative', 'deceptive']...
2. ['dont', 'like', 'jacob', 'elordi', 'play', 'character']...
3. ['great', 'plot', 'cinematography', 'quite', 'disgusting', 'disturbing', 'moment', 'ruined']...


### Technical Checkpoint 1: Data Preparation

In [None]:
# Checkpoint: Verify data is ready for topic modeling
doc_lengths = [len(doc) for doc in df['processed_for_topics']]
avg_length = np.mean(doc_lengths)
all_words = [word for doc in df['processed_for_topics'] for word in doc]
vocab_size = len(set(all_words))

print("üìä DATA PREPARATION CHECK")
print("=" * 40)
print(f"Number of documents: {len(df)}")
print(f"Vocabulary size: {vocab_size}")
print(f"Average document length: {avg_length:.1f} words")
print(f"Shortest document: {min(doc_lengths)} words")
print(f"Longest document: {max(doc_lengths)} words")

if avg_length < 10:
    print("\n‚ö†Ô∏è WARNING: Average document length is very short. Topic modeling may struggle.")
if vocab_size < 100:
    print("\n‚ö†Ô∏è WARNING: Vocabulary size is small. Consider reducing custom stopwords.")

Building Our Topic Model

In [None]:
# Create Gensim dictionary and corpus
dictionary = corpora.Dictionary(df['processed_for_topics'])
corpus = [dictionary.doc2bow(doc) for doc in df['processed_for_topics']]

print("üìñ Dictionary and corpus created!")
print(f"Total unique words in dictionary: {len(dictionary)}")
print(f"Total documents in corpus: {len(corpus)}")
print(f"\nExample word-to-ID mappings:")
for i, (word_id, word) in enumerate(list(dictionary.items())[:10]):
    print(f"  ID {word_id}: {word}")

**Experimenting with Number of Topics **

In [None]:
# Experiment: Try different numbers of topics
def train_and_display_topics(corpus, dictionary, num_topics):
    """
    Train an LDA model and display discovered topics
    """
    print(f"\n{'='*60}")
    print(f"MODEL WITH {num_topics} TOPICS")
    print(f"{'='*60}")

    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=15,
        alpha='auto',
        eta='auto'
    )

    for idx in range(num_topics):
        words = model.show_topic(idx, 10)
        word_list = [word for word, prob in words]
        print(f"Topic {idx}: {', '.join(word_list)}")

    return model

print("üß™ EXPERIMENTING WITH DIFFERENT NUMBERS OF TOPICS")
print("Watch how topics change as we increase the number...\n")

model_3 = train_and_display_topics(corpus, dictionary, 3)
model_4 = train_and_display_topics(corpus, dictionary, 4)
model_5 = train_and_display_topics(corpus, dictionary, 5)
model_7 = train_and_display_topics(corpus, dictionary, 7)
model_10 = train_and_display_topics(corpus, dictionary, 10)


# **Choose your best model**

In [None]:
# Train your final model with your chosen number of topics
num_topics = 7  # Fill in your chosen number (3, 5, or 7)

print(f"ü§ñ Training final LDA model with {num_topics} topics...\n")

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=20,  # More passes for better final model
    alpha='auto',
    eta='auto'
)

print("‚úÖ Final model training complete!\n")
print("üéØ YOUR DISCOVERED TOPICS")
print("=" * 70)

for idx in range(num_topics):
    words = lda_model.show_topic(idx, 10)
    word_list = [word for word, prob in words]
    print(f"\nTopic {idx}: {', '.join(word_list)}")
    print(f"Your interpretation/label: _____________________")

In [None]:
# Visualize your topics
import matplotlib.pyplot as plt

fig, axes = plt.subplots(num_topics, 1, figsize=(12, 4*num_topics))

if num_topics == 1:
    axes = [axes]

for idx in range(num_topics):
    words_weights = lda_model.show_topic(idx, 10)
    words = [word for word, weight in words_weights]
    weights = [weight for word, weight in words_weights]

    axes[idx].barh(range(len(words)), weights, color='skyblue')
    axes[idx].set_yticks(range(len(words)))
    axes[idx].set_yticklabels(words)
    axes[idx].set_xlabel('Weight')
    axes[idx].set_title(f'Topic {idx} - [Add your label here]')
    axes[idx].invert_yaxis()

plt.tight_layout()
plt.show()

print("üìä Topic visualizations complete!")

In [None]:
# Get dominant topic for each document
def get_document_topics(lda_model, corpus):
    """
    Get dominant topic assignment for each document
    """
    topic_assignments = []

    for doc in corpus:
        topic_dist = lda_model.get_document_topics(doc)
        if topic_dist:  # Check if not empty
            dominant_topic = max(topic_dist, key=lambda x: x[1])
            topic_assignments.append({
                'topic_num': dominant_topic[0],
                'topic_prob': round(dominant_topic[1], 3)
            })
        else:
            topic_assignments.append({
                'topic_num': -1,
                'topic_prob': 0.0
            })

    return topic_assignments

# Get topic assignments
topic_info = get_document_topics(lda_model, corpus)
df['dominant_topic'] = [t['topic_num'] for t in topic_info]
df['topic_probability'] = [t['topic_prob'] for t in topic_info]

print("‚úÖ Topic assignments complete!")
print(f"\nTopic distribution across documents:")
print(df['dominant_topic'].value_counts().sort_index())

In [None]:
# Sample documents from each topic for validation
print("üîç DOCUMENT-TOPIC VALIDATION CHECK")
print("=" * 70)
print("For each topic, read sample documents and assess if the assignment makes sense:\n")

for topic_num in range(num_topics):
    print(f"\nüìå TOPIC {topic_num}")
    print("=" * 50)

    # Get top words for this topic
    topic_words = lda_model.show_topic(topic_num, 8)
    word_list = [word for word, prob in topic_words]
    print(f"Keywords: {', '.join(word_list)}")

    # Get sample documents from this topic
    topic_docs = df[df['dominant_topic'] == topic_num]

    if len(topic_docs) == 0:
        print("No documents assigned to this topic.")
        continue

    print(f"\nDocuments in this topic: {len(topic_docs)}")
    print(f"\nSample documents (read and assess if topic assignment makes sense):\n")

    for i, (idx, row) in enumerate(topic_docs.head(3).iterrows(), 1):
        print(f"  {i}. {row[text_column][:150]}...")
        print(f"     Probability: {row['topic_probability']:.3f}")
        print()

# Final Insights

One of the most noticeable themes is shock and disgust, especially around Barry Keoghan‚Äôs performance and the film‚Äôs disturbing scenes (Topic 0). Many reviews describe these moments as gross or unsettling, but they also highlight them as part of what makes the movie memorable. Instead of seeing the discomfort as a flaw, audiences often seem to appreciate it as a deliberate part of the film‚Äôs power.
Some focus on the story‚Äôs themes, like class politics and social manipulation, often comparing it to films like The Talented Mr. Ripley (Topic 1). These responses suggest that people enjoy the moral confusion in Saltburn because it gives them something to interpret and talk about, rather than providing a clear ‚Äúlesson.‚Äù
Several topics center on specific scenes, character development, and stylistic choices (Topics 2 and 5). Viewers often express strong reactions to these aspects, even if they don‚Äôt always like them. This mix of fascination and ambivalence shows that audiences are drawn to films that are psychologically rich and unpredictable.
People talk about the confusing morals in Saltburn as a key part of the movie‚Äôs appeal. They enjoy the strange, intense, and visually striking style even without a clear ending. And these reactions suggest that modern audiences are interested in stories that are morally complex, emotionally engaging, and intellectually stimulating, even if they don‚Äôt offer neat resolutions or traditional happy endings.




In [None]:
# STEP 1: Install Required Libraries
# Run this cell ONCE, then follow the instructions below

print("üì¶ Installing dependencies (this takes ~30 seconds)...")
print("=" * 60)

# Uninstall conflicting packages
!pip uninstall -y numpy pandas scipy

# Install compatible versions together
!pip install -q numpy==1.26.4 pandas==2.2.2 scipy==1.13.1

# Install gensim and nltk
!pip install -q gensim==4.3.3 nltk

print("\n" + "=" * 60)
print("‚úÖ Installation complete!")
print("=" * 60)
print("\nüîÑ üîÑ üîÑ STOP! MANDATORY NEXT STEP üîÑ üîÑ üîÑ")
print("\nYou MUST restart the runtime before continuing:")
print("   1. Click 'Runtime' in the menu bar above")
print("   2. Select 'Restart runtime'")
print("   3. When prompted, click 'Yes' to confirm")
print("   4. Then run the NEXT cell to import libraries")
print("\n‚ö†Ô∏è  Do NOT skip this step or you will get errors!")
print("=" * 60)

In [None]:
# STEP 2: Import Libraries
# Run this cell ONLY AFTER restarting runtime

print("üìö Importing libraries...")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter

import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

print("=" * 60)
print("‚úÖ All libraries loaded and ready!")
print("=" * 60)
print(f"NumPy version: {np.__version__}")
print(f"Gensim version: {gensim.__version__}")
print("\nüéâ You're ready to proceed with the assignment!")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load your CSV cleaned data
df = pd.read_csv('/content/youtube (1).csv')  # Replace with your cleaned CSV filename

print(f"‚úÖ Dataset loaded successfully!")
print(f"Dataset contains {len(df)} items")
print(f"\nColumns available: {df.columns.tolist()}")

# Topic Modeling 2: Youtube Comments

In [None]:
# Enhanced stopwords list for topic modeling
stopwords = [
    # Basic English stopwords
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is",
    "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
    "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above",
    "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now", "ve", "ll", "amp",
    "also", "would", "could","may", "said", "say", "new", "first", "last", "long", "little", "much",
    "well", "still", "even", "back", "good", "many", "make", "made", "us", "really"
]

# ADD YOUR OWN DOMAIN-SPECIFIC STOPWORDS HERE
# Examples: for restaurant reviews, add "restaurant", "food", "place"
#           for book reviews, add "book", "story", "read"
custom_stopwords = ['it']  # Fill in words specific to your dataset

stopwords.extend(custom_stopwords)

print(f"‚úÖ Stopwords list loaded: {len(stopwords)} words to filter out")
print(f"Custom stopwords added: {custom_stopwords}")

In [None]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_for_topics(text):
    """
    Aggressive text preprocessing for topic modeling:
    - Lowercase
    - Remove punctuation
    - Remove stopwords
    - Lemmatize (reduce to base form)
    """
    if pd.isna(text):
        return []

    # Convert to lowercase
    text = str(text).lower()

    # Remove punctuation and split into words
    words = re.findall(r'\b[a-z]+\b', text)

    # Remove stopwords and short words (< 3 characters)
    words = [word for word in words if word not in stopwords and len(word) >= 3]

    # Lemmatize words (reduce to base form)
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

print("‚úÖ Preprocessing function ready")

In [None]:
# Test preprocessing on one text
text_column = 'yt-core-attributed-string'

sample_text = df[text_column].iloc[0]
processed = preprocess_for_topics(sample_text)

print("Text Preprocessing Test:")
print(f"Original: {sample_text[:150]}...")
print(f"\nProcessed words: {processed}")
print(f"\nNotice: lowercase, no punctuation, lemmatized, stopwords removed")

In [None]:
# Apply preprocessing to entire dataset
df['processed_for_topics'] = df[text_column].apply(preprocess_for_topics)

print("‚úÖ Preprocessing complete!")
print(f"\nProcessed {len(df)} documents")
print(f"\nExample processed documents:")
for i in range(3):
    print(f"{i+1}. {df['processed_for_topics'].iloc[i][:10]}...")

# Technical Checkpoint 2: Data Preparation

In [None]:
# Checkpoint: Verify data is ready for topic modeling
doc_lengths = [len(doc) for doc in df['processed_for_topics']]
avg_length = np.mean(doc_lengths)
all_words = [word for doc in df['processed_for_topics'] for word in doc]
vocab_size = len(set(all_words))

print("üìä DATA PREPARATION CHECK")
print("=" * 40)
print(f"Number of documents: {len(df)}")
print(f"Vocabulary size: {vocab_size}")
print(f"Average document length: {avg_length:.1f} words")
print(f"Shortest document: {min(doc_lengths)} words")
print(f"Longest document: {max(doc_lengths)} words")

if avg_length < 10:
    print("\n‚ö†Ô∏è WARNING: Average document length is very short. Topic modeling may struggle.")
if vocab_size < 100:
    print("\n‚ö†Ô∏è WARNING: Vocabulary size is small. Consider reducing custom stopwords.")

Building our Topic Model

In [None]:
# Create Gensim dictionary and corpus
dictionary = corpora.Dictionary(df['processed_for_topics'])
corpus = [dictionary.doc2bow(doc) for doc in df['processed_for_topics']]

print("üìñ Dictionary and corpus created!")
print(f"Total unique words in dictionary: {len(dictionary)}")
print(f"Total documents in corpus: {len(corpus)}")
print(f"\nExample word-to-ID mappings:")
for i, (word_id, word) in enumerate(list(dictionary.items())[:10]):
    print(f"  ID {word_id}: {word}")

Experimenting with Number of Topics

In [None]:
# Experiment: Try different numbers of topics
def train_and_display_topics(corpus, dictionary, num_topics):
    """
    Train an LDA model and display discovered topics
    """
    print(f"\n{'='*60}")
    print(f"MODEL WITH {num_topics} TOPICS")
    print(f"{'='*60}")

    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=15,
        alpha='auto',
        eta='auto'
    )

    for idx in range(num_topics):
        words = model.show_topic(idx, 10)
        word_list = [word for word, prob in words]
        print(f"Topic {idx}: {', '.join(word_list)}")

    return model

print("üß™ EXPERIMENTING WITH DIFFERENT NUMBERS OF TOPICS")
print("Watch how topics change as we increase the number...\n")

model_3 = train_and_display_topics(corpus, dictionary, 3)
model_4 = train_and_display_topics(corpus, dictionary, 4)
model_5 = train_and_display_topics(corpus, dictionary, 5)
model_7 = train_and_display_topics(corpus, dictionary, 7)
model_10 = train_and_display_topics(corpus, dictionary, 10)

# **Choose your best model**

In [None]:
# Train your final model with your chosen number of topics
num_topics = 7  # Fill in your chosen number (3, 5, or 7)

print(f"ü§ñ Training final LDA model with {num_topics} topics...\n")

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=20,  # More passes for better final model
    alpha='auto',
    eta='auto'
)

print("‚úÖ Final model training complete!\n")
print("üéØ YOUR DISCOVERED TOPICS")
print("=" * 70)

for idx in range(num_topics):
    words = lda_model.show_topic(idx, 10)
    word_list = [word for word, prob in words]
    print(f"\nTopic {idx}: {', '.join(word_list)}")
    print(f"Your interpretation/label: _____________________")

In [None]:
# Visualize your topics
import matplotlib.pyplot as plt

fig, axes = plt.subplots(num_topics, 1, figsize=(12, 4*num_topics))

if num_topics == 1:
    axes = [axes]

for idx in range(num_topics):
    words_weights = lda_model.show_topic(idx, 10)
    words = [word for word, weight in words_weights]
    weights = [weight for word, weight in words_weights]

    axes[idx].barh(range(len(words)), weights, color='skyblue')
    axes[idx].set_yticks(range(len(words)))
    axes[idx].set_yticklabels(words)
    axes[idx].set_xlabel('Weight')
    axes[idx].set_title(f'Topic {idx} - [Add your label here]')
    axes[idx].invert_yaxis()

plt.tight_layout()
plt.show()

print("üìä Topic visualizations complete!")

In [None]:
# Get dominant topic for each document
def get_document_topics(lda_model, corpus):
    """
    Get dominant topic assignment for each document
    """
    topic_assignments = []

    for doc in corpus:
        topic_dist = lda_model.get_document_topics(doc)
        if topic_dist:  # Check if not empty
            dominant_topic = max(topic_dist, key=lambda x: x[1])
            topic_assignments.append({
                'topic_num': dominant_topic[0],
                'topic_prob': round(dominant_topic[1], 3)
            })
        else:
            topic_assignments.append({
                'topic_num': -1,
                'topic_prob': 0.0
            })

    return topic_assignments

# Get topic assignments
topic_info = get_document_topics(lda_model, corpus)
df['dominant_topic'] = [t['topic_num'] for t in topic_info]
df['topic_probability'] = [t['topic_prob'] for t in topic_info]

print("‚úÖ Topic assignments complete!")
print(f"\nTopic distribution across documents:")
print(df['dominant_topic'].value_counts().sort_index())

In [None]:
# Sample documents from each topic for validation
print("üîç DOCUMENT-TOPIC VALIDATION CHECK")
print("=" * 70)
print("For each topic, read sample documents and assess if the assignment makes sense:\n")

for topic_num in range(num_topics):
    print(f"\nüìå TOPIC {topic_num}")
    print("=" * 50)

    # Get top words for this topic
    topic_words = lda_model.show_topic(topic_num, 8)
    word_list = [word for word, prob in topic_words]
    print(f"Keywords: {', '.join(word_list)}")

    # Get sample documents from this topic
    topic_docs = df[df['dominant_topic'] == topic_num]

    if len(topic_docs) == 0:
        print("No documents assigned to this topic.")
        continue

    print(f"\nDocuments in this topic: {len(topic_docs)}")
    print(f"\nSample documents (read and assess if topic assignment makes sense):\n")

    for i, (idx, row) in enumerate(topic_docs.head(3).iterrows(), 1):
        print(f"  {i}. {row[text_column][:150]}...")
        print(f"     Probability: {row['topic_probability']:.3f}")
        print()

# Final Insights

Overall, the YouTube comments reveal that viewers experience Saltburn as a film that provokes intense emotional, psychological, and narrative debate, and the seven discovered topics help show how those conversations cluster. Many comments focus on the complicated relationship between Oliver and Felix (Topic 6), with viewers arguing over whether Oliver‚Äôs actions stemmed from genuine love, obsessive admiration, or calculated manipulation, making this the most dominant theme in the dataset. A substantial portion of the discussion also centers on character morality and psychology, particularly whether Oliver is best understood as a psychopath, sociopath, or narcissist, which appears in both Topic 3‚Äôs spoiler-filled analysis and Topic 5‚Äôs mixture of diagnostic labeling and praise for Barry Keoghan‚Äôs performance. Commenters frequently reinterpret the plot and question Oliver‚Äôs long-term intentions (Topic 2), debating whether his final success was the result of careful planning or opportunistic improvisation. Other viewers compare family dynamics, betrayal, and character parallels to outside narratives like Devilman Crybaby (Topic 0), suggesting that the movie resonates with broader cultural stories about ambition and destruction. Reactions to symbolism, key scenes, and directorial choices, especially the mansion setting, Venetia‚Äôs role, and the film‚Äôs more shocking visual moments are captured in Topic 4, while Topic 1 shows that many viewers mix emotional responses with film critique, commenting on themes such as loss, social ambition, and the believability of Oliver‚Äôs rise. Taken together, the comments portray Saltburn as a movie that audiences find morally unsettling, psychologically fascinating, and narratively ambiguous, encouraging viewers to dissect motives, scenes, performances, and broader themes long after the credits roll.