In [1]:
import os
import string
import unicodedata
import requests
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk import pos_tag
from nltk.util import ngrams
from collections import Counter, defaultdict
from itertools import product, combinations


In [2]:
# Base English stopword list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Fetch stopwords
def fetch_stopwords_from_github(url, timeout=30):
    """Download a newline-delimited stopword list and return it as a set.

    Args:
        url: Address of a plain-text file with one stopword per line.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Set of the non-empty, whitespace-stripped lines of the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Without this check the body of an error page (e.g. a 404) would be split
    # into lines and silently treated as stopwords.
    response.raise_for_status()
    return {line.strip() for line in response.text.splitlines() if line.strip()}

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

# Project-specific stopwords, one per line. Blank lines and surrounding
# whitespace are dropped so that padded entries still match tokens.
with open("stop_words.txt", "r", encoding="utf-8") as file:
    custom_stop_words = {line.strip() for line in file if line.strip()}

# `stop_words` already holds NLTK's English list (built at the top of this
# cell); extend it in place instead of rebuilding it a second time.
stop_words.update(custom_stop_words, github_stopwords)

# Folder paths (relative to the notebook's working directory)
#transcripts_folder_path = './Previous_THS-ST2_Files/standard_dataset_old/'
# Folder that is scanned for transcript files ending in ".txt".
transcripts_folder_path = 'final_transcripts/'

# Folder passed to load_video_tags(), which reads "<video_id>.txt" from it.
tags_folder_path = 'video_tags/'
# Folder passed to load_video_titles().
titles_folder_path = 'video_titles/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    """Read the tag file of every requested video.

    For each ID the file ``<folder_path>/<video_id>.txt`` is looked up; its
    content is lower-cased and split on whitespace. IDs without a tag file
    map to an empty list.

    Args:
        folder_path: Directory holding the tag files.
        video_ids: Iterable of video IDs to load.

    Returns:
        dict mapping each video ID to its list of tag words.
    """
    tags_by_video = {}
    for vid in video_ids:
        path = os.path.join(folder_path, f"{vid}.txt")
        if not os.path.exists(path):
            # No tag file: fall back to an empty tag list.
            tags_by_video[vid] = []
            continue
        with open(path, "r", encoding="utf-8") as handle:
            tags_by_video[vid] = handle.read().lower().split()
    return tags_by_video
# Function to load video titles
def load_video_titles(folder_path, video_ids):
    """Read the title of every requested video.

    The title is looked up first as ``<folder_path>/<video_id>`` (the layout
    this function has always read) and then as ``<folder_path>/<video_id>.txt``,
    which is the naming used by the tag files. The whole file content,
    stripped of surrounding whitespace, is used as the title.

    Args:
        folder_path: Directory holding the title files.
        video_ids: Iterable of video IDs to load.

    Returns:
        dict mapping each video ID to its title, or to "Unknown Title" when
        no title file is found.
    """
    video_titles = {}
    for video_id in video_ids:
        candidates = (
            os.path.join(folder_path, f"{video_id}"),
            os.path.join(folder_path, f"{video_id}.txt"),
        )
        # isfile() rather than exists(): a directory of the same name must not
        # be opened as if it were a title file.
        title_file = next((path for path in candidates if os.path.isfile(path)), None)
        if title_file is None:
            video_titles[video_id] = "Unknown Title"  # Default if no title file
            continue
        with open(title_file, "r", encoding="utf-8") as file:
            video_titles[video_id] = file.read().strip()  # Read full title
    return video_titles

# Collect every transcript file together with the video ID derived from its name.
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the dictionary ids and the seeded LDA runs) reproducible.
video_ids = []
transcript_files = []
for file_name in sorted(os.listdir(transcripts_folder_path)):
    if file_name.endswith('.txt'):
        # Drop the extension before cutting at "_captions", so a file without
        # that suffix yields "abc" instead of "abc.txt".
        video_id = os.path.splitext(file_name)[0].split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name))

video_tags = load_video_tags(tags_folder_path, video_ids)
video_titles = load_video_titles(titles_folder_path, video_ids)

In [3]:
def is_latin_script(word):
    """Return True when every character is a digit or has 'LATIN' in its Unicode name.

    Characters without a Unicode name are treated as non-Latin. An empty
    string yields True.
    """
    for char in word:
        if char.isdigit():
            continue
        if 'LATIN' not in unicodedata.name(char, ''):
            return False
    return True

# Function to detect both bigram and trigram collocations
def detect_collocations(tokens, min_freq=2):
    """Find the best-scoring bigram and trigram collocations in a token list.

    N-grams seen fewer than ``min_freq`` times are discarded; of the rest, the
    ten with the highest PMI score are kept for each n-gram size.

    Args:
        tokens: Sequence of word tokens.
        min_freq: Minimum number of occurrences an n-gram needs to qualify.

    Returns:
        Tuple ``(bigrams, trigrams)`` of sets whose members are the n-gram
        words joined with underscores.
    """
    def _top_joined(finder, score_fn):
        # Apply the frequency cut-off, then keep the 10 best n-grams as "w1_w2[_w3]".
        finder.apply_freq_filter(min_freq)
        return {'_'.join(gram) for gram in finder.nbest(score_fn, 10)}

    bigrams = _top_joined(BigramCollocationFinder.from_words(tokens), BigramAssocMeasures().pmi)
    trigrams = _top_joined(TrigramCollocationFinder.from_words(tokens), TrigramAssocMeasures().pmi)
    return bigrams, trigrams

def is_valid_ngram(ngram, existing_ngrams):
    """Reject n-grams built from repeated words or already covered by another n-gram.

    Args:
        ngram: Underscore-joined n-gram, e.g. ``"machine_learning"``.
        existing_ngrams: Iterable of n-gram strings; consulted only when
            ``ngram`` has exactly two words.

    Returns:
        False when all words are identical, when the first word of a longer
        n-gram repeats in position two or three, when the last two words of a
        trigram are equal, or when a bigram is a substring of any entry in
        ``existing_ngrams``. True otherwise.
    """
    parts = ngram.split('_')
    size = len(parts)

    # "a", "a_a", "a_a_a", ...
    if len(set(parts)) == 1:
        return False

    # First word repeated right after itself or one word later.
    if size > 2 and parts[0] in (parts[1], parts[2]):
        return False

    # Trigram ending in a doubled word.
    if size == 3 and parts[1] == parts[2]:
        return False

    # Bigram that is contained in an already known n-gram.
    if size == 2 and any(ngram in other for other in existing_ngrams):
        return False

    return True

def preprocess_text(doc, video_id, tag_weight=2, ngram_weight_factor=3):
    """Turn a raw transcript into a weighted, space-joined token string.

    Pipeline: strip punctuation and lowercase, tokenize, drop stopwords and
    non-alphabetic / non-Latin tokens, drop verbs, adjectives and adverbs by
    POS tag, append the video's cleaned tags (read from the module-level
    ``video_tags``), detect PMI collocations, merge adjacent tokens that form
    a kept bigram/trigram into one underscore-joined token, de-duplicate, and
    repeat n-gram tokens according to their weight.

    Args:
        doc: Raw transcript text.
        video_id: Key used to look up the video's tags in ``video_tags``.
        tag_weight: Currently unused (the separate tag-weighting step is
            disabled); kept so existing callers keep working.
        ngram_weight_factor: Weight multiplier for bigrams and additive bonus
            for trigrams.

    Returns:
        Tuple ``(text, ngrams)``: the weighted tokens joined by spaces, and
        the list of n-gram tokens that were actually merged, both in order of
        first appearance.
    """
    # Clean punctuation at the end of words
    doc = re.sub(r'([a-zA-Z]+)[,;:!?.]', r'\1', doc)

    # Lowercase and remove punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    doc = doc.lower().translate(punctuation_table)
    tokens = word_tokenize(doc)

    # Remove stopwords & non-latin words
    tokens = [word for word in tokens if word not in stop_words and word.isalpha() and is_latin_script(word)]

    # POS tagging
    tokens_with_pos = pos_tag(tokens)

    # Remove verbs, adjectives and adverbs before n-gram detection
    pos_exclude = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
    filtered_tokens = [word for word, pos in tokens_with_pos if pos not in pos_exclude]

    # Include video tags in filtered tokens
    if video_id in video_tags:
        cleaned_tags = []
        for tag in video_tags[video_id]:
            tag = re.sub(r'^[,;:!?.\'\"]*([a-zA-Z]+)[,;:!?.\'\"]*$', r'\1', tag)
            tag = tag.lower().translate(punctuation_table)
            if tag.isalpha() and tag not in stop_words and is_latin_script(tag):
                cleaned_tags.append(tag)
        filtered_tokens.extend(cleaned_tags)

    # Detect meaningful bigram and trigram collocations (tags are included)
    bigrams, trigrams = detect_collocations(filtered_tokens)

    # Generate n-grams
    bigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 2)]
    trigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 3)]

    # Remove invalid n-grams (duplicates, alternating patterns)
    bigram_tokens = [bigram for bigram in bigram_tokens if is_valid_ngram(bigram, set())]
    trigram_tokens = [trigram for trigram in trigram_tokens if is_valid_ngram(trigram, bigram_tokens)]

    # Keep only meaningful n-grams from detect_collocations()
    bigram_tokens = [bigram for bigram in bigram_tokens if bigram in bigrams]
    trigram_tokens = [trigram for trigram in trigram_tokens if trigram in trigrams]

    # Remove n-grams that are just reordered versions of one already kept
    unique_ngrams = set()
    filtered_bigrams = []
    filtered_trigrams = []

    for bigram in bigram_tokens:
        sorted_bigram = '_'.join(sorted(bigram.split('_')))
        if sorted_bigram not in unique_ngrams:
            unique_ngrams.add(sorted_bigram)
            filtered_bigrams.append(bigram)

    for trigram in trigram_tokens:
        sorted_trigram = '_'.join(sorted(trigram.split('_')))
        if sorted_trigram not in unique_ngrams:
            unique_ngrams.add(sorted_trigram)
            filtered_trigrams.append(trigram)

    # Count n-gram frequency.
    # NOTE(review): the lists above hold each n-gram at most once, so every
    # count here is 1 (trigram weight 2 + factor, bigram weight = factor).
    # Confirm whether true occurrence counts were intended.
    bigram_frequencies = Counter(filtered_bigrams)
    trigram_frequencies = Counter(filtered_trigrams)

    # Merge n-grams into single tokens. The scan runs to the end of the list
    # so that a bigram formed by the final two tokens is merged as well.
    bigram_lookup = set(filtered_bigrams)
    trigram_lookup = set(filtered_trigrams)
    merged_ngrams = {}  # dict used as an insertion-ordered set
    merged_tokens = []
    token_count = len(filtered_tokens)
    i = 0
    while i < token_count:
        window = filtered_tokens[i:i + 3]
        trigram = '_'.join(window)
        bigram = '_'.join(window[:2])

        if len(window) == 3 and trigram in trigram_lookup:  # Check for trigrams first
            merged_tokens.append(trigram)
            merged_ngrams[trigram] = None
            i += 3  # Skip next two words since they are part of the trigram
        elif len(window) >= 2 and bigram in bigram_lookup:
            merged_tokens.append(bigram)
            merged_ngrams[bigram] = None
            i += 2  # Skip next word since it is part of the bigram
        else:
            merged_tokens.append(filtered_tokens[i])
            i += 1

    # Remove duplicates before assigning weight. dict.fromkeys keeps the order
    # of first appearance; a plain set() would yield a hash-dependent order
    # that differs between kernel sessions and makes the LDA runs
    # irreproducible despite a fixed random_state.
    unique_tokens = list(dict.fromkeys(merged_tokens))

    # Assign weight based on n-gram occurrence
    weighted_tokens = []
    for token in unique_tokens:
        if token in trigram_frequencies:
            token_weight = trigram_frequencies[token] * 2 + ngram_weight_factor
        elif token in bigram_frequencies:
            token_weight = bigram_frequencies[token] * ngram_weight_factor
        else:
            token_weight = 1
        weighted_tokens.extend([token] * int(token_weight))

    # Tags are not weighted separately: they were already appended to
    # filtered_tokens above, and `tag_weight` is intentionally left unused.

    return ' '.join(weighted_tokens), list(merged_ngrams)

def topic_diversity(model, top_n=10):
    """Share of distinct words among the top ``top_n`` words of all topics.

    Args:
        model: Object exposing ``num_topics`` and ``show_topic(topic_id, n)``
            returning ``(word, weight)`` pairs.
        top_n: Number of top words requested per topic.

    Returns:
        ``len(distinct words) / (top_n * number of topics)``.
    """
    distinct_words = set()
    topic_total = 0
    for topic_id in range(model.num_topics):
        distinct_words.update(word for word, _ in model.show_topic(topic_id, top_n))
        topic_total += 1
    return len(distinct_words) / (top_n * topic_total)

def jaccard_similarity(topic1, topic2):
    """Jaccard index |A ∩ B| / |A ∪ B| of two word collections.

    Args:
        topic1: Iterable of hashable items (e.g. topic words).
        topic2: Iterable of hashable items.

    Returns:
        Float in [0, 1]. Two empty collections give 0.0 instead of raising
        ZeroDivisionError.
    """
    first, second = set(topic1), set(topic2)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)

def avg_jaccard_similarity(model, num_words=10):
    """Mean pairwise Jaccard similarity between the top words of all topics.

    Args:
        model: Object exposing ``num_topics`` and ``show_topic(topic_id, n)``
            returning ``(word, weight)`` pairs.
        num_words: Number of top words taken from each topic.

    Returns:
        Average of ``jaccard_similarity`` over every unordered topic pair, or
        0.0 when the model has fewer than two topics (no pairs to compare).
    """
    topics = [
        {word for word, _ in model.show_topic(topic_id, num_words)}
        for topic_id in range(model.num_topics)
    ]
    similarities = [jaccard_similarity(t1, t2) for t1, t2 in combinations(topics, 2)]
    if not similarities:
        # A single topic (or none) has no pairs; avoid dividing by zero.
        return 0.0
    return sum(similarities) / len(similarities)

In [58]:
# Transcripts with fewer whitespace-separated words than this are skipped.
MIN_TRANSCRIPT_WORDS = 100
# Single-word tokens occurring more often than this across all documents are
# removed; n-gram tokens (containing "_") are always kept.
HIGH_FREQ_THRESHOLD = 200

all_documents = []
preprocessed_text = []
bigram_trigram_text = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
    if len(content.split()) < MIN_TRANSCRIPT_WORDS:
        continue
    processed_text, bigram_trigram = preprocess_text(content, video_id)
    preprocessed_text.append((video_id, processed_text))
    all_documents.append(processed_text)
    bigram_trigram_text[video_id] = bigram_trigram

all_tokens = [token for doc in all_documents for token in doc.split()]
token_freq = Counter(all_tokens)

high_freq_tokens = {
    token for token, freq in token_freq.items()
    if freq > HIGH_FREQ_THRESHOLD and '_' not in token
}

# Filter once and derive every view from the same result, so the corpus rows
# and the (video_id, tokens) pairs cannot drift apart.
filtered_documents_with_id = [
    (video_id, [token for token in doc.split() if token not in high_freq_tokens])
    for video_id, doc in preprocessed_text
]

filtered_documents_only = [doc for _, doc in filtered_documents_with_id]
filtered_documents = list(filtered_documents_only)

dictionary = corpora.Dictionary(filtered_documents)
corpus = [dictionary.doc2bow(doc) for doc in filtered_documents]

In [None]:
# Show the weighted token string produced for every processed video.
print("\nPreprocessed Text Per Video:")
for video_id, processed_text in preprocessed_text:
    video_title = video_titles.get(video_id, "Unknown Title")
    header_line = f"\nVideo ID: {video_id} | {video_title}"
    print(f"{header_line}\n- {processed_text}\n")

In [None]:
# List the removed high-frequency tokens, most frequent first.
sorted_high_freq_tokens = sorted(high_freq_tokens, key=token_freq.__getitem__, reverse=True)
for token in sorted_high_freq_tokens:
    print(f"{token}:", token_freq[token])


In [None]:
# Collection frequency of every token that survived filtering, keyed by the token text.
token_freq_dict = {dictionary[token_id]: freq for token_id, freq in dictionary.cfs.items()}

# One constant drives both the heading and the slice, so they cannot disagree
# (the heading used to say 100 while 1000 rows were printed).
top_n_tokens = 1000

print(f"Top {top_n_tokens} Most Frequent Tokens:")
sorted_tokens = sorted(token_freq_dict.items(), key=lambda x: x[1], reverse=True)
for token, freq in sorted_tokens[:top_n_tokens]:
    print(f"{token}: {freq}")


In [None]:
# For every transcript file found, show the n-grams merged during preprocessing.
for n in video_ids:
    video_title = video_titles.get(n, "Unknown Title")
    print("Video ID: ", n, video_title)

    # Videos skipped during preprocessing have no entry.
    if n not in bigram_trigram_text:
        print("No bigram/trigram data available for this video.")
    else:
        print(str(bigram_trigram_text[n]))

    print()

In [8]:
# # Train LDA Model
# # lda_model_12 = LdaModel(corpus, num_topics=20, id2word=dictionary, alpha='auto', eta='auto', passes=100, random_state=42)

# # # Compute Coherence Score
# # coherence_model = CoherenceModel(model=lda_model_12, corpus=corpus, dictionary=dictionary, coherence='u_mass')
# # coherence_score = coherence_model.get_coherence()
# # print(f"Coherence Score: {coherence_score}")

# num_topics_range = [15, 20, 25, 30] 
# alpha_values = ['symmetric', 'asymmetric', 0.01, 0.1, 0.5, 'auto']
# eta_values = ['symmetric', 0.01, 0.1, 0.5, 'auto']
# results = []

# # Grid Search
# for num_topics, alpha, eta in product(num_topics_range, alpha_values, eta_values):
#     print(f"Training LDA Model with num_topics={num_topics}, alpha={alpha}, eta={eta}...")
    
#     # Train LDA Model
#     lda_model = LdaModel(
#         corpus=corpus,
#         id2word=dictionary,
#         num_topics=num_topics,
#         alpha=alpha,
#         eta=eta,
#         passes=50,  
#         random_state=42
#     )

#     # Compute Coherence Scores
#     coherence_c_v = CoherenceModel(model=lda_model, texts=filtered_documents, dictionary=dictionary, coherence='c_v').get_coherence()
#     coherence_u_mass = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass').get_coherence()
#     coherence_npmi = CoherenceModel(model=lda_model, texts=filtered_documents, dictionary=dictionary, coherence='c_npmi').get_coherence()

#     # Compute Topic Diversity
#     diversity_score = topic_diversity(lda_model)

#     # Compute Jaccard Similarity
#     jaccard_score = avg_jaccard_similarity(lda_model)

#     # Store results
#     results.append({
#         "num_topics": num_topics,
#         "alpha": alpha,
#         "eta": eta,
#         "coherence_c_v": coherence_c_v,
#         "coherence_u_mass": coherence_u_mass,
#         "coherence_npmi": coherence_npmi,
#         "topic_diversity": diversity_score,
#         "jaccard_similarity": jaccard_score
#     })

# df_results = pd.DataFrame(results)

# # Sort by best coherence and diversity balance
# df_sorted = df_results.sort_values(by=["coherence_c_v", "topic_diversity"], ascending=[False, False])
# print(df_sorted.head(10))  # Top 10 models

# # Print Topics
# # topics = lda_model_12.print_topics(num_words=100)
# # for topic_id, topic_words in topics:
# #     print(f"Topic {topic_id}: {topic_words}")


In [9]:
# df_results.to_csv("lda_hyperparameter_tuning_results.csv", index=False)

In [10]:
# # Find the best parameters per number of topics
# best_params_per_topic = df_results.loc[
#     df_results.groupby("num_topics")["coherence_c_v"].idxmax()
# ]

# # Sort by num_topics for clarity
# best_params_per_topic = best_params_per_topic.sort_values(by="num_topics")

# print("Best parameters per number of topics:")
# print(best_params_per_topic)


In [None]:
# Train the 30-topic model with fixed symmetric priors and a fixed seed.
lda_model_30 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=30,
    alpha=0.1,
    eta=0.1,
    passes=50,
    random_state=42,
)

# Compute Coherence Scores (c_v and c_npmi need the token lists, u_mass the BoW corpus)
coherence_args_30 = dict(model=lda_model_30, dictionary=dictionary)
coherence_c_v = CoherenceModel(texts=filtered_documents, coherence='c_v', **coherence_args_30).get_coherence()
coherence_u_mass = CoherenceModel(corpus=corpus, coherence='u_mass', **coherence_args_30).get_coherence()
coherence_npmi = CoherenceModel(texts=filtered_documents, coherence='c_npmi', **coherence_args_30).get_coherence()

# Compute Topic Diversity
diversity_score = topic_diversity(lda_model_30)

# Compute Jaccard Similarity
jaccard_score = avg_jaccard_similarity(lda_model_30)

for metric_label, metric_value in (
    ("Coherence Score c_v", coherence_c_v),
    ("Coherence Score u_mass", coherence_u_mass),
    ("Coherence Score npmi", coherence_npmi),
    ("Topic Diversity", diversity_score),
    ("Jaccard Similarity", jaccard_score),
):
    print(f"{metric_label}: {metric_value}")

In [None]:
topic_to_videos = defaultdict(list)

video_topic_mapping_30 = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1_30 = {}

# Top-100 word string per topic, computed once instead of once per video.
topic_words_cache_30 = {}

# `corpus` only contains transcripts that passed the minimum-length filter,
# whereas `video_ids` lists every transcript file. Indexing `video_ids` by
# corpus position would label documents with the wrong video as soon as one
# transcript is skipped, so the IDs are taken from `filtered_documents_with_id`,
# which is built row-for-row alongside the corpus.
for (video_id, _), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_30[video_id] = assigned_topics  # Store assigned topics per video

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)
        if topic not in topic_words_cache_30:
            topic_words_cache_30[topic] = ", ".join(
                word for word, _ in lda_model_30.show_topic(topic, topn=100)  # top 100 words
            )
        topic_words.append(topic_words_cache_30[topic])

    # Store the topic words as a string
    video_topic_words_LDA1_30[video_id] = "; ".join(topic_words)

# Count occurrences of each topic
topic_counts_30 = Counter(
    topic for topics in video_topic_mapping_30.values() for topic in topics
)

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_30.items()):
    print(f"Topic {topic}: {count} videos")

In [None]:
# Per video: topic id -> words shared by the topic's top-100 list and the video's filtered tokens
video_topic_to_preprocessed_text = defaultdict(dict)

# Walk the corpus and its (video_id, tokens) pairs in lock-step
for (video_id, preprocessed_tokens), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    token_set = set(preprocessed_tokens)

    for topic in assigned_topics:
        # Top 100 words of the topic
        topic_words = {word for word, _ in lda_model_30.show_topic(topic, topn=100)}

        # Keep only the words that also occur in this video
        overlapping_words = topic_words & token_set
        video_topic_to_preprocessed_text[video_id][topic] = list(overlapping_words)

# Show the overlap found for every video that has at least one assigned topic
for video_id, topic_mapping in video_topic_to_preprocessed_text.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic in topic_mapping:
        print(f"  Topic {topic}: {topic_mapping[topic]}")

In [None]:
# Dictionary to store the probability distribution of topics for each video
video_topic_probabilities = {}

# Every structure filled by the loop below is rebuilt from scratch. Appending
# to containers left over from an earlier cell (or an earlier run of this one)
# listed each video several times under its topics and kept stale keys.
topic_to_videos = defaultdict(list)
video_topic_mapping_30 = {}
video_topic_words_LDA1_30 = {}
video_topic_to_preprocessed_text = defaultdict(dict)

# Top-100 words per topic, looked up once instead of twice per video.
top_words_by_topic_30 = {}

# Iterate through the corpus together with the matching (video_id, tokens) pairs
for (video_id, preprocessed_tokens), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]

    # Store assigned topics for the video
    video_topic_mapping_30[video_id] = assigned_topics

    # Store the full topic probability distribution for the video
    video_topic_probabilities[video_id] = topic_distribution

    token_set = set(preprocessed_tokens)
    topic_words = []
    for topic in assigned_topics:
        # Map videos to topics
        topic_to_videos[topic].append(video_id)

        if topic not in top_words_by_topic_30:
            top_words_by_topic_30[topic] = [
                word for word, _ in lda_model_30.show_topic(topic, topn=100)  # top 100 words
            ]
        words = top_words_by_topic_30[topic]

        # Representative words of the topic as one string
        topic_words.append(", ".join(words))

        # Topic words that also occur in the video's filtered tokens
        video_topic_to_preprocessed_text[video_id][topic] = list(set(words) & token_set)

    # Store the topic words as a string
    video_topic_words_LDA1_30[video_id] = "; ".join(topic_words)

# Count occurrences of each topic
topic_counts_30 = Counter(
    topic for topics in video_topic_mapping_30.values() for topic in topics
)


In [None]:
# Report how many videos were assigned to each topic, in topic-id order
print("\nNumber of Videos Per Topic:")
for topic in sorted(topic_counts_30):
    print(f"Topic {topic}: {topic_counts_30[topic]} videos")


In [None]:
# For each video, list its assigned topics with their probability and the
# overlapping topic words with their per-topic word probability
print("\nMapping of Topic Words to Preprocessed Text (with Topic and Word Probabilities):")
for video_id, topic_mapping in video_topic_to_preprocessed_text.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")

    # Topic id -> probability for the current video
    topic_distribution = video_topic_probabilities[video_id]
    topic_prob_lookup = dict(topic_distribution)

    for topic, words in topic_mapping.items():
        # Probability of this topic for the video (0.0 if absent)
        topic_prob = topic_prob_lookup.get(topic, 0.0)
        print(f"  Topic {topic} (Probability: {topic_prob:.4f}):")

        # Word -> probability for the topic's top 100 words
        word_prob_dict = dict(lda_model_30.show_topic(topic, topn=100))

        # Only overlapping words that are among those top words are printed
        for word in words:
            if word in word_prob_dict:
                print(f"    - {word}: {word_prob_dict[word]:.4f}")

In [None]:
# Print the top 50 words of every topic that has at least one video assigned
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos.keys()):
    words_str = ', '.join(word for word, _ in lda_model_30.show_topic(topic_id, 50))
    print(f"Topic {topic_id}: {words_str}")

In [None]:
# Print the videos under each topic in the desired format
print("\nVideos Under Each Topic:")
for topic, videos in sorted(topic_to_videos.items()):
    # Remove duplicate video IDs while keeping their original order.
    # set() would print the videos in a different order on every kernel start.
    unique_videos = list(dict.fromkeys(videos))

    # Get the number of unique videos for the current topic
    num_videos = len(unique_videos)
    print(f"Topic {topic} ({num_videos} videos):")

    for video_id in unique_videos:
        video_title = video_titles.get(video_id, "Unknown Title")  # Get the title or default to "Unknown Title"
        print(f"  - {video_id} | {video_title}")


In [None]:
# Show the full topic probability distribution of every video, ordered by topic id
print("\nProbability Distribution of Topics Per Video:")
for video_id in video_topic_probabilities:
    topic_distribution = video_topic_probabilities[video_id]
    video_title = video_titles.get(video_id, "Unknown Title")  # fall back when no title is known
    print(f"Video ID: {video_id} | {video_title}")
    for topic, prob in sorted(topic_distribution, key=lambda pair: pair[0]):
        print(f"  - Topic {topic}: {prob:.4f}")

In [None]:
# List, per video, the topic words that also occur in its preprocessed text
print("\nMapping of Topic Words to Preprocessed Text:")
for video_id in video_topic_to_preprocessed_text:
    print(f"Video ID: {video_id}")
    topic_mapping = video_topic_to_preprocessed_text[video_id]
    for topic, words in topic_mapping.items():
        print(f"  Topic {topic}: {words}")


In [None]:
# Print the top 50 words of each populated topic together with their probabilities
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos.keys()):
    formatted_words = []
    for word, prob in lda_model_30.show_topic(topic_id, 50):
        formatted_words.append(f"{word} ({prob:.4f})")
    words_str = ', '.join(formatted_words)
    print(f"Topic {topic_id}: {words_str}")


In [None]:
import random

print("\nVideos Assigned Per Topic:")
for topic, video_list in sorted(topic_to_videos.items()):
    print(f"\nTopic {topic}:")

    # A video can be listed more than once under a topic when the assignment
    # loops were run repeatedly; de-duplicate (keeping order) so the sample
    # never shows the same video twice.
    distinct_videos = list(dict.fromkeys(video_list))

    # Select up to 10 random videos for this topic
    random_videos = random.sample(distinct_videos, min(10, len(distinct_videos)))

    for video in random_videos:
        video_title = video_titles.get(video, "Unknown Title")
        print(f"{video} | {video_title}")


In [None]:
# Draw up to 10 videos at random and show the topics assigned to each
sample_size_30 = min(10, len(video_topic_mapping_30))
random_video_ids = random.sample(list(video_topic_mapping_30.keys()), sample_size_30)

print("\nTopics Assigned Per 10 Random Videos:")
for video_id in random_video_ids:
    if video_topic_mapping_30[video_id]:
        topic_list = ', '.join(map(str, video_topic_mapping_30[video_id]))
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")


In [None]:
# Build the per-video word contributions for the 30-topic model. No visible
# cell defines `topic_word_contributions_30`, and `video_topic_mapping_30`
# holds plain lists of topic ids (no probabilities), so both lookups this
# cell relied on failed. The structures are derived here directly from the
# model instead.
topic_word_contributions_30 = {}   # video_id -> {topic: [(word, weight), ...]}
video_topic_probs_30 = {}          # video_id -> {topic: probability}
topic_top_words_cache_30 = {}      # topic -> {word: weight}, computed once per topic

for (video_id, _), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_probs_30[video_id] = assigned_topics

    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}  # Convert doc_bow IDs to words
    topic_word_contributions_30[video_id] = {}
    for topic in assigned_topics:
        if topic not in topic_top_words_cache_30:
            topic_top_words_cache_30[topic] = dict(lda_model_30.show_topic(topic, topn=2000))
        topic_top_words = topic_top_words_cache_30[topic]
        topic_word_contributions_30[video_id][topic] = [
            (word, topic_top_words[word]) for word in bow_words if word in topic_top_words
        ]

# Select 10 random videos for word contributions
random_contrib_videos = random.sample(list(topic_word_contributions_30.keys()), min(10, len(topic_word_contributions_30)))

print("\nWords Contributing to Topics Per 10 Random Videos:")
for video_id in random_contrib_videos:
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")

    for topic, word_scores in topic_word_contributions_30[video_id].items():
        # word_scores is a list of (word, weight) tuples; strongest words first
        sorted_words = sorted(word_scores, key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])

        topic_prob = video_topic_probs_30[video_id].get(topic, 0)  # Get probability safely

        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")


In [None]:
# Train the 25-topic model with an asymmetric document-topic prior and a fixed seed.
lda_model_25 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=25,
    alpha='asymmetric',
    eta=0.1,
    passes=50,
    random_state=42,
)

# Compute Coherence Scores (c_v and c_npmi need the token lists, u_mass the BoW corpus)
coherence_args_25 = dict(model=lda_model_25, dictionary=dictionary)
coherence_c_v = CoherenceModel(texts=filtered_documents, coherence='c_v', **coherence_args_25).get_coherence()
coherence_u_mass = CoherenceModel(corpus=corpus, coherence='u_mass', **coherence_args_25).get_coherence()
coherence_npmi = CoherenceModel(texts=filtered_documents, coherence='c_npmi', **coherence_args_25).get_coherence()

# Compute Topic Diversity
diversity_score = topic_diversity(lda_model_25)

# Compute Jaccard Similarity
jaccard_score = avg_jaccard_similarity(lda_model_25)

for metric_label, metric_value in (
    ("Coherence Score c_v", coherence_c_v),
    ("Coherence Score u_mass", coherence_u_mass),
    ("Coherence Score npmi", coherence_npmi),
    ("Topic Diversity", diversity_score),
    ("Jaccard Similarity", jaccard_score),
):
    print(f"{metric_label}: {metric_value}")

topic_to_videos_25 = defaultdict(list)

video_topic_mapping_25 = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1_25 = {}

# Top-50 word string per topic, computed once instead of once per video.
topic_words_cache_25 = {}

# `corpus` omits transcripts that failed the minimum-length filter while
# `video_ids` lists every transcript file, so `video_ids[idx]` can point at
# the wrong video. The IDs come from `filtered_documents_with_id`, which is
# aligned row-for-row with the corpus.
for (video_id, _), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_25.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_25[video_id] = assigned_topics  # Store assigned topics per video

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        topic_to_videos_25[topic].append(video_id)
        if topic not in topic_words_cache_25:
            topic_words_cache_25[topic] = ", ".join(
                word for word, _ in lda_model_25.show_topic(topic, topn=50)  # top 50 words
            )
        topic_words.append(topic_words_cache_25[topic])

    # Store the topic words as a string (previously computed but never saved)
    video_topic_words_LDA1_25[video_id] = "; ".join(topic_words)

# Count occurrences of each topic
topic_counts_25 = Counter(
    topic for topics in video_topic_mapping_25.values() for topic in topics
)

# Print the top words of every topic that has at least one video
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos_25.keys()):
    top_words = lda_model_25.show_topic(topic_id, 50)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_25.items()):
    print(f"Topic {topic}: {count} videos")


In [None]:
# List the topics assigned to every video by the 25-topic model
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_25.items():
    if topics:
        topic_list = ', '.join(map(str, topics))
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")

In [None]:
# List every video assigned to each topic of the 25-topic model
print("\nVideos Assigned Per Topic:")
for topic in sorted(topic_to_videos_25):
    video_list = topic_to_videos_25[topic]
    print(f"\nTopic {topic}:")
    for video in video_list:
        video_title = video_titles.get(video, "Unknown Title")
        print(f"\n{video} | {video_title}")


In [24]:
topic_to_videos_25 = defaultdict(list)
video_topic_mapping_25 = {}
prob_threshold = 0.2
video_topic_words_LDA1_25 = defaultdict(list)  # Store topics separately
topic_word_contributions_25 = {}

# Word -> weight for the top 2000 words of a topic, fetched once per topic
# rather than once per (video, topic) pair.
topic_top_words_cache_25 = {}

# `corpus` omits transcripts that failed the minimum-length filter while
# `video_ids` lists every transcript file, so `video_ids[idx]` can point at
# the wrong video. The IDs come from `filtered_documents_with_id`, which is
# aligned row-for-row with the corpus.
for (video_id, _), doc_bow in zip(filtered_documents_with_id, corpus):
    topic_distribution = lda_model_25.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics above threshold, keeping their probability
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_25[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos_25[topic].append(video_id)

    # Identify words contributing to each assigned topic
    topic_word_contributions_25[video_id] = defaultdict(dict)
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}  # Convert doc_bow IDs to words

    for topic, prob in assigned_topics.items():
        if topic not in topic_top_words_cache_25:
            topic_top_words_cache_25[topic] = dict(lda_model_25.show_topic(topic, topn=2000))
        topic_top_words = topic_top_words_cache_25[topic]

        # Words of this document that are among the topic's top words
        word_contributions = {
            word: topic_top_words[word] for word in bow_words if word in topic_top_words
        }

        # Store word contributions per topic
        topic_word_contributions_25[video_id][topic] = word_contributions
        topic_words = ", ".join(topic_top_words.keys())
        video_topic_words_LDA1_25[video_id].append(f"Topic {topic}: {topic_words}")

In [None]:
# For every video, list the words supporting each assigned topic, strongest first
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions_25.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
        formatted = [f"{word} ({round(score, 3)})" for word, score in sorted_words]
        top_contributing_words = ", ".join(formatted)
        topic_prob = video_topic_mapping_25[video_id][topic]

        # An empty string means none of the video's words are among the topic's top words
        if not top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")

In [None]:
# Show, for each topic of the 25-topic model, a random sample of at most 10 of
# the videos assigned to it.
# NOTE(review): `random` is not among the imports in the notebook's first cell —
# confirm it is imported in an earlier cell. No seed is set in this cell, so the
# sample may differ between runs.
print("\nVideos Assigned Per Topic:")
for topic in sorted(topic_to_videos_25):
    print(f"\nTopic {topic}:")

    video_list = topic_to_videos_25[topic]
    sample_size = min(10, len(video_list))

    for video in random.sample(video_list, sample_size):
        video_title = video_titles.get(video, "Unknown Title")
        print(f"{video} | {video_title}")


In [None]:
# Draw at most 10 videos at random and list the topics assigned to each
# (25-topic model).
# NOTE(review): `random` is not among the imports in the notebook's first cell —
# confirm it is imported in an earlier cell.
candidate_ids = list(video_topic_mapping_25.keys())
random_video_ids = random.sample(candidate_ids, min(10, len(video_topic_mapping_25)))

print("\nTopics Assigned Per 10 Random Videos:")
for video_id in random_video_ids:
    assigned = video_topic_mapping_25[video_id]
    if assigned:
        topic_list = ', '.join(str(topic_id) for topic_id in assigned)
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")


In [None]:
# Draw at most 10 videos at random and show, per assigned topic (25-topic
# model), the video's words found in that topic, highest weight first.
# NOTE(review): `random` is not among the imports in the notebook's first cell —
# confirm it is imported in an earlier cell.
contrib_candidates = list(topic_word_contributions_25.keys())
random_contrib_videos = random.sample(contrib_candidates, min(10, len(topic_word_contributions_25)))

print("\nWords Contributing to Topics Per 10 Random Videos:")
for video_id in random_contrib_videos:
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")

    for topic, scores in topic_word_contributions_25[video_id].items():
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        top_contributing_words = ", ".join(f"{word} ({round(score, 3)})" for word, score in ranked)
        topic_prob = video_topic_mapping_25[video_id][topic]

        # An empty rendering means no word of the video was recorded for this topic.
        if not top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")
            continue
        print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")


In [None]:

# Train the 20-topic LDA model, report its quality metrics, and assign every
# video to the topics whose probability reaches the threshold.
lda_model_20 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    alpha=0.5,
    eta=0.01,
    passes=50,
    random_state=42,
)

# Coherence scores (c_v and c_npmi are computed from the tokenised texts,
# u_mass from the bag-of-words corpus).
coherence_c_v = CoherenceModel(model=lda_model_20, texts=filtered_documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_u_mass = CoherenceModel(model=lda_model_20, corpus=corpus, dictionary=dictionary, coherence='u_mass').get_coherence()
coherence_npmi = CoherenceModel(model=lda_model_20, texts=filtered_documents, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Topic diversity and average Jaccard similarity (helpers defined elsewhere in the notebook).
diversity_score = topic_diversity(lda_model_20)
jaccard_score = avg_jaccard_similarity(lda_model_20)

print(f"Coherence Score c_v: {coherence_c_v}")
print(f"Coherence Score u_mass: {coherence_u_mass}")
print(f"Coherence Score npmi: {coherence_npmi}")
print(f"Topic Diversity: {diversity_score}")
print(f"Jaccard Similarity: {jaccard_score}")

topic_to_videos_20 = defaultdict(list)   # topic id -> [video ids]
video_topic_mapping_20 = {}              # video id -> [topic ids]

# Probability threshold for assigning (possibly several) topics to a video.
prob_threshold = 0.2

# video id -> one comma-separated string of top-50 words per assigned topic.
video_topic_words_LDA1_20 = {}

# The top words of a topic do not depend on the document, so each topic's
# string is built once and reused.
top_words_cache_20 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_20.get_document_topics(doc_bow, minimum_probability=0)

    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_20[video_id] = assigned_topics

    topic_words = []
    for topic in assigned_topics:
        topic_to_videos_20[topic].append(video_id)
        if topic not in top_words_cache_20:
            top_words_cache_20[topic] = ", ".join(
                word for word, _ in lda_model_20.show_topic(topic, topn=50)
            )
        topic_words.append(top_words_cache_20[topic])

    # Previously the collected words were discarded and this dict stayed empty.
    video_topic_words_LDA1_20[video_id] = topic_words

# Number of videos assigned to each topic.
topic_counts_20 = Counter(
    topic for topics in video_topic_mapping_20.values() for topic in topics
)

# Top words of every topic that received at least one video.
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos_20.keys()):
    # Must query the 20-topic model: topic ids of another model describe different topics.
    top_words = lda_model_20.show_topic(topic_id, 50)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")

print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_20.items()):
    print(f"Topic {topic}: {count} videos")

In [None]:
# Listing 1: the topics assigned to each video (20-topic model).
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_20.items():
    if topics:
        topic_list = ', '.join(str(topic_id) for topic_id in topics)
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")

# Listing 2: the videos assigned to each topic, in ascending topic order.
print("\nVideos Assigned Per Topic:")
for topic in sorted(topic_to_videos_20):
    print(f"\nTopic {topic}:")
    for video in topic_to_videos_20[topic]:
        video_title = video_titles.get(video, "Unknown Title")
        print(f"\n{video} | {video_title}")

In [None]:
# Re-assign each video to the topics of the 20-topic LDA model (this time
# keeping the probabilities) and record which of the video's own words appear
# among each assigned topic's highest-weighted terms; then print the result.
topic_to_videos_20 = defaultdict(list)          # topic id -> [video ids]
video_topic_mapping_20 = {}                     # video id -> {topic id: probability}
prob_threshold = 0.2
video_topic_words_LDA1_20 = defaultdict(list)   # video id -> ["Topic N: w1, w2, ..."]
topic_word_contributions_20 = {}                # video id -> {topic id: {word: weight}}

# show_topic() depends only on the topic, never on the document, so each topic's
# term table and its joined label are built once and reused for every video
# instead of being recomputed for every (video, topic) pair.
topic_terms_cache_20 = {}                       # topic id -> ({word: weight}, "w1, w2, ...")

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_20.get_document_topics(doc_bow, minimum_probability=0)

    # Keep only the topics at or above the probability threshold.
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_20[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos_20[topic].append(video_id)

    topic_word_contributions_20[video_id] = defaultdict(dict)
    # Distinct words of this document (bag-of-words ids mapped back to tokens).
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}

    for topic in assigned_topics:
        if topic not in topic_terms_cache_20:
            # Only the 2000 highest-weighted terms of a topic are considered;
            # document words ranked below that are not recorded.
            cached_terms = dict(lda_model_20.show_topic(topic, topn=2000))
            topic_terms_cache_20[topic] = (cached_terms, ", ".join(cached_terms.keys()))
        topic_top_words, topic_words = topic_terms_cache_20[topic]

        # Words of the document that also appear in the topic's term table.
        word_contributions = {
            word: topic_top_words[word] for word in bow_words if word in topic_top_words
        }

        topic_word_contributions_20[video_id][topic] = word_contributions
        video_topic_words_LDA1_20[video_id].append(f"Topic {topic}: {topic_words}")

# Print contributing words per video, highest weight first.
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions_20.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])
        topic_prob = video_topic_mapping_20[video_id][topic]

        # An empty rendering means no word of the video was recorded for this topic.
        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")

In [None]:

# Train the 15-topic LDA model, report its quality metrics, and assign every
# video to the topics whose probability reaches the threshold.
lda_model_15 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    alpha=0.5,
    eta=0.01,
    passes=50,
    random_state=42,
)

# Coherence scores (c_v and c_npmi are computed from the tokenised texts,
# u_mass from the bag-of-words corpus).
coherence_c_v = CoherenceModel(model=lda_model_15, texts=filtered_documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_u_mass = CoherenceModel(model=lda_model_15, corpus=corpus, dictionary=dictionary, coherence='u_mass').get_coherence()
coherence_npmi = CoherenceModel(model=lda_model_15, texts=filtered_documents, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Topic diversity and average Jaccard similarity (helpers defined elsewhere in the notebook).
diversity_score = topic_diversity(lda_model_15)
jaccard_score = avg_jaccard_similarity(lda_model_15)

print(f"Coherence Score c_v: {coherence_c_v}")
print(f"Coherence Score u_mass: {coherence_u_mass}")
print(f"Coherence Score npmi: {coherence_npmi}")
print(f"Topic Diversity: {diversity_score}")
print(f"Jaccard Similarity: {jaccard_score}")

topic_to_videos_15 = defaultdict(list)   # topic id -> [video ids]
video_topic_mapping_15 = {}              # video id -> [topic ids]

# Probability threshold for assigning (possibly several) topics to a video.
prob_threshold = 0.2

# video id -> one comma-separated string of top-50 words per assigned topic.
video_topic_words_LDA1_15 = {}

# The top words of a topic do not depend on the document, so each topic's
# string is built once and reused.
top_words_cache_15 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_15.get_document_topics(doc_bow, minimum_probability=0)

    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_15[video_id] = assigned_topics

    topic_words = []
    for topic in assigned_topics:
        topic_to_videos_15[topic].append(video_id)
        if topic not in top_words_cache_15:
            top_words_cache_15[topic] = ", ".join(
                word for word, _ in lda_model_15.show_topic(topic, topn=50)
            )
        topic_words.append(top_words_cache_15[topic])

    # Previously the collected words were discarded and this dict stayed empty.
    video_topic_words_LDA1_15[video_id] = topic_words

# Number of videos assigned to each topic.
topic_counts_15 = Counter(
    topic for topics in video_topic_mapping_15.values() for topic in topics
)

# Top words of every topic that received at least one video.
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos_15.keys()):
    top_words = lda_model_15.show_topic(topic_id, 50)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")

print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_15.items()):
    print(f"Topic {topic}: {count} videos")

In [None]:
# Listing 1: the topics assigned to each video (15-topic model).
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_15.items():
    if topics:
        topic_list = ', '.join(str(topic_id) for topic_id in topics)
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")

# Listing 2: the videos assigned to each topic, in ascending topic order.
print("\nVideos Assigned Per Topic:")
for topic in sorted(topic_to_videos_15):
    print(f"\nTopic {topic}:")
    for video in topic_to_videos_15[topic]:
        video_title = video_titles.get(video, "Unknown Title")
        print(f"\n{video} | {video_title}")

In [None]:
# Re-assign each video to the topics of the 15-topic LDA model (this time
# keeping the probabilities) and record which of the video's own words appear
# among each assigned topic's highest-weighted terms; then print the result.
topic_to_videos_15 = defaultdict(list)          # topic id -> [video ids]
video_topic_mapping_15 = {}                     # video id -> {topic id: probability}
prob_threshold = 0.2
video_topic_words_LDA1_15 = defaultdict(list)   # video id -> ["Topic N: w1, w2, ..."]
topic_word_contributions_15 = {}                # video id -> {topic id: {word: weight}}

# show_topic() depends only on the topic, never on the document, so each topic's
# term table and its joined label are built once and reused for every video
# instead of being recomputed for every (video, topic) pair.
topic_terms_cache_15 = {}                       # topic id -> ({word: weight}, "w1, w2, ...")

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_15.get_document_topics(doc_bow, minimum_probability=0)

    # Keep only the topics at or above the probability threshold.
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_15[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos_15[topic].append(video_id)

    topic_word_contributions_15[video_id] = defaultdict(dict)
    # Distinct words of this document (bag-of-words ids mapped back to tokens).
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}

    for topic in assigned_topics:
        if topic not in topic_terms_cache_15:
            # Only the 2000 highest-weighted terms of a topic are considered;
            # document words ranked below that are not recorded.
            cached_terms = dict(lda_model_15.show_topic(topic, topn=2000))
            topic_terms_cache_15[topic] = (cached_terms, ", ".join(cached_terms.keys()))
        topic_top_words, topic_words = topic_terms_cache_15[topic]

        # Words of the document that also appear in the topic's term table.
        word_contributions = {
            word: topic_top_words[word] for word in bow_words if word in topic_top_words
        }

        topic_word_contributions_15[video_id][topic] = word_contributions
        video_topic_words_LDA1_15[video_id].append(f"Topic {topic}: {topic_words}")

# Print contributing words per video, highest weight first.
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions_15.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])
        topic_prob = video_topic_mapping_15[video_id][topic]

        # An empty rendering means no word of the video was recorded for this topic.
        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")

In [None]:
# Train the 10-topic LDA model, report its quality metrics, and assign every
# video to the topics whose probability reaches the threshold.
# NOTE(review): this run learns alpha/eta ('auto') whereas the 15- and 20-topic
# runs in this notebook fix alpha=0.5, eta=0.01 — confirm the difference is intended.
lda_model_10 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    alpha='auto',
    eta='auto',
    passes=50,
    random_state=42,
)

# Coherence scores (c_v and c_npmi are computed from the tokenised texts,
# u_mass from the bag-of-words corpus).
coherence_c_v = CoherenceModel(model=lda_model_10, texts=filtered_documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_u_mass = CoherenceModel(model=lda_model_10, corpus=corpus, dictionary=dictionary, coherence='u_mass').get_coherence()
coherence_npmi = CoherenceModel(model=lda_model_10, texts=filtered_documents, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Topic diversity and average Jaccard similarity (helpers defined elsewhere in the notebook).
diversity_score = topic_diversity(lda_model_10)
jaccard_score = avg_jaccard_similarity(lda_model_10)

print(f"Coherence Score c_v: {coherence_c_v}")
print(f"Coherence Score u_mass: {coherence_u_mass}")
print(f"Coherence Score npmi: {coherence_npmi}")
print(f"Topic Diversity: {diversity_score}")
print(f"Jaccard Similarity: {jaccard_score}")

topic_to_videos_10 = defaultdict(list)   # topic id -> [video ids]
video_topic_mapping_10 = {}              # video id -> [topic ids]

# Probability threshold for assigning (possibly several) topics to a video.
prob_threshold = 0.2

# video id -> one comma-separated string of top-50 words per assigned topic.
video_topic_words_LDA1_10 = {}

# The top words of a topic do not depend on the document, so each topic's
# string is built once and reused.
top_words_cache_10 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_10.get_document_topics(doc_bow, minimum_probability=0)

    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_10[video_id] = assigned_topics

    topic_words = []
    for topic in assigned_topics:
        topic_to_videos_10[topic].append(video_id)
        if topic not in top_words_cache_10:
            top_words_cache_10[topic] = ", ".join(
                word for word, _ in lda_model_10.show_topic(topic, topn=50)
            )
        topic_words.append(top_words_cache_10[topic])

    # Previously the collected words were discarded and this dict stayed empty.
    video_topic_words_LDA1_10[video_id] = topic_words

# Number of videos assigned to each topic.
topic_counts_10 = Counter(
    topic for topics in video_topic_mapping_10.values() for topic in topics
)

# Top words of every topic that received at least one video.
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos_10.keys()):
    top_words = lda_model_10.show_topic(topic_id, 50)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")

print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_10.items()):
    print(f"Topic {topic}: {count} videos")

In [None]:
# Listing 1: the topics assigned to each video (10-topic model).
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_10.items():
    if topics:
        topic_list = ', '.join(str(topic_id) for topic_id in topics)
    else:
        topic_list = "No dominant topic"
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"Video ID: {video_id} | {video_title} → Topics: {topic_list}")

# Listing 2: the videos assigned to each topic, in ascending topic order.
print("\nVideos Assigned Per Topic:")
for topic in sorted(topic_to_videos_10):
    print(f"\nTopic {topic}:")
    for video in topic_to_videos_10[topic]:
        video_title = video_titles.get(video, "Unknown Title")
        print(f"\n{video} | {video_title}")

In [None]:
# Re-assign each video to the topics of the 10-topic LDA model (this time
# keeping the probabilities) and record which of the video's own words appear
# among each assigned topic's highest-weighted terms; then print the result.
topic_to_videos_10 = defaultdict(list)          # topic id -> [video ids]
video_topic_mapping_10 = {}                     # video id -> {topic id: probability}
prob_threshold = 0.2
video_topic_words_LDA1_10 = defaultdict(list)   # video id -> ["Topic N: w1, w2, ..."]
topic_word_contributions_10 = {}                # video id -> {topic id: {word: weight}}

# show_topic() depends only on the topic, never on the document, so each topic's
# term table and its joined label are built once and reused for every video
# instead of being recomputed for every (video, topic) pair.
topic_terms_cache_10 = {}                       # topic id -> ({word: weight}, "w1, w2, ...")

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_10.get_document_topics(doc_bow, minimum_probability=0)

    # Keep only the topics at or above the probability threshold.
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_10[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos_10[topic].append(video_id)

    topic_word_contributions_10[video_id] = defaultdict(dict)
    # Distinct words of this document (bag-of-words ids mapped back to tokens).
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}

    for topic in assigned_topics:
        if topic not in topic_terms_cache_10:
            # Only the 2000 highest-weighted terms of a topic are considered;
            # document words ranked below that are not recorded.
            cached_terms = dict(lda_model_10.show_topic(topic, topn=2000))
            topic_terms_cache_10[topic] = (cached_terms, ", ".join(cached_terms.keys()))
        topic_top_words, topic_words = topic_terms_cache_10[topic]

        # Words of the document that also appear in the topic's term table.
        word_contributions = {
            word: topic_top_words[word] for word in bow_words if word in topic_top_words
        }

        topic_word_contributions_10[video_id][topic] = word_contributions
        # Write into the 10-topic store; the original appended to the 15-topic
        # dict, mixing 10-topic labels into it and leaving this one empty.
        video_topic_words_LDA1_10[video_id].append(f"Topic {topic}: {topic_words}")

# Print contributing words per video, highest weight first.
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions_10.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])
        topic_prob = video_topic_mapping_10[video_id][topic]

        # An empty rendering means no word of the video was recorded for this topic.
        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")