In [1]:
import os
import string
import unicodedata
import requests
import re

from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk import pos_tag
from nltk.util import ngrams
from collections import Counter, defaultdict


In [2]:
stop_words = set(stopwords.words("english"))

# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                    'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                    'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow',
                    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
                    'journalist', 'headline', 'reporter', 'current events', 'special report', 
                    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
                    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
                    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
                    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
                    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
                    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
                    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
                    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
                    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
                    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
                    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
                    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
                    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
                    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
                    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
                    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
                    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
                    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
                    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
                    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
                    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommends', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
                    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo', 'philippines', 'philippine', 'british', 'filipino', 'video', 
                    'http', 'korea', 'korean', 'youtube', 'google', 'united', 'america', 'american', 'kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 
                    'seventeen', 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook', 'ayo', 'favorite', 'ndo', 'baa', 'real', 'woooow', 'kung', 'yay', 'shy', 'kagap',
                    'kao', 'maghoya', 'leiva', 'kimetsu', 'boy', 'subscribers', 'hours', 'minutes', 'seconds', 'hour', 'minute', 'second', 'day', 'days', 'week', 'weeks', 'month', 'months',
                    'hmmm', 'hong', 'kong', 'tomorrow', 'night', 'fawkes', 'bum', 'stuff', 'comments', 'start', 'check', 'bring', 'button', 'yummy', 'guess', 'everythings', 'super', 'enjoy', 'bc', 'lots', 
                    'aaaaah','countries', 'reasons', 'style', 'eat', 'eats', 'taste', 'tastes', 'feel', 'ate', 'price', 'share', 'talk']
stop_words.update(custom_stop_words, github_stopwords)

# Folder paths
transcripts_folder_path = './Previous_THS-ST2_Files/standard_dataset_old/'
tags_folder_path = 'tags/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
        else:
            video_tags[video_id] = []  # Default to empty list if no tags
    return video_tags

video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

In [3]:
def is_latin_script(word):
    return all('LATIN' in unicodedata.name(char, '') or char.isdigit() for char in word)

# Function to detect both bigram and trigram collocations
def detect_collocations(tokens, min_freq=2):
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()

    # Find bigrams
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_finder.apply_freq_filter(min_freq)
    bigrams = set(['_'.join(bigram) for bigram in bigram_finder.nbest(bigram_measures.pmi, 10)])

    # Find trigrams
    trigram_finder = TrigramCollocationFinder.from_words(tokens)
    trigram_finder.apply_freq_filter(min_freq)
    trigrams = set(['_'.join(trigram) for trigram in trigram_finder.nbest(trigram_measures.pmi, 10)])

    return bigrams, trigrams

def is_valid_ngram(ngram, existing_ngrams):
    """ Check if the n-gram contains alternating or duplicate words """
    words = ngram.split('_')
    unique_words = set(words)
    
    if len(unique_words) == 1:
        return False  
    
    if len(words) > 2 and words[0] == words[2]:  
        return False  

    if len(words) == 3 and words[1] == words[2]:
        return False 
    
    if len(words) == 2:
        for existing_ngram in existing_ngrams:
            if ngram in existing_ngram:
                return False  
    
    return True

def preprocess_text(doc, video_id, tag_weight=2, ngram_weight_factor=2):
    # Clean punctuation at the end of words
    doc = re.sub(r'([a-zA-Z]+)[,;:!?.]', r'\1', doc)

    # Lowercase and remove punctuation
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)

    # Remove stopwords & non-latin words
    tokens = [word for word in tokens if word not in stop_words and word.isalpha() and is_latin_script(word)]

    # POS tagging
    tokens_with_pos = pos_tag(tokens)

    # Remove verbs and adjectives before n-gram detection
    pos_exclude = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
    filtered_tokens = [word for word, pos in tokens_with_pos if pos not in pos_exclude]

    # Detect meaningful bigram and trigram collocations
    bigrams, trigrams = detect_collocations(filtered_tokens)

    # Generate n-grams
    bigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 2)]
    trigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 3)]

    # Remove invalid n-grams (duplicates, alternating patterns)
    bigram_tokens = [bigram for bigram in bigram_tokens if is_valid_ngram(bigram, set())]
    trigram_tokens = [trigram for trigram in trigram_tokens if is_valid_ngram(trigram, bigram_tokens)]

    # Keep only meaningful n-grams from detect_collocations()
    bigram_tokens = [bigram for bigram in bigram_tokens if bigram in bigrams]
    trigram_tokens = [trigram for trigram in trigram_tokens if trigram in trigrams]

    # Remove n-grams that are just reordered versions
    unique_ngrams = set()
    filtered_bigrams = []
    filtered_trigrams = []

    for bigram in bigram_tokens:
        sorted_bigram = '_'.join(sorted(bigram.split('_')))
        if sorted_bigram not in unique_ngrams:
            unique_ngrams.add(sorted_bigram)
            filtered_bigrams.append(bigram)

    for trigram in trigram_tokens:
        sorted_trigram = '_'.join(sorted(trigram.split('_')))
        if sorted_trigram not in unique_ngrams:
            unique_ngrams.add(sorted_trigram)
            filtered_trigrams.append(trigram)

    # Count n-gram frequency
    bigram_frequencies = Counter(filtered_bigrams)
    trigram_frequencies = Counter(filtered_trigrams)

    # Merge n-grams into single tokens
    bigram_trigram_words = set()
    merged_tokens = []
    i = 0
    while i < len(filtered_tokens) - 2:  # Check for trigrams first
        trigram = f"{filtered_tokens[i]}_{filtered_tokens[i+1]}_{filtered_tokens[i+2]}"
        bigram = f"{filtered_tokens[i]}_{filtered_tokens[i+1]}"

        if trigram in filtered_trigrams:
            merged_tokens.append(trigram)
            bigram_trigram_words.add(trigram)
            i += 3  # Skip next two words since it's part of the trigram
        elif bigram in filtered_bigrams:
            merged_tokens.append(bigram)
            bigram_trigram_words.add(bigram)
            i += 2  # Skip next word since it's part of the bigram
        else:
            merged_tokens.append(filtered_tokens[i])
            i += 1

    # Append any remaining words
    while i < len(filtered_tokens):
        merged_tokens.append(filtered_tokens[i])
        i += 1

    # Remove duplicates before assigning weight
    unique_tokens = list(set(merged_tokens))

    # Assign weight based on n-gram occurrence
    weighted_tokens = []
    for token in unique_tokens:
        if token in trigram_frequencies:
            token_weight = trigram_frequencies[token] * ngram_weight_factor  
        elif token in bigram_frequencies:
            token_weight = bigram_frequencies[token] * (ngram_weight_factor)  
        else:
            token_weight = 1
        weighted_tokens.extend([token] * int(token_weight))

    # Include video tags
    if video_id in video_tags:
        tags = video_tags[video_id]
        for tag in tags:
            if tag.isalpha():
                weighted_tokens.extend([tag] * int(tag_weight))

    return ' '.join(weighted_tokens), list(bigram_trigram_words)

In [None]:
all_documents = []
preprocessed_text = []
bigram_trigram_text = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        if len(content.split()) >= 100: 
            processed_text, bigram_trigram = preprocess_text(content, video_id)
            preprocessed_text.append((video_id, processed_text))
            all_documents.append(processed_text)
            bigram_trigram_text[video_id] = bigram_trigram


# Create Dictionary and Corpus
dictionary = corpora.Dictionary([doc["preprocessed_text"].split() for doc in video_data.values()])
corpus = [dictionary.doc2bow(doc["preprocessed_text"].split()) for doc in video_data.values()]

In [5]:
for n in video_ids:
    print("Video ID:", n)
    print(f"{bigram_trigram_text[n]}")
    print()

Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­
['chocolate_muffin', 'bread_roll', 'infinity_pool_beach', 'palm_trees', 'coffee_wake', 'wear_shoes', 'rainy_season', 'beer_bottle_goodness', 'wet_clothes', 'hair_mess']

Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army
['salt_win', 'helicopters_power_troopers', 'sea_sea_ground', 'helicopters_power']

Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines
['house_tour', 'hand_cut', 'tons_storage', 'master_bath']

Video ID: 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!
['balut_duck', 'soup_egg', 'coconut_vinegar', 'kinilaw_tamilok', 'egg_egg_yolk']

Video ID: 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi
['switches_voice', 'head_voice', 'voice_mix']

Video ID: 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United
[]

Video ID: 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises
['dollars_aid', 'morning_post', 'plant_flag', 'hotel_s

In [None]:
# Train LDA Model
lda_model_12 = LdaModel(corpus, num_topics=12, id2word=dictionary, alpha='auto', eta='auto', passes=100, random_state=42)

# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model_12, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

# Print Topics
topics = lda_model_12.print_topics(num_words=100)
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {topic_words}")


Coherence Score: -2.9385038790588855
Topic 0: 0.003*"pesos" + 0.003*"sit" + 0.003*"streets" + 0.003*"market" + 0.002*"water" + 0.002*"videos" + 0.002*"baby" + 0.002*"idea" + 0.002*"eyes" + 0.002*"ice_cream" + 0.002*"trip" + 0.002*"manila" + 0.002*"air" + 0.002*"town" + 0.002*"film" + 0.002*"sort" + 0.002*"trees" + 0.002*"culture" + 0.002*"pay" + 0.002*"god" + 0.002*"bars" + 0.002*"sign" + 0.002*"clothes" + 0.002*"glass" + 0.002*"security" + 0.002*"degrees" + 0.002*"breeze" + 0.002*"walls" + 0.002*"bills" + 0.002*"future" + 0.002*"types" + 0.002*"route" + 0.002*"vehicle" + 0.002*"steel" + 0.002*"sounds" + 0.002*"spirits" + 0.002*"drinking" + 0.002*"bed" + 0.002*"mangoes" + 0.001*"size" + 0.001*"couple" + 0.001*"meal" + 0.001*"car" + 0.001*"transportation" + 0.001*"family" + 0.001*"islands" + 0.001*"cross" + 0.001*"bike" + 0.001*"head" + 0.001*"hand" + 0.001*"pick" + 0.001*"feet" + 0.001*"variety" + 0.001*"tea" + 0.001*"owner" + 0.001*"beach" + 0.001*"spot" + 0.001*"center" + 0.001*"coff

In [7]:
topic_to_videos = defaultdict(list)

video_topic_mapping_12 = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.3

# Dictionary to store topic words for each video
video_topic_words_LDA1_12 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model_12.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_12[video_id] = assigned_topics  # Store assigned topics per video

    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        words = [word for word, _ in lda_model_12.show_topic(topic, topn=100)]  # Get top 100 words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_LDA1_12[video_id] = "; ".join(topic_words)  

# Count occurrences of each topic
topic_counts = Counter()

for topics in video_topic_mapping_12.values():
    for topic in topics:
        topic_counts[topic] += 1

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts.items()):
    print(f"Topic {topic}: {count} videos")

# Print topics assigned per video
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_12.items():
    topic_list = ', '.join(map(str, topics)) if topics else "No dominant topic"
    print(f"Video ID: {video_id} → Topics: {topic_list}")

# Print videos per topic
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos.keys()): 
    top_words = lda_model_12.show_topic(topic_id, 200)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")

topic_word_contributions = {}
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model_12.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_12[video_id] = assigned_topics
    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)
    topic_word_contributions[video_id] = {topic: lda_model_12.show_topic(topic, topn=100) for topic in assigned_topics}


Number of Videos Per Topic:
Topic 0: 9 videos
Topic 1: 8 videos
Topic 2: 12 videos
Topic 3: 6 videos
Topic 4: 8 videos
Topic 5: 9 videos
Topic 6: 9 videos
Topic 7: 9 videos
Topic 8: 6 videos
Topic 9: 13 videos
Topic 10: 3 videos
Topic 11: 11 videos

Topics Assigned Per Video:
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­ → Topics: 2
Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army → Topics: 0
Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines → Topics: 0
Video ID: 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY! → Topics: 3
Video ID: 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi → Topics: 11
Video ID: 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United → Topics: 0
Video ID: 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises → Topics: 3
Video ID: 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­ → Topics: 1
Video ID: 6ftiWoCJ4dM_Filipino Food In The Neth

In [None]:
print("\nVideos Assigned Per Topic:")
for topic, video_list in sorted(topic_to_videos.items()):
    print(f"\nTopic {topic}:")
    for video in video_list:
        print(f"- {video}")


In [8]:
topic_to_videos = defaultdict(list)
video_topic_mapping_12 = {}
prob_threshold = 0.3
video_topic_words_LDA1_12 = defaultdict(list)
topic_word_contributions = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model_12.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics above threshold
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_12[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)

    # Identify words contributing to each assigned topic
    topic_word_contributions[video_id] = defaultdict(dict)
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}  # Convert doc_bow IDs to words

    for topic, prob in assigned_topics.items():
        topic_top_words = dict(lda_model_12.show_topic(topic, topn=2000))  # Get top words for the topic
        word_contributions = {}

        # Ensure we capture all contributing words
        for word in bow_words:
            if word in topic_top_words:
                word_contributions[word] = topic_top_words[word]
            else:
                print(f"Warning: Word '{word}' not found in Topic {topic} top words.")  # Debugging

        # Store word contributions per topic
        topic_word_contributions[video_id][topic] = word_contributions
        topic_words = ", ".join(topic_top_words.keys())
        video_topic_words_LDA1_12[video_id].append(f"Topic {topic}: {topic_words}")

# Print contributing words per video
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions.items():
    print(f"Video ID: {video_id}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])
        topic_prob = video_topic_mapping_12[video_id][topic]
        
        # Ensure that all contributing words are displayed
        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")



Words Contributing to Topics Per Video:
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­
  Topic 2 (Probability: 0.9860000014305115):
    Contributing Words: water (0.003000000026077032), idea (0.0020000000949949026), manila (0.0020000000949949026), boat (0.0020000000949949026), yesterday (0.0020000000949949026), stay (0.0020000000949949026), experience (0.0020000000949949026), kids (0.0020000000949949026), couple (0.0020000000949949026), milk (0.0020000000949949026), street (0.0020000000949949026), road (0.0020000000949949026), floor (0.0020000000949949026), tourist (0.0020000000949949026), dollars (0.0020000000949949026), pass (0.0020000000949949026), chicken (0.0020000000949949026), spending (0.0020000000949949026), level (0.0020000000949949026), types (0.0020000000949949026), ice (0.0020000000949949026), coffee (0.0020000000949949026), meet (0.0020000000949949026), sit (0.0020000000949949026), videos (0.0020000000949949026), balcony (0.0020000000949949026)

In [None]:
# Train LDA Model
lda_model_15 = LdaModel(corpus, num_topics=15, id2word=dictionary, alpha='auto', eta='auto', passes=100, random_state=42)

topic_to_videos_15 = defaultdict(list)

video_topic_mapping_15 = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1_15 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model_15.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping_15[video_id] = assigned_topics  # Store assigned topics per video

    for topic in assigned_topics:
        topic_to_videos_15[topic].append(video_id)

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        words = [word for word, _ in lda_model_15.show_topic(topic, topn=120)]  # Get top 100 words
        topic_words.append(", ".join(words))  # Convert list to string

# Count occurrences of each topic
topic_counts = Counter()

for topics in video_topic_mapping_15.values():
    for topic in topics:
        topic_counts[topic] += 1

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts.items()):
    print(f"Topic {topic}: {count} videos")

# Print topics assigned per video
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping_15.items():
    topic_list = ', '.join(map(str, topics)) if topics else "No dominant topic"
    print(f"Video ID: {video_id} → Topics: {topic_list}")

# Print videos per topic
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos_15.keys()): 
    top_words = lda_model_15.show_topic(topic_id, 120)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")


Number of Videos Per Topic:
Topic 0: 9 videos
Topic 1: 4 videos
Topic 2: 7 videos
Topic 3: 5 videos
Topic 4: 11 videos
Topic 5: 7 videos
Topic 6: 9 videos
Topic 7: 6 videos
Topic 8: 4 videos
Topic 9: 10 videos
Topic 10: 2 videos
Topic 11: 15 videos
Topic 12: 11 videos
Topic 13: 4 videos
Topic 14: 7 videos

Topics Assigned Per Video:
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­ → Topics: 8
Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army → Topics: 0
Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines → Topics: 0
Video ID: 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY! → Topics: 3
Video ID: 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi → Topics: 7
Video ID: 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United → Topics: 0
Video ID: 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises → Topics: 12
Video ID: 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­ →

In [None]:
print("\nVideos Assigned Per Topic:")
for topic, video_list in sorted(topic_to_videos_15.items()):
    print(f"\nTopic {topic}:")
    for video in video_list:
        print(f"- {video}")


In [10]:
topic_to_videos_15 = defaultdict(list)
video_topic_mapping_15 = {}
prob_threshold = 0.2
video_topic_words_LDA1_15 = defaultdict(list)  # Store topics separately
topic_word_contributions = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model_15.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics above threshold
    assigned_topics = {topic: prob for topic, prob in topic_distribution if prob >= prob_threshold}
    video_topic_mapping_15[video_id] = assigned_topics

    for topic in assigned_topics:
        topic_to_videos_15[topic].append(video_id)

    # Identify words contributing to each assigned topic
    topic_word_contributions[video_id] = defaultdict(dict)
    bow_words = {dictionary[word_id] for word_id, _ in doc_bow}  # Convert doc_bow IDs to words

    for topic, prob in assigned_topics.items():
        topic_top_words = dict(lda_model_15.show_topic(topic, topn=2000))  # Get top words for the topic
        word_contributions = {}

        # Ensure we capture all contributing words
        for word in bow_words:
            if word in topic_top_words:
                word_contributions[word] = topic_top_words[word]

        # Store word contributions per topic
        topic_word_contributions[video_id][topic] = word_contributions
        topic_words = ", ".join(topic_top_words.keys())
        video_topic_words_LDA1_15[video_id].append(f"Topic {topic}: {topic_words}")

In [11]:
# Print contributing words per video
print("\nWords Contributing to Topics Per Video:")
for video_id, topic_data in topic_word_contributions.items():
    print(f"Video ID: {video_id}")
    for topic, word_scores in topic_data.items():
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        top_contributing_words = ", ".join([f"{word} ({round(score, 3)})" for word, score in sorted_words])
        topic_prob = video_topic_mapping_15[video_id][topic]
        
        # Ensure that all contributing words are displayed
        if top_contributing_words.strip():
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    Contributing Words: {top_contributing_words}\n")
        else:
            print(f"  Topic {topic} (Probability: {round(topic_prob, 3)}):\n    No detected contributing words!\n")


Words Contributing to Topics Per Video:
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­
  Topic 8 (Probability: 0.6200000047683716):
    Contributing Words: bed (0.003000000026077032), combinations (0.003000000026077032), stand (0.003000000026077032), rainy_season (0.003000000026077032), chocolate_muffin (0.003000000026077032), infinity_pool_beach (0.003000000026077032), coffee_wake (0.003000000026077032), bread_roll (0.003000000026077032), beer_bottle_goodness (0.003000000026077032), hair_mess (0.003000000026077032), wet_clothes (0.003000000026077032), wear_shoes (0.003000000026077032), palm_trees (0.003000000026077032), breakfast (0.0020000000949949026), restaurant (0.0020000000949949026), bar (0.0020000000949949026), drive (0.0020000000949949026), office (0.0020000000949949026), grass (0.0020000000949949026), view (0.0020000000949949026), level (0.0020000000949949026), rate (0.0020000000949949026), version (0.0020000000949949026), restaurants (0.0020000000