In [67]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./standard_dataset"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "LDA Topics": None,
                "BERTopic Topics": None,
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df


Unnamed: 0_level_0,Link,LDA Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,,,[]
Philippines Army vs Thailand Army,https://www.youtube.com/watch?v=0IMWasj76yU,,,[]
Tour of The House We Built in The Philippines,https://www.youtube.com/watch?v=1kErCqgIVMk,,,[]
Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!,https://www.youtube.com/watch?v=2ftG8JuMzz4,,,[]
VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi,https://www.youtube.com/watch?v=2TmagN6RhkI,,,[]
...,...,...,...,...
"Vi & I Visit the Roxas Night Market, Davao",https://www.youtube.com/watch?v=Z9Uz-NUaCG8,,,[]
Filipina British Life in UK May pa request si mister!ðŸ¤,https://www.youtube.com/watch?v=ZFpnR0xpdRI,,,[]
"WILD WINGS DAVAO - BEST CHICKEN WINGS, BILAO FILIPINO FOOD FEAST - DAVAO FOOD - ANUJ GABA",https://www.youtube.com/watch?v=zTcCSyucYqI,,,[]
(ENG) SB19 Dance Cover BTS Boy With Luv & Idol - Indonesian Reaction,https://www.youtube.com/watch?v=zwOJjQuL4i4,,,[]


### HLTA

In [68]:
# Store results as a flat list
import json

nodes_filename = 'standard_test_1.nodes.json'
nodes_filepath = './HLTM/output-jsons/' + nodes_filename

with open(nodes_filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

print(flat_list)

[{'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'public', 'luke'], 'level': 2}, {'id': 'Z120', 'texts': ['boil', 'vegetable', 'waste', 'original', 'largest', 'decide', 'guest'], 'level': 2}, {'id': 'Z116', 'texts': ['sing', 'singing', 'vocal', 'stage', 'tnt-boy', 'note', 'male'], 'level': 2}, {'id': 'Z119', 'texts': ['primate', 'tarsier', 'peace', 'animal', 'legend', 'banana-ketchup', 'band'], 'level': 2}, {'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'pu

In [69]:
from collections import defaultdict

topic_map_filepath = './HLTM/output-jsons/' + 'standard_test_1.topics.json'

with open(topic_map_filepath, 'r') as f:
    topic_mapping_data = json.load(f)

topic_ids_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
# doc id serves as the grouping key
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [70]:
# link the top words to the video instead of just the topic id
topic_to_words = {entry["id"]: (", ".join(entry["texts"]), entry["level"]) for entry in flat_list}


# Step 2: Convert topic_mapping_data into topics_per_vid_mapping
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))

# Step 3: Store transformed data in a new dictionary
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        if t in topic_to_words:
            topic_words, level = topic_to_words[t]
            topic_words_per_vid_mapping[doc].append((topic_words, level, p))
        else:
            topic_words_per_vid_mapping[doc].append((t, None, p))  # Handle case where topic ID is not found

# Step 4: Print the results
for doc, topic_list in topic_words_per_vid_mapping.items():
    topics_str = ", ".join(f"{t} (Level {lvl}): {p:.2f}" if lvl is not None else f"{t}: {p:.2f}" for t, lvl, p in topic_list)
    print(f"Document {doc} - {topics_str}")

Document 34 - yesterday, camera, shopping, coffee, intestine, option, main-street (Level 2): 1.00, beach, explore, weather, road, ride, hotel, car (Level 2): 1.00, photo, session, goodness, hair (Level 2): 0.97, private, wake, entrance, cheap, south, cafe, build (Level 2): 1.00, colt, brooklyn, bgc, month, baby, late, carry (Level 2): 0.98, pay, drive, tourist, chocolate-hill, boat, ticket, real (Level 2): 1.00, chicken-chicken, tasty, singapore, mushroom, dish, honest, mix (Level 2): 1.00, swim, arrive, typhoon, rent, nature, coconut, cuisine (Level 2): 0.98, remind, fried, bite, roasted, bit, wrong, pie (Level 2): 0.98, quick, busy, bed, close, toilet, kitchen, bedroom (Level 2): 1.00, pizza, foot, scared, tricycle, wave, scary, pasta (Level 2): 1.00, random, extra, finally, taxi, fridge, driver, milk (Level 2): 1.00, peach, italian, choice, garlic-rice, laughter, dinner, juicy (Level 2): 1.00, previous, climb, cooky, hmmm, surprise, request, maja (Level 2): 0.54, airport, chill, fli

In [None]:
# append into main dataframe
for doc, topic_list in topic_words_per_vid_mapping.items():
    t = ""
    for topic in topic_list:
       t += topic[0] + " : Level " + str(topic[1]) + " ({:.2f})".format(topic[2]) + '\n'

    topics_per_video_df.iloc[int(doc), 3] = t

topics_per_video_df


### BERTopic

### LDA

In [1]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./standard_dataset"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "LDA Topics": None,
                "BERTopic Topics": None,
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

In [2]:
import os
import random
import requests
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import FastText,  CoherenceModel
import numpy as np
import spacy
import string
from wordcloud import WordCloud
import unicodedata
import matplotlib.pyplot as plt
from nltk.util import ngrams
from collections import defaultdict, Counter
import itertools

In [3]:
# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                    'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                    'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow',
                    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
                    'journalist', 'headline', 'reporter', 'current events', 'special report', 
                    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
                    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
                    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
                    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
                    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
                    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
                    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
                    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
                    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
                    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
                    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
                    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
                    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
                    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
                    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
                    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
                    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
                    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
                    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
                    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
                    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommend', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
                    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo', 'philippines', 'philippine', 'british', 'filipino', 'video', 
                    'http', 'korea', 'korean', 'youtube', 'google', 'united', 'america', 'american', 'kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 
                    'seventeen', 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
stop_words.update(custom_stop_words, github_stopwords)

lemmatizer = WordNetLemmatizer()

In [35]:
# Helper function to check if a word contains only Latin script characters
def is_latin_script(word):
    for char in word:
        if not ('LATIN' in unicodedata.name(char, '') or char.isdigit()):
            return False
    return True

def preprocess_text(doc, video_id, ngram_range=(1, 2), tag_weight=1, ngram_weight_factor=1):
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)
    tokens_with_pos = pos_tag(tokens)

    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tokens_with_pos]

    # Generate n-grams
    ngram_tokens = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngram_tokens.extend([' '.join(gram) for gram in ngrams(lemmatized_tokens, n)])

    # Count n-gram frequency
    ngram_frequencies = Counter(ngram_tokens)

    # Filter stopwords and non-useful words
    filtered_tokens = []
    for word, pos in tokens_with_pos:
        if not word.isalpha() or word in stop_words:
            continue
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            continue  # Remove verbs and adjectives
        filtered_tokens.append(word)

    # Assign weight based on n-grams 
    weighted_tokens = []
    for token in filtered_tokens:
        token_weight = 1 + ngram_frequencies.get(token, 0) * ngram_weight_factor
        weighted_tokens.extend([token] * int(token_weight))

    # Include video tags
    if video_id in video_tags:
        tags = video_tags[video_id]
        for tag in tags:
            if tag.isalpha():
                tag_weight_multiplier = tag_weight
                weighted_tokens.extend([tag] * int(tag_weight_multiplier))
    return ' '.join(weighted_tokens)

def get_wordnet_pos(treebank_tag):
    """Convert POS tag to WordNet format for lemmatization."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun
    
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
        else:
            video_tags[video_id] = []  # Default to empty list if no tags
    return video_tags

video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

all_documents = []
for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        if len(content.split()) >= 100: 
            processed_text = preprocess_text(content, video_id)  
            all_documents.append(processed_text)

In [39]:
# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary([doc.split() for doc in all_documents])
corpus = [dictionary.doc2bow(doc.split()) for doc in all_documents]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

Coherence Score: -3.6442045237457896


In [40]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    print(f"Video {video_id}: {topic_distribution}")

topics = lda_model.print_topics(num_words=20) 
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {topic_words}")


Video --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­: [(0, 6.2686395e-06), (1, 0.9999883), (2, 5.355367e-06)]
Video 0IMWasj76yU_Philippines Army vs Thailand Army: [(0, 0.00020046456), (1, 8.2193e-05), (2, 0.9997173)]
Video 1kErCqgIVMk_Tour of The House We Built in The Philippines: [(0, 0.99993473), (1, 2.115899e-05), (2, 4.408735e-05)]
Video 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!: [(0, 4.1793166e-05), (1, 1.7135724e-05), (2, 0.99994105)]
Video 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi: [(0, 0.9997975), (1, 6.565967e-05), (2, 0.00013680996)]
Video 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United: [(0, 0.0017646838), (1, 0.0007235419), (2, 0.99751174)]
Video 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises: [(0, 5.651661e-05), (1, 0.9998952), (2, 4.8282764e-05)]
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: [(0, 0.2750583), (1, 0.23262753), (2, 0.49231413)]
Vid

In [41]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0]  # Highest probability
    print(f"Video {video_id}: Dominant Topic {dominant_topic[0]} with probability {dominant_topic[1]:.4f}")


Video --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­: Dominant Topic 1 with probability 1.0000
Video 0IMWasj76yU_Philippines Army vs Thailand Army: Dominant Topic 2 with probability 0.9997
Video 1kErCqgIVMk_Tour of The House We Built in The Philippines: Dominant Topic 0 with probability 0.9999
Video 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!: Dominant Topic 2 with probability 0.9999
Video 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi: Dominant Topic 0 with probability 0.9998
Video 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United: Dominant Topic 2 with probability 0.9975
Video 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises: Dominant Topic 1 with probability 0.9999
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: Dominant Topic 2 with probability 0.4923
Video 6ftiWoCJ4dM_Filipino Food In The Netherlands  Cafe Nordrick!!: Dominant Topic 0 with probability 0.9993
Video 7j

In [42]:
from collections import defaultdict

# Dictionary to store the count of videos per topic
topic_video_count = defaultdict(int)

# Mapping of video ID to its dominant topic
video_topic_mapping = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get the most dominant topic (highest probability)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0][0]
    
    # Store the mapping
    video_topic_mapping[video_id] = dominant_topic
    
    # Increase count for that topic
    topic_video_count[dominant_topic] += 1

# Print number of videos assigned to each topic
print("\nNumber of Videos per Topic:")
for topic, count in sorted(topic_video_count.items()):
    print(f"Topic {topic}: {count} videos")



Number of Videos per Topic:
Topic 0: 41 videos
Topic 1: 13 videos
Topic 2: 46 videos


In [None]:
topic_to_videos = defaultdict(list)

video_topic_mapping = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping[video_id] = assigned_topics  # Store assigned topics per video

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        words = [word for word, _ in lda_model.show_topic(topic, topn=10)]  # Get top 10 words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_LDA1[video_id] = "; ".join(topic_words)  # Separate topics with `;`

# Count occurrences of each topic
topic_counts = Counter()

for topic in assigned_topics:
    topic_to_videos[topic].append(video_id)

for topics in video_topic_mapping.values():
    for topic in topics:
        topic_counts[topic] += 1

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts.items()):
    print(f"Topic {topic}: {count} videos")

# Print topics assigned per video
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping.items():
    topic_list = ', '.join(map(str, topics)) if topics else "No dominant topic"
    print(f"Video ID: {video_id} → Topics: {topic_list}")

# Print videos per topic
print("\nTop Words Per Topic:")
num_words = 30  

for topic_id in sorted(topic_to_videos.keys()): 
    top_words = lda_model.show_topic(topic_id, num_words)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")



Number of Videos Per Topic:

Topics Assigned Per Video:

Top Words Per Topic:


In [45]:
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Load video tags
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  
        else:
            video_tags[video_id] = []  
    return video_tags

# Load transcripts
video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

video_topics = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        
        if len(content.split()) >= 100:  
            processed_text = preprocess_text(content, video_id)
            
            # Create Dictionary and Corpus
            tokens = processed_text.split()
            corpus = [dictionary.doc2bow(tokens)]
            
            # Train LDA Model
            num_topics = 3  # Limit topics to at most 5 or vocab size
            lda_model_2 = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)
            
            # Compute Coherence Score
            coherence_model = CoherenceModel(model=lda_model_2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
            coherence_score = coherence_model.get_coherence()
            
            # Get topic distribution for the video
            topic_distribution = lda_model_2.get_document_topics(corpus[0], minimum_probability=0)
            assigned_topics = {topic: prob for topic, prob in topic_distribution if prob > 0.5}  # Threshold for multiple topics
            
            # Store video topic info
            video_topics[video_id] = {
                "topics": assigned_topics,
                "top_words": {i: lda_model_2.show_topic(i, 20) for i in range(num_topics)},
                "coherence": coherence_score
            }

# Dictionary to store LDA2 topic words for each video
video_topic_words_lda2 = {}

for video_id, data in video_topics.items():
    topic_words = []
    
    for topic_id, prob in data["topics"].items():
        words = [word for word, _ in data["top_words"][topic_id]]  # Extract top words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_lda2[video_id] = "; ".join(topic_words)  # Separate topics with `;`
    
# Print topics per video
for video_id, data in video_topics.items():
    print(f"\nVideo ID: {video_id}")
    print(f"Coherence Score: {data['coherence']:.4f}")
    print("Assigned Topics:")
    for topic_id, prob in data["topics"].items():
        print(f"  Topic {topic_id} ({prob:.2f} probability): {[word for word, _ in data['top_words'][topic_id]]}")


Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­
Coherence Score: -4.7991
Assigned Topics:
  Topic 2 (1.00 probability): ['beach', 'hotel', 'water', 'jet', 'god', 'sand', 'tide', 'couple', 'fun', 'hour', 'pool', 'night', 'hat', 'power', 'ski', 'view', 'bed', 'yesterday', 'margarita', 'ocean']

Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army
Coherence Score: -4.8475
Assigned Topics:
  Topic 0 (1.00 probability): ['salt', 'sad', 'military', 'sea', 'power', 'ground', 'air', 'force', 'helicopters', 'forces', 'troopers', 'tank', 'helicopt', 'naval', 'helicop', 'tanks', 'massage', 'motoribikes', 'motivation', 'membership']

Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines
Coherence Score: -4.8475
Assigned Topics:
  Topic 1 (1.00 probability): ['water', 'house', 'floor', 'glass', 'door', 'steel', 'tile', 'grill', 'finish', 'light', 'sun', 'storage', 'stuff', 'tour', 'kubo', 'job', 'bathroom', 'handmade', 'leak', 'hand']

Video ID: 2ftG8JuMz

In [None]:
import re

def extract_video_id_from_key(key):
    """Extracts only the video ID from the dictionary key."""
    match = re.match(r"([A-Za-z0-9_-]{11})", key) 
    return match.group(1) if match else key  

video_topic_words_LDA1_fixed = {extract_video_id_from_key(k): v for k, v in video_topic_words_LDA1.items()}
video_topic_words_LDA2_fixed = {extract_video_id_from_key(k): v for k, v in video_topic_words_lda2.items()}

topics_per_video_df["LDA1 Topics"] = topics_per_video_df["Video ID"].map(video_topic_words_LDA1_fixed)
topics_per_video_df["LDA2 Topics"] = topics_per_video_df["Video ID"].map(video_topic_words_LDA2_fixed)


In [66]:
topics_per_video_df 

Unnamed: 0_level_0,Link,BERTopic Topics,HLTA Topics,LDA1 Topics,LDA2 Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,,[],"baguio, beach, stuff, market, hotel, strawberry, view, water, park, dale","beach, hotel, water, jet, god, sand, tide, couple, fun, hour, pool, night, hat, power, ski, view, bed, yesterday, margarita, ocean"
Philippines Army vs Thailand Army,https://www.youtube.com/watch?v=0IMWasj76yU,,[],"roline, pizza, christmas, chef, pork, chicken, sauce, restaurant, egg, hour","salt, sad, military, sea, power, ground, air, force, helicopters, forces, troopers, tank, helicopt, naval, helicop, tanks, massage, motoribikes, motivation, membership"
Tour of The House We Built in The Philippines,https://www.youtube.com/watch?v=1kErCqgIVMk,,[],"island, chicken, rice, damian, manila, lechon, water, spot, brother, father","water, house, floor, glass, door, steel, tile, grill, finish, light, sun, storage, stuff, tour, kubo, job, bathroom, handmade, leak, hand"
Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!,https://www.youtube.com/watch?v=2ftG8JuMzz4,,[],"roline, pizza, christmas, chef, pork, chicken, sauce, restaurant, egg, hour","meat, balut, tamilok, soup, etag, egg, kamaru, taste, dish, list, duck, salt, vinegar, woodworm, delicacy, oyster, insect, pampanga, coconut, darker"
VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi,https://www.youtube.com/watch?v=2TmagN6RhkI,,[],"island, chicken, rice, damian, manila, lechon, water, spot, brother, father","voice, song, forget, mix, melody, falsetto, vocalist, control, schedule, rasp, head, stream, switches, singer, color, belt, god, intricate, tons, regina"
"Balut!!!!! - Manila, Philippines - Now United",https://www.youtube.com/watch?v=3n1nspLgfd4,,[],"roline, pizza, christmas, chef, pork, chicken, sauce, restaurant, egg, hour","night, duck, egg, baby, vice, book, level, boy, pilot, engine, bullet, ate, cebu, mangoes, bananas, crickets, balloons, traffics, tiktoks, tam"
Did China Buy the Philippines With False Investment Promises,https://www.youtube.com/watch?v=401ZvQ11euA,,[],"baguio, beach, stuff, market, hotel, strawberry, view, water, park, dale","duterte, investment, dollar, province, jetski, hotel, infrastructure, money, sea, hand, aid, episode, jia, flag, plant, support, horse, ambassador, debt, trap"
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,,[],"island, chicken, rice, damian, manila, lechon, water, spot, brother, father; baguio, beach, stuff, market, hotel, strawberry, view, water, park, dale; roline, pizza, christmas, chef, pork, chicken, sauce, restaurant, egg, hour","hotel, mango, flavor, cumin, salad, peach, cebu, rice, iinit, grape, butter, ganda, pool, airplane, coffee, trip, potato, pinas, potatoes, taste"
Filipino Food In The Netherlands Cafe Nordrick!!,https://www.youtube.com/watch?v=6ftiWoCJ4dM,,[],"island, chicken, rice, damian, manila, lechon, water, spot, brother, father","bread, hague, coffee, cafe, longsiloc, atmosphere, husband, longganisa, kababayans, combination, son, ensaymada, breakfast, groningen, menu, expats, tocino, barako, nordrick, customers"
Approve ba ako sa anak at mama ni misterðŸ¤”Mia Sandoval,https://www.youtube.com/watch?v=7jDrzdX2Ucc,,[],"island, chicken, rice, damian, manila, lechon, water, spot, brother, father","night, john, phone, fawkes, family, visa, seafood, morning, son, battery, bonfire, wind, eye, girl, pallets, burn, plan, maja, competition, davao"


In [87]:
pd.set_option("display.max_rows", None) 
pd.set_option("display.max_columns", None) 
pd.set_option("display.max_colwidth", None)  
pd.set_option("display.width", 1000)  

topics_per_video_df.to_csv("topics_per_video.csv")
