In [None]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./standard_dataset"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in sorted(os.listdir(dataset_folder)): 
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "LDA1 Topics": None,
                "LDA2 Topics": None,
                "BERTopic Topics": None,
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df


### HLTA

In [None]:
# Store results as a flat list
import json

nodes_filename = 'fil-revised-1.nodes.json'
nodes_filepath = './HLTM/output-jsons/' + nodes_filename

with open(nodes_filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

print(flat_list)

In [None]:
from collections import defaultdict

topic_map_filepath = './HLTM/output-jsons/' + 'fil-revised-1.topics.json'

with open(topic_map_filepath, 'r') as f:
    topic_mapping_data = json.load(f)

topic_ids_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
# doc id serves as the grouping key
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [None]:
# link the top words to the video instead of just the topic id
topic_to_words = {entry["id"]: (", ".join(entry["texts"]), entry["level"]) for entry in flat_list}


# Step 2: Convert topic_mapping_data into topics_per_vid_mapping
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))

# Step 3: Store transformed data in a new dictionary
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        if t in topic_to_words:
            topic_words, level = topic_to_words[t]
            topic_words_per_vid_mapping[doc].append((topic_words, level, p))
        else:
            topic_words_per_vid_mapping[doc].append((t, None, p))  # Handle case where topic ID is not found

# Step 4: Print the results
for doc, topic_list in topic_words_per_vid_mapping.items():
    topics_str = ", ".join(f"{t} (Level {lvl}): {p:.2f}" if lvl is not None else f"{t}: {p:.2f}" for t, lvl, p in topic_list)
    print(f"Document {doc} - {topics_str}")

In [None]:
# append into main dataframe
for doc, topic_list in topic_words_per_vid_mapping.items():
    t = ""
    for topic in topic_list:
       t += topic[0] + " : Level " + str(topic[1]) + " ({:.2f})".format(topic[2]) + '\n'

    topics_per_video_df.iloc[int(doc), 4] = t

topics_per_video_df


### BERTopic

In [None]:
# Initialize list to store data
experiment_records = []

# Iterate through transcript files
for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, _ = match.groups()  # Extract Video ID
            file_path = os.path.join(dataset_folder, file)

            # Read transcript content
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read().strip()  # Remove extra spaces/newlines

            # Append data to list
            experiment_records.append({
                "Video Id": video_id,
                "Transcript": transcript
            })

# Convert to DataFrame
experiment_df = pd.DataFrame(experiment_records)

# Display the first few rows to verify
experiment_df

In [None]:
# initial pre-processing (minimal)

# Define a minimal preprocessing function
def minimal_preprocessing(text):
    # Remove timestamps and other non-verbal elements (like "(music)")
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in square brackets (e.g., music)
    text = re.sub(r'\(.*?\)', '', text)  # Remove content in parentheses (e.g., [music], (child speaking))
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = re.sub(r'\s+', ' ', text)    # Replace multiple spaces with one
    text = text.strip()                 # Remove leading and trailing whitespace
    return text

# Apply the preprocessing to the 'Transcript' column
experiment_df['cleaned_transcript'] = experiment_df['Transcript'].apply(minimal_preprocessing)

# Display the cleaned data
experiment_df[['Transcript', 'cleaned_transcript']]

In [None]:
import torch
import nltk
import swifter
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use Apple's Metal (MPS) for acceleration if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)

In [None]:
def encode_sentences(sentences, batch_size=16):
    """Encodes sentences into vector representations using mBERT."""
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**encoded_inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu()  # Move to CPU for further processing
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)


In [None]:
def compute_cosine_similarities(embeddings):
    """Computes cosine similarity efficiently in batch."""
    return cosine_similarity(embeddings[:-1], embeddings[1:]).diagonal()

In [None]:
def segment_transcript(transcript, threshold=0.7, min_length=30):
    """Segments transcript into meaningful chunks based on sentence similarity."""
    sentences = nltk.sent_tokenize(transcript)
    sentences = [s.strip() for s in sentences if s.strip()]

    if len(sentences) < 2:
        return [transcript]  # Return the whole text if it's too short

    # Encode sentences
    sentence_embeddings = encode_sentences(sentences)

    # Compute cosine similarity between consecutive sentences
    similarities = compute_cosine_similarities(sentence_embeddings)

    # Identify segment boundaries where similarity drops below threshold
    boundaries = [0] + [i + 1 for i, sim in enumerate(similarities) if sim < threshold] + [len(sentences)]

    # Create segments
    segments = [" ".join(sentences[boundaries[i]:boundaries[i + 1]]) for i in range(len(boundaries) - 1)]

    # Merge small segments
    processed_segments, temp_segment = [], ""
    for segment in segments:
        if len(segment) + len(temp_segment) < min_length:
            temp_segment += " " + segment
        else:
            if temp_segment:
                processed_segments.append(temp_segment.strip())
            temp_segment = segment
    if temp_segment:
        processed_segments.append(temp_segment.strip())

    # Ensure at least two segments
    if len(processed_segments) == 1:
        mid = len(sentences) // 2
        processed_segments = [" ".join(sentences[:mid]), " ".join(sentences[mid:])]

    return processed_segments


In [None]:
# Apply segmentation using parallel processing
experiment_df['Segments'] = experiment_df['cleaned_transcript'].swifter.apply(segment_transcript)

In [None]:
# Convert to list of dictionaries
segmented_transcripts = [
    {"Video Id": row["Video Id"], "Segment #": i + 1, "Segment": segment}
    for _, row in experiment_df.iterrows()
    for i, segment in enumerate(row["Segments"])
]

In [None]:
# Convert to DataFrame
segmented_df = pd.DataFrame(segmented_transcripts)

# Display segmented transcripts
print(segmented_df)

In [None]:
# Display segmented transcripts in a structured way
for video_id, group in segmented_df.groupby("Video Id"):
    print(f"\n==== Transcript for Video Id: {video_id} ====")
    for index, row in group.iterrows():
        print(f"\nSegment {row['Segment #']}:\n{row['Segment']}")
    print("\n" + "=" * 50)  # Separator for clarity
    
    
# txt

In [None]:
# BERTopic on Segmented Transcripts

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load multilingual embedding model
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Initialize BERTopic with the multilingual embedding model
topic_model = BERTopic(embedding_model=embedding_model, language="multilingual")

# Extract segments and their corresponding Video Ids
segments = []
video_ids = []

for idx, row in segmented_df.iterrows():
    segments.append(row['Segment'])
    video_ids.append(row['Video Id'])

# Fit the BERTopic model
topics, _ = topic_model.fit_transform(segments)

# Create a DataFrame to track topics with Video Ids and Segment numbers
topic_data = {
    "Video Id": video_ids,
    "Segment": segments,
    "Topic": topics
}

In [None]:
# INITIAL

topic_model.get_topic_info()

In [None]:
# Create a DataFrame to organize the results
topic_df = pd.DataFrame(topic_data)

topic_df

In [None]:
# Display topics along with their corresponding Video Ids and Segment Numbers
for topic in set(topics):
    print(f"\n==== Topic {topic} ====")
    topic_segments = topic_df[topic_df['Topic'] == topic]
    for _, row in topic_segments.iterrows():
        print(f"Video Id: {row['Video Id']}, Segment: {row['Segment']}")
    print("=" * 50)

In [None]:
# UPDATE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from bertopic.representation import PartOfSpeech

import requests

# Custom stopwords list
custom_stopwords = ['like', 'yeah', 'yes', 'let', 'okay', 'gonna', 'ok', 'bye', 'just', 'got', 'right', 'mmm', 'oh', 'ah', 'gosh']

# Fetch stopwords from GitHub
github_stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
github_stopwords = set(requests.get(github_stopwords_url).text.splitlines())

# Append GitHub stopwords to the custom stopwords set
custom_stopwords = set(custom_stopwords)  # Convert to a set
custom_stopwords.update(github_stopwords)  # ✅ This now works because custom_stopwords is a set

# Combine predefined stopwords (ENGLISH_STOP_WORDS) with custom stopwords
combined_stopwords = list(ENGLISH_STOP_WORDS.union(set(custom_stopwords)))

# Create a CountVectorizer with combined stopwords as a list
vectorizer_model = CountVectorizer(stop_words=combined_stopwords)


In [None]:
representation_model = PartOfSpeech("en_core_web_sm")

In [None]:
topic_model.update_topics(segments, vectorizer_model=vectorizer_model, representation_model=representation_model)

In [None]:
# UPDATED

topic_model.get_topic_info()

In [None]:
# Visualize the topic distribution
topic_model.visualize_barchart(top_n_topics=160)

In [None]:
# Group by Video Id and get the unique topics for each Video Id
video_topics = topic_df.groupby('Video Id')['Topic'].unique()

# Display the topics for each Video Id
for video_id, topics in video_topics.items():
    print(f"Transcript Video {video_id}:")
    print(f"Topics Present: {', '.join(map(str, topics))}\n")

In [None]:
# Retrieve topic names from BERTopic
topic_info = topic_model.get_topic_info()
topic_mapping = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

# Count topic occurrences per Video Id
topic_counts = topic_df.groupby(["Video Id", "Topic"]).size().reset_index(name="Count")

# Calculate percentage per Video Id
topic_counts["Percentage"] = topic_counts.groupby("Video Id")["Count"].transform(lambda x: (x / x.sum()) * 100)

# Map topic names
topic_counts["Topic Name"] = topic_counts["Topic"].map(topic_mapping)

# Aggregate topics into a single row per Video Id with newline separation
video_topics = topic_counts.groupby("Video Id").apply(
    lambda group: "\n".join(
        [f"{row['Topic Name']} ({row['Percentage']:.2f}%)" for _, row in group.iterrows()]
    )
).reset_index(name="List of Topics with %")

# Display the final DataFrame
print(video_topics)

In [None]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Prepare the documents (segments) for gensim coherence model
documents = [segment.split() for segment in segments]  # Assuming `segments` is your list of sentences

# Create the dictionary from the segments
dictionary = Dictionary(documents)

# Extract the topics and top words
topics = topic_model.get_topics()  # Get topics and their words

# Prepare a list of coherence scores for each topic
topic_coherence_scores = []

for topic, words in topics.items():
    # Convert words to token IDs (ignore words not in dictionary)
    top_words_ids = [dictionary.token2id[word] for word, _ in words if word in dictionary.token2id]
    
    if top_words_ids:
        # Compute the coherence for the current topic
        coherence_model = CoherenceModel(
            topics=[top_words_ids],  # List of token IDs for this topic
            texts=documents,         # Segments as tokenized documents
            dictionary=dictionary,   # Dictionary for token mapping
            coherence='c_v'          # You can change this to other methods like 'u_mass', 'c_uci', or 'c_npmi'
        )
        
        # Append the coherence score for the current topic
        topic_coherence_scores.append(coherence_model.get_coherence())

In [None]:
# Print coherence scores per topic
for idx, score in enumerate(topic_coherence_scores):
    print(f"Coherence Score for Topic {idx + 1}: {score}")

In [None]:
topic_info = topic_model.get_topic_info()

topic_info.to_csv("WEEK6_BERTopic_info.csv", index=False)

In [None]:
# Extract Video Id from the 'Link' column in topics_per_video_df
topics_per_video_df["Video Id"] = topics_per_video_df["Link"].str.extract(r"v=([a-zA-Z0-9_-]+)")

# Create a mapping of Video Id to its BERTopic topics
video_topic_mapping = dict(zip(video_topics["Video Id"], video_topics["List of Topics with %"]))

# Update the 'BERTopic Topics' column based on matching Video Ids
topics_per_video_df["BERTopic Topics"] = topics_per_video_df["Video Id"].map(video_topic_mapping)

# Drop the temporary 'Video Id' column (keeping all other columns intact)
topics_per_video_df.drop(columns=["Video Id"], inplace=True)


In [None]:
# Display updated DataFrame
topics_per_video_df

In [None]:
topics_per_video_df.to_csv("topics_per_video.csv")

### LDA

In [38]:
import os
import requests
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import  CoherenceModel
import string
import unicodedata
from nltk.util import ngrams
from collections import  Counter
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder


In [39]:
# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                    'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                    'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow',
                    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
                    'journalist', 'headline', 'reporter', 'current events', 'special report', 
                    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
                    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
                    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
                    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
                    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
                    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
                    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
                    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
                    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
                    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
                    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
                    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
                    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
                    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
                    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
                    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
                    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
                    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
                    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
                    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
                    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommend', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
                    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo', 'philippines', 'philippine', 'british', 'filipino', 'video', 
                    'http', 'korea', 'korean', 'youtube', 'google', 'united', 'america', 'american', 'kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 
                    'seventeen', 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
stop_words.update(custom_stop_words, github_stopwords)

lemmatizer = WordNetLemmatizer()

In [48]:
# Helper function to check if a word contains only Latin script characters
def is_latin_script(word):
    for char in word:
        if not ('LATIN' in unicodedata.name(char, '') or char.isdigit()):
            return False
    return True

def detect_collocations(tokens, min_freq=3):
    """Identify frequent bigrams and join them into single tokens."""
    bigram_measures = BigramAssocMeasures()
    
    # Detect Bigrams
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_finder.apply_freq_filter(min_freq)  # Consider only frequent bigrams
    bigrams = set(['_'.join(bigram) for bigram in bigram_finder.nbest(bigram_measures.pmi, 10)])
    
    return bigrams  # Return only bigrams

def preprocess_text(doc, video_id, ngram_range=(1, 2), tag_weight=1, ngram_weight_factor=1):
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)

    # Remove stopwords before detecting collocations
    tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    # Detect collocations (Only Bigrams)
    collocations = detect_collocations(tokens)
    
    # Merge collocations into single tokens
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and f"{tokens[i]}_{tokens[i+1]}" in collocations:
            merged_tokens.append(f"{tokens[i]}_{tokens[i+1]}")
            i += 2
        else:
            merged_tokens.append(tokens[i])
            i += 1
    
    # POS tagging after merging
    tokens_with_pos = pos_tag(merged_tokens)

    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tokens_with_pos]

    # Generate n-grams
    ngram_tokens = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngram_tokens.extend([' '.join(gram) for gram in ngrams(lemmatized_tokens, n)])

    # Count n-gram frequency
    ngram_frequencies = Counter(ngram_tokens)

    # Filter out verbs, adjectives, and adverbs
    filtered_tokens = [
        word for word, pos in tokens_with_pos
        if pos not in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    ]

    # Assign weight based on n-grams 
    weighted_tokens = []
    for token in filtered_tokens:
        token_weight = 1 + ngram_frequencies.get(token, 0) * ngram_weight_factor
        weighted_tokens.extend([token] * int(token_weight))

    # Include video tags
    if video_id in video_tags:
        tags = video_tags[video_id]
        for tag in tags:
            if tag.isalpha():
                tag_weight_multiplier = tag_weight
                weighted_tokens.extend([tag] * int(tag_weight_multiplier))
    
    return ' '.join(weighted_tokens)

def get_wordnet_pos(treebank_tag):
    """Convert POS tag to WordNet format for lemmatization."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun
    
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
        else:
            video_tags[video_id] = []  # Default to empty list if no tags
    return video_tags

video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

all_documents = []
for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        if len(content.split()) >= 100: 
            processed_text = preprocess_text(content, video_id)  
            all_documents.append(processed_text)

In [49]:
# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary([doc.split() for doc in all_documents])
corpus = [dictionary.doc2bow(doc.split()) for doc in all_documents]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

Coherence Score: -3.703327429208097


In [50]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    print(f"Video {video_id}: {topic_distribution}")

topics = lda_model.print_topics(num_words=20) 
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {topic_words}")


Video --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­: [(0, 4.2391953e-06), (1, 0.99998766), (2, 8.045148e-06)]
Video 0IMWasj76yU_Philippines Army vs Thailand Army: [(0, 0.99914974), (1, 0.000438952), (2, 0.00041132353)]
Video 1kErCqgIVMk_Tour of The House We Built in The Philippines: [(0, 3.468915e-05), (1, 0.9998995), (2, 6.583309e-05)]
Video 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!: [(0, 2.7568594e-05), (1, 5.5834033e-05), (2, 0.9999166)]
Video 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi: [(0, 0.00010877535), (1, 0.99968475), (2, 0.00020643373)]
Video 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United: [(0, 0.0009609475), (1, 0.2630458), (2, 0.73599327)]
Video 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises: [(0, 3.8923743e-05), (1, 0.9998872), (2, 7.3869465e-05)]
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: [(0, 0.00014899168), (1, 0.56293297), (2, 0.43691805)

In [51]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0]  # Highest probability
    print(f"Video {video_id}: Dominant Topic {dominant_topic[0]} with probability {dominant_topic[1]:.4f}")


Video --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­: Dominant Topic 1 with probability 1.0000
Video 0IMWasj76yU_Philippines Army vs Thailand Army: Dominant Topic 0 with probability 0.9991
Video 1kErCqgIVMk_Tour of The House We Built in The Philippines: Dominant Topic 1 with probability 0.9999
Video 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!: Dominant Topic 2 with probability 0.9999
Video 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi: Dominant Topic 1 with probability 0.9997
Video 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United: Dominant Topic 2 with probability 0.7360
Video 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises: Dominant Topic 1 with probability 0.9999
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: Dominant Topic 1 with probability 0.5629
Video 6ftiWoCJ4dM_Filipino Food In The Netherlands  Cafe Nordrick!!: Dominant Topic 1 with probability 0.9988
Video 7j

In [52]:
from collections import defaultdict

# Dictionary to store the count of videos per topic
topic_video_count = defaultdict(int)

# Mapping of video ID to its dominant topic
video_topic_mapping = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get the most dominant topic (highest probability)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0][0]
    
    # Store the mapping
    video_topic_mapping[video_id] = dominant_topic
    
    # Increase count for that topic
    topic_video_count[dominant_topic] += 1

# Print number of videos assigned to each topic
print("\nNumber of Videos per Topic:")
for topic, count in sorted(topic_video_count.items()):
    print(f"Topic {topic}: {count} videos")



Number of Videos per Topic:
Topic 0: 21 videos
Topic 1: 40 videos
Topic 2: 39 videos


In [53]:
topic_to_videos = defaultdict(list)

video_topic_mapping = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping[video_id] = assigned_topics  # Store assigned topics per video

    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        words = [word for word, _ in lda_model.show_topic(topic, topn=10)]  # Get top 10 words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_LDA1[video_id] = "; ".join(topic_words)  # Separate topics with `;`

# Count occurrences of each topic
topic_counts = Counter()

for topics in video_topic_mapping.values():
    for topic in topics:
        topic_counts[topic] += 1

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts.items()):
    print(f"Topic {topic}: {count} videos")

# Print topics assigned per video
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping.items():
    topic_list = ', '.join(map(str, topics)) if topics else "No dominant topic"
    print(f"Video ID: {video_id} → Topics: {topic_list}")

# Print videos per topic
print("\nTop Words Per Topic:")
num_words = 30  

for topic_id in sorted(topic_to_videos.keys()): 
    top_words = lda_model.show_topic(topic_id, num_words)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")



Number of Videos Per Topic:
Topic 0: 23 videos
Topic 1: 51 videos
Topic 2: 46 videos

Topics Assigned Per Video:
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­ → Topics: 1
Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army → Topics: 0
Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines → Topics: 1
Video ID: 2ftG8JuMzz4_Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY! → Topics: 2
Video ID: 2TmagN6RhkI_VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi → Topics: 1
Video ID: 3n1nspLgfd4_Balut!!!!! - Manila, Philippines - Now United → Topics: 1, 2
Video ID: 401ZvQ11euA_Did China Buy the Philippines With False Investment Promises → Topics: 1
Video ID: 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­ → Topics: 1, 2
Video ID: 6ftiWoCJ4dM_Filipino Food In The Netherlands  Cafe Nordrick!! → Topics: 1
Video ID: 7jDrzdX2Ucc_Approve ba ako sa anak at mama ni misterðŸ¤”Mia Sandoval → Topics: 1, 2
Video ID: 7zaMbzqj8AI_Rentin

In [54]:
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Load video tags
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  
        else:
            video_tags[video_id] = []  
    return video_tags

# Load transcripts
video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

video_topics = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        
        if len(content.split()) >= 100:  
            processed_text = preprocess_text(content, video_id)
            
            # Create Dictionary and Corpus
            tokens = processed_text.split()
            corpus = [dictionary.doc2bow(tokens)]
            
            # Train LDA Model
            num_topics = 3  # Limit topics to at most 5 or vocab size
            lda_model_2 = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)
            
            # Compute Coherence Score
            coherence_model = CoherenceModel(model=lda_model_2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
            coherence_score = coherence_model.get_coherence()
            
            # Get topic distribution for the video
            topic_distribution = lda_model_2.get_document_topics(corpus[0], minimum_probability=0)
            assigned_topics = {topic: prob for topic, prob in topic_distribution if prob > 0.5}  # Threshold for multiple topics
            
            # Store video topic info
            video_topics[video_id] = {
                "topics": assigned_topics,
                "top_words": {i: lda_model_2.show_topic(i, 20) for i in range(num_topics)},
                "coherence": coherence_score
            }

# Dictionary to store LDA2 topic words for each video
video_topic_words_lda2 = {}

for video_id, data in video_topics.items():
    topic_words = []
    
    for topic_id, prob in data["topics"].items():
        words = [word for word, _ in data["top_words"][topic_id]]  # Extract top words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_lda2[video_id] = "; ".join(topic_words)  # Separate topics with `;`
    
# Print topics per video
for video_id, data in video_topics.items():
    print(f"\nVideo ID: {video_id}")
    print(f"Coherence Score: {data['coherence']:.4f}")
    print("Assigned Topics:")
    for topic_id, prob in data["topics"].items():
        print(f"  Topic {topic_id} ({prob:.2f} probability): {[word for word, _ in data['top_words'][topic_id]]}")


Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­
Coherence Score: 0.0000
Assigned Topics:
  Topic 2 (1.00 probability): ['beach', 'water', 'hotel', 'jet', 'god', 'sand', 'couple', 'head', 'hour', 'bed', 'night', 'power', 'hat', 'fun', 'pool', 'breakfast', 'yesterday', 'tide', 'view', 'eat']

Video ID: 0IMWasj76yU_Philippines Army vs Thailand Army
Coherence Score: -7.3198
Assigned Topics:
  Topic 0 (1.00 probability): ['sad_salt', 'armed_forces', 'air', 'sea_ground', 'puppy_win', 'sea', 'troopers', 'helicop', 'helicopt', 'force', 'tanks', 'helicopters', 'xtra', 'cities', 'bruno', 'launch', 'yaaaaaassss', 'makati', 'percent', 'attractions']

Video ID: 1kErCqgIVMk_Tour of The House We Built in The Philippines
Coherence Score: -4.6536
Assigned Topics:
  Topic 2 (1.00 probability): ['water', 'house', 'floor', 'door', 'tile', 'glass', 'handmade', 'grill', 'storage', 'pump', 'job', 'hand', 'steel_steel', 'bathroom', 'kubo', 'cut', 'sun', 'stuff', 'tour', 'build']

Vi

In [55]:
import re

def extract_and_match_topic(url, topic_dict):
    match = re.search(r"(?:v=|\/(?:embed|shorts|v)\/|youtu\.be\/)([0-9A-Za-z_-]{11})", str(url))
    video_id = match.group(1) if match else None  # Extract valid 11-char video ID
    
    if video_id:
        for key in topic_dict.keys(): 
            if key.startswith(video_id):
                return topic_dict[key]  

    return None  

# Apply the function to extract Video ID and map topics
topics_per_video_df["LDA1 Topics"] = topics_per_video_df["Link"].apply(lambda url: extract_and_match_topic(url, video_topic_words_LDA1))
topics_per_video_df["LDA2 Topics"] = topics_per_video_df["Link"].apply(lambda url: extract_and_match_topic(url, video_topic_words_lda2))


In [56]:
pd.set_option("display.max_rows", None) 
pd.set_option("display.max_columns", None) 
pd.set_option("display.max_colwidth", 100)  
pd.set_option("display.width", 1000)  

topics_per_video_df 

Unnamed: 0_level_0,Link,LDA1 Topics,LDA2 Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,"beach, water, hotel, island, spot, rain, jet, god, couple, night","beach, water, hotel, jet, god, sand, couple, head, hour, bed, night, power, hat, fun, pool, brea...",,"yesterday, camera, shopping, coffee, intestine, option, main-street : Level 2 (1.00)\ndavao, dav..."
Philippines Army vs Thailand Army,https://www.youtube.com/watch?v=0IMWasj76yU,"baguio, market, city, stuff, strawberry, park, view, enjoy, manila, cream","sad_salt, armed_forces, air, sea_ground, puppy_win, sea, troopers, helicop, helicopt, force, tan...",,[]
Tour of The House We Built in The Philippines,https://www.youtube.com/watch?v=1kErCqgIVMk,"beach, water, hotel, island, spot, rain, jet, god, couple, night","water, house, floor, door, tile, glass, handmade, grill, storage, pump, job, hand, steel_steel, ...",,"yesterday, camera, shopping, coffee, intestine, option, main-street : Level 2 (0.87)\nice, deep,..."
VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi,https://www.youtube.com/watch?v=2TmagN6RhkI,"beach, water, hotel, island, spot, rain, jet, god, couple, night","voice, song, feel, forget, melody, hear, fluent, sit, falsetto, schedule, head, stream, vocalist...",,"ice, deep, pour, storm, breakfast, super-super, happen : Level 2 (0.51)\nadobo, food, insect, at..."
Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!,https://www.youtube.com/watch?v=2ftG8JuMzz4,"eat, chicken, rice, taste, pork, chef_roline, sauce, restaurant, meat, pizza","meat, tamilok, balut, soup, egg, taste, eat, duck, etag, dish, salt, kamaru, vinegar, coming_lis...",,"singer, voice, performance, bell, regine-velasquez, morissette-amon, morissette : Level 2 (1.00)..."
"Balut!!!!! - Manila, Philippines - Now United",https://www.youtube.com/watch?v=3n1nspLgfd4,"beach, water, hotel, island, spot, rain, jet, god, couple, night; eat, chicken, rice, taste, por...","night, tastes, duck, baby, level, fetus, egg, bullet, ate, pilot, vice, wake, engine, book, mang...",,[]
Did China Buy the Philippines With False Investment Promises,https://www.youtube.com/watch?v=401ZvQ11euA,"beach, water, hotel, island, spot, rain, jet, god, couple, night","duterte, investment, dollar, hotel, province, jetski, money, hand, south_sea, episode, jia, infr...",,"ceremony, dress, trade, sponsor, special, chain, pocket : Level 2 (1.00)\ntown, short, character..."
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,"beach, water, hotel, island, spot, rain, jet, god, couple, night; eat, chicken, rice, taste, por...","hotel, mashed_potatoes, peach, cumin, flavor, mango, chicken, trip, pinas, leaf, pool, iinit, ri...",,"swim, arrive, typhoon, rent, nature, coconut, cuisine : Level 2 (0.72)\npeach, italian, choice, ..."
Filipino Food In The Netherlands Cafe Nordrick!!,https://www.youtube.com/watch?v=6ftiWoCJ4dM,"beach, water, hotel, island, spot, rain, jet, god, couple, night","bread, coffee, hague, husband, longsiloc, pandesal, menu, tapsilog, offering, caldereta, adobo, ...",,[]
Approve ba ako sa anak at mama ni misterðŸ¤”Mia Sandoval,https://www.youtube.com/watch?v=7jDrzdX2Ucc,"beach, water, hotel, island, spot, rain, jet, god, couple, night; eat, chicken, rice, taste, por...","john, phone, night, family, proper, bonfire, battery, palanca, son, eye, wind, morning, apply_vi...",,"english, tagalog, accent, proper, earlier, walk-hill, sunday : Level 2 (1.00)\neye, son, traditi..."


In [None]:
topics_per_video_df.to_csv("topics_per_video.csv")
