In [1]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./standard_dataset"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "LDA1 Topics": None,
                "LDA2 Topics": None,
                "BERTopic Topics": None,
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df


Unnamed: 0_level_0,Link,LDA1 Topics,LDA2 Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=MBtWxCM7NYg,,,,[]
TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO - (Tagalog with English Subtitles),https://www.youtube.com/watch?v=O0cIJww_amw,,,,[]
Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=R8-LyHgcl5I,,,,[]
"First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places",https://www.youtube.com/watch?v=Gl2RpA1xS6M,,,,[]
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,,,,[]
...,...,...,...,...,...
STRANGEST Things Found In The Philippines!,https://www.youtube.com/watch?v=vwIIeiOhTJ0,,,,[]
Filipinas Do Not Show Up LATE - Dating A Foreigner,https://www.youtube.com/watch?v=8DYPxsr7kDI,,,,[]
Vocal Coach REACTS to TNT Boys BANG BANG + 5 songs #Yourfacesoundsfamiliar Lucia Sinatra,https://www.youtube.com/watch?v=S4bNEgM483k,,,,[]
After Spending Years With Filipino's My English Sounds...,https://www.youtube.com/watch?v=NdLjKxqzDbA,,,,[]


### HLTA

In [2]:
# Store results as a flat list
import json

nodes_filename = 'standard_test_1.nodes.json'
nodes_filepath = './HLTM/output-jsons/' + nodes_filename

with open(nodes_filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

print(flat_list)

[{'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'public', 'luke'], 'level': 2}, {'id': 'Z120', 'texts': ['boil', 'vegetable', 'waste', 'original', 'largest', 'decide', 'guest'], 'level': 2}, {'id': 'Z116', 'texts': ['sing', 'singing', 'vocal', 'stage', 'tnt-boy', 'note', 'male'], 'level': 2}, {'id': 'Z119', 'texts': ['primate', 'tarsier', 'peace', 'animal', 'legend', 'banana-ketchup', 'band'], 'level': 2}, {'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'pu

In [3]:
from collections import defaultdict

topic_map_filepath = './HLTM/output-jsons/' + 'standard_test_1.topics.json'

with open(topic_map_filepath, 'r') as f:
    topic_mapping_data = json.load(f)

topic_ids_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
# doc id serves as the grouping key
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [4]:
# link the top words to the video instead of just the topic id
topic_to_words = {entry["id"]: (", ".join(entry["texts"]), entry["level"]) for entry in flat_list}


# Step 2: Convert topic_mapping_data into topics_per_vid_mapping
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))

# Step 3: Store transformed data in a new dictionary
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        if t in topic_to_words:
            topic_words, level = topic_to_words[t]
            topic_words_per_vid_mapping[doc].append((topic_words, level, p))
        else:
            topic_words_per_vid_mapping[doc].append((t, None, p))  # Handle case where topic ID is not found

# Step 4: Print the results
for doc, topic_list in topic_words_per_vid_mapping.items():
    topics_str = ", ".join(f"{t} (Level {lvl}): {p:.2f}" if lvl is not None else f"{t}: {p:.2f}" for t, lvl, p in topic_list)
    print(f"Document {doc} - {topics_str}")

Document 34 - yesterday, camera, shopping, coffee, intestine, option, main-street (Level 2): 1.00, beach, explore, weather, road, ride, hotel, car (Level 2): 1.00, photo, session, goodness, hair (Level 2): 0.97, private, wake, entrance, cheap, south, cafe, build (Level 2): 1.00, colt, brooklyn, bgc, month, baby, late, carry (Level 2): 0.98, pay, drive, tourist, chocolate-hill, boat, ticket, real (Level 2): 1.00, chicken-chicken, tasty, singapore, mushroom, dish, honest, mix (Level 2): 1.00, swim, arrive, typhoon, rent, nature, coconut, cuisine (Level 2): 0.98, remind, fried, bite, roasted, bit, wrong, pie (Level 2): 0.98, quick, busy, bed, close, toilet, kitchen, bedroom (Level 2): 1.00, pizza, foot, scared, tricycle, wave, scary, pasta (Level 2): 1.00, random, extra, finally, taxi, fridge, driver, milk (Level 2): 1.00, peach, italian, choice, garlic-rice, laughter, dinner, juicy (Level 2): 1.00, previous, climb, cooky, hmmm, surprise, request, maja (Level 2): 0.54, airport, chill, fli

In [5]:
# append into main dataframe
for doc, topic_list in topic_words_per_vid_mapping.items():
    t = ""
    for topic in topic_list:
       t += topic[0] + " : Level " + str(topic[1]) + " ({:.2f})".format(topic[2]) + '\n'

    topics_per_video_df.iloc[int(doc), 4] = t

topics_per_video_df


Unnamed: 0_level_0,Link,LDA1 Topics,LDA2 Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=MBtWxCM7NYg,,,,"yesterday, camera, shopping, coffee, intestine..."
TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO - (Tagalog with English Subtitles),https://www.youtube.com/watch?v=O0cIJww_amw,,,,[]
Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=R8-LyHgcl5I,,,,"yesterday, camera, shopping, coffee, intestine..."
"First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places",https://www.youtube.com/watch?v=Gl2RpA1xS6M,,,,"ice, deep, pour, storm, breakfast, super-super..."
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,,,,"singer, voice, performance, bell, regine-velas..."
...,...,...,...,...,...
STRANGEST Things Found In The Philippines!,https://www.youtube.com/watch?v=vwIIeiOhTJ0,,,,"davao, davao-city, island-hop, lagoon, siargao..."
Filipinas Do Not Show Up LATE - Dating A Foreigner,https://www.youtube.com/watch?v=8DYPxsr7kDI,,,,"charge, funny, hold, notice, mango, kuala-lump..."
Vocal Coach REACTS to TNT Boys BANG BANG + 5 songs #Yourfacesoundsfamiliar Lucia Sinatra,https://www.youtube.com/watch?v=S4bNEgM483k,,,,"davao, davao-city, island-hop, lagoon, siargao..."
After Spending Years With Filipino's My English Sounds...,https://www.youtube.com/watch?v=NdLjKxqzDbA,,,,"photo, session, goodness, hair : Level 2 (1.00..."


### BERTopic

In [6]:
# Initialize list to store data
experiment_records = []

# Iterate through transcript files
for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, _ = match.groups()  # Extract Video ID
            file_path = os.path.join(dataset_folder, file)

            # Read transcript content
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read().strip()  # Remove extra spaces/newlines

            # Append data to list
            experiment_records.append({
                "Video Id": video_id,
                "Transcript": transcript
            })

# Convert to DataFrame
experiment_df = pd.DataFrame(experiment_records)

# Display the first few rows to verify
experiment_df

Unnamed: 0,Video Id,Transcript
0,MBtWxCM7NYg,this did not go to plan when i was like it's \...
1,O0cIJww_amw,"I once talked to a filipina. Then she said, wh..."
2,R8-LyHgcl5I,hi my name is josie and this is \nboracay isla...
3,Gl2RpA1xS6M,good morning everyone we're Nicole and Mico \n...
4,4OiOwDh5lKE,today is the day we're going to the philippine...
...,...,...
95,vwIIeiOhTJ0,"From dogs riding motorcycles, to inmates danci..."
96,8DYPxsr7kDI,[Music]\ntoday's video is going to primarily b...
97,S4bNEgM483k,Hello and welcome back to my brand new channel...
98,NdLjKxqzDbA,welcome welcome back to Pinoy Planet I'm \nexc...


In [7]:
# initial pre-processing (minimal)

# Define a minimal preprocessing function
def minimal_preprocessing(text):
    # Remove timestamps and other non-verbal elements (like "(music)")
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in square brackets (e.g., music)
    text = re.sub(r'\(.*?\)', '', text)  # Remove content in parentheses (e.g., [music], (child speaking))
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = re.sub(r'\s+', ' ', text)    # Replace multiple spaces with one
    text = text.strip()                 # Remove leading and trailing whitespace
    return text

# Apply the preprocessing to the 'Transcript' column
experiment_df['cleaned_transcript'] = experiment_df['Transcript'].apply(minimal_preprocessing)

# Display the cleaned data
experiment_df[['Transcript', 'cleaned_transcript']]

Unnamed: 0,Transcript,cleaned_transcript
0,this did not go to plan when i was like it's \...,this did not go to plan when i was like it's g...
1,"I once talked to a filipina. Then she said, wh...","I once talked to a filipina. Then she said, wh..."
2,hi my name is josie and this is \nboracay isla...,hi my name is josie and this is boracay island...
3,good morning everyone we're Nicole and Mico \n...,good morning everyone we're Nicole and Mico we...
4,today is the day we're going to the philippine...,today is the day we're going to the philippine...
...,...,...
95,"From dogs riding motorcycles, to inmates danci...","From dogs riding motorcycles, to inmates danci..."
96,[Music]\ntoday's video is going to primarily b...,today's video is going to primarily be towards...
97,Hello and welcome back to my brand new channel...,Hello and welcome back to my brand new channel...
98,welcome welcome back to Pinoy Planet I'm \nexc...,welcome welcome back to Pinoy Planet I'm excit...


In [8]:
import torch
import nltk
import swifter
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use Apple's Metal (MPS) for acceleration if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [9]:
def encode_sentences(sentences, batch_size=16):
    """Encodes sentences into vector representations using mBERT."""
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**encoded_inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu()  # Move to CPU for further processing
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)


In [10]:
def compute_cosine_similarities(embeddings):
    """Computes cosine similarity efficiently in batch."""
    return cosine_similarity(embeddings[:-1], embeddings[1:]).diagonal()

In [11]:
def segment_transcript(transcript, threshold=0.7, min_length=30):
    """Segments transcript into meaningful chunks based on sentence similarity."""
    sentences = nltk.sent_tokenize(transcript)
    sentences = [s.strip() for s in sentences if s.strip()]

    if len(sentences) < 2:
        return [transcript]  # Return the whole text if it's too short

    # Encode sentences
    sentence_embeddings = encode_sentences(sentences)

    # Compute cosine similarity between consecutive sentences
    similarities = compute_cosine_similarities(sentence_embeddings)

    # Identify segment boundaries where similarity drops below threshold
    boundaries = [0] + [i + 1 for i, sim in enumerate(similarities) if sim < threshold] + [len(sentences)]

    # Create segments
    segments = [" ".join(sentences[boundaries[i]:boundaries[i + 1]]) for i in range(len(boundaries) - 1)]

    # Merge small segments
    processed_segments, temp_segment = [], ""
    for segment in segments:
        if len(segment) + len(temp_segment) < min_length:
            temp_segment += " " + segment
        else:
            if temp_segment:
                processed_segments.append(temp_segment.strip())
            temp_segment = segment
    if temp_segment:
        processed_segments.append(temp_segment.strip())

    # Ensure at least two segments
    if len(processed_segments) == 1:
        mid = len(sentences) // 2
        processed_segments = [" ".join(sentences[:mid]), " ".join(sentences[mid:])]

    return processed_segments


In [12]:
# Apply segmentation using parallel processing
experiment_df['Segments'] = experiment_df['cleaned_transcript'].swifter.apply(segment_transcript)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
# Convert to list of dictionaries
segmented_transcripts = [
    {"Video Id": row["Video Id"], "Segment #": i + 1, "Segment": segment}
    for _, row in experiment_df.iterrows()
    for i, segment in enumerate(row["Segments"])
]

In [14]:
# Convert to DataFrame
segmented_df = pd.DataFrame(segmented_transcripts)

# Display segmented transcripts
print(segmented_df)

         Video Id  Segment #  \
0     MBtWxCM7NYg          1   
1     MBtWxCM7NYg          2   
2     O0cIJww_amw          1   
3     O0cIJww_amw          2   
4     O0cIJww_amw          3   
...           ...        ...   
6696  REvaHqT3uro         24   
6697  REvaHqT3uro         25   
6698  REvaHqT3uro         26   
6699  REvaHqT3uro         27   
6700  REvaHqT3uro         28   

                                                Segment  
0     this did not go to plan when i was like it's g...  
1     in australia so pretty expensive not the cheap...  
2                          I once talked to a filipina.  
3     Then she said, why are you like that? Then I a...  
4     She said, because of your accent. Why is your ...  
...                                                 ...  
6696  liters per person and basically put so we almo...  
6697                                                Oh!  
6698  it looks like they're building a brand new Spa...  
6699                               

In [15]:
# Display segmented transcripts in a structured way
for video_id, group in segmented_df.groupby("Video Id"):
    print(f"\n==== Transcript for Video Id: {video_id} ====")
    for index, row in group.iterrows():
        print(f"\nSegment {row['Segment #']}:\n{row['Segment']}")
    print("\n" + "=" * 50)  # Separator for clarity
    
    
# txt


==== Transcript for Video Id: --8n6A8Q6M0 ====

Segment 1:
Oh, my God.

Segment 2:
It is just the perfect recipe for a good day.

Segment 3:
Want some beer? Sure.

Segment 4:
Oh, it's big bottle. Oh my goodness.

Segment 5:
Three Tarzan?

Segment 6:
Good morning from Bohol here in the Philippines.

Segment 7:
Everybody guys, this look how gorgeous this place is.

Segment 8:
Crystal clear water.

Segment 9:
It's just honestly such a vibe, perfect temperature to be spending the day by the beach. And that is exactly what we're gonna do today because I'm staying about meters up that way at a $ per night hotel on there for the next couple of nights.

Segment 10:
And I arrived in the, uh, last night.

Segment 11:
Uh, it was raining.

Segment 12:
It was kind of a little bit miserable. So I was like, you know what?

Segment 13:
We'll wait until the weather is good and boy, or boy did the weather turn out good today.

Segment 14:
This water guys, if you know me, I'm Australian.

Segment 15:
I 

In [16]:
# BERTopic on Segmented Transcripts

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load multilingual embedding model
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Initialize BERTopic with the multilingual embedding model
topic_model = BERTopic(embedding_model=embedding_model, language="multilingual")

# Extract segments and their corresponding Video Ids
segments = []
video_ids = []

for idx, row in segmented_df.iterrows():
    segments.append(row['Segment'])
    video_ids.append(row['Video Id'])

# Fit the BERTopic model
topics, _ = topic_model.fit_transform(segments)

# Create a DataFrame to track topics with Video Ids and Segment numbers
topic_data = {
    "Video Id": video_ids,
    "Segment": segments,
    "Topic": topics
}

In [17]:
# INITIAL

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2053,-1_it_the_we_you,"[it, the, we, you, and, to, of, there, all, like]","[So, guys I'm walking down the road right now ..."
1,0,216,0_philippines_we_to_you,"[philippines, we, to, you, the, in, and, so, o...",[we're the lockwoods and we've been traveling ...
2,1,152,1_her_she_voice_singer,"[her, she, voice, singer, very, singing, their...",[..yes this one it's very distracting the cost...
3,2,106,2_yeah_yes_oh_maybe,"[yeah, yes, oh, maybe, really, correct, yep, r...","[- Yeah, yeah, yeah., Yeah. Oh, yeah., Oh, yea..."
4,3,85,3_food_filipino_so_it,"[food, filipino, so, it, of, is, here, this, a...",[they have some type of functions you know oh ...
...,...,...,...,...,...
153,152,11,152_guys_all_ends_right,"[guys, all, ends, right, hi, pool, eh, well, o...","[All right guys., All right guys., All right g..."
154,153,11,153_large_looks_tank_tops,"[large, looks, tank, tops, bummer, sizes, gray...","[That looks good., Oh, this one looks good., O..."
155,154,10,154_leave_retreat_disturbing_wildlife,"[leave, retreat, disturbing, wildlife, away, p...",[I left it for you. I'll leave you alone. You ...
156,155,10,155_pay_cash_him_hiiiiii,"[pay, cash, him, hiiiiii, premier, payment, up...","[Ah, okay, I'll pay you now then., - You can p..."


In [18]:
# Create a DataFrame to organize the results
topic_df = pd.DataFrame(topic_data)

topic_df

Unnamed: 0,Video Id,Segment,Topic
0,MBtWxCM7NYg,this did not go to plan when i was like it's g...,97
1,MBtWxCM7NYg,in australia so pretty expensive not the cheap...,84
2,O0cIJww_amw,I once talked to a filipina.,-1
3,O0cIJww_amw,"Then she said, why are you like that? Then I a...",1
4,O0cIJww_amw,"She said, because of your accent. Why is your ...",103
...,...,...,...
6696,REvaHqT3uro,liters per person and basically put so we almo...,-1
6697,REvaHqT3uro,Oh!,31
6698,REvaHqT3uro,it looks like they're building a brand new Spa...,3
6699,REvaHqT3uro,Hello El Nido.,-1


In [19]:
# Display topics along with their corresponding Video Ids and Segment Numbers
for topic in set(topics):
    print(f"\n==== Topic {topic} ====")
    topic_segments = topic_df[topic_df['Topic'] == topic]
    for _, row in topic_segments.iterrows():
        print(f"Video Id: {row['Video Id']}, Segment: {row['Segment']}")
    print("=" * 50)


==== Topic 0 ====
Video Id: O0cIJww_amw, Segment: I'm the Filipino here, but your accent is more beautiful than mine.
Video Id: O0cIJww_amw, Segment: The first strategy: I looked for filipino natives Why did I do that?
Video Id: O0cIJww_amw, Segment: Google said, the first important strategy is to get native filipinos If you want filipino language, look for native filipinos.
Video Id: O0cIJww_amw, Segment: So that's what I did Since maybe I was naive before and didn't have knowledge of the world "The world of learning language" So I did what they told me I looked for native filipinos on Facebook as well as on Messenger Because there are many filipinos on Facebook There are things I didn't like in that strategy Like, What will I say?
Video Id: O0cIJww_amw, Segment: There are Filipinos who were impolite to me.
Video Id: R8-LyHgcl5I, Segment: hi my name is josie and this is boracay island in the philippines my name's josie welcome back to my channel or welcome if you're new here so today

In [21]:
# UPDATE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from bertopic.representation import PartOfSpeech

import requests

# Custom stopwords list
custom_stopwords = ['like', 'yeah', 'yes', 'let', 'okay', 'gonna', 'ok', 'bye', 'just', 'got', 'right', 'mmm', 'oh', 'ah', 'gosh']

# Fetch stopwords from GitHub
github_stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
github_stopwords = set(requests.get(github_stopwords_url).text.splitlines())

# Append GitHub stopwords to the custom stopwords set
custom_stopwords = set(custom_stopwords)  # Convert to a set
custom_stopwords.update(github_stopwords)  # ✅ This now works because custom_stopwords is a set

# Combine predefined stopwords (ENGLISH_STOP_WORDS) with custom stopwords
combined_stopwords = list(ENGLISH_STOP_WORDS.union(set(custom_stopwords)))

# Create a CountVectorizer with combined stopwords as a list
vectorizer_model = CountVectorizer(stop_words=combined_stopwords)


In [22]:
representation_model = PartOfSpeech("en_core_web_sm")

In [23]:
topic_model.update_topics(segments, vectorizer_model=vectorizer_model, representation_model=representation_model)

In [24]:
# UPDATED

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2053,-1_minutes_time_guys_house,"[minutes, time, guys, house, lechon, people, s...","[So, guys I'm walking down the road right now ..."
1,0,216,0_philippines_island_filipino_super,"[philippines, island, filipino, super, time, c...",[we're the lockwoods and we've been traveling ...
2,1,152,1_voice_singer_media_singing,"[voice, singer, media, singing, actress, runs,...",[..yes this one it's very distracting the cost...
3,2,106,2_correct_sense_authentic_joke,"[correct, sense, authentic, joke, change, frie...","[- Yeah, yeah, yeah., Yeah. Oh, yeah., Oh, yea..."
4,3,85,3_food_filipino_pork_chili,"[food, filipino, pork, chili, sisig, lot, swee...",[they have some type of functions you know oh ...
...,...,...,...,...,...
153,152,11,152_guys_pool__,"[guys, pool, , , , , , , , ]","[All right guys., All right guys., All right g..."
154,153,11,153_tank_bummer_tops_gray,"[tank, bummer, tops, gray, sizes, medium, opti...","[That looks good., Oh, this one looks good., O..."
155,154,10,154_retreat_wildlife_port_streets,"[retreat, wildlife, port, streets, resort, mom...",[I left it for you. I'll leave you alone. You ...
156,155,10,155_pay_cash_premier_card,"[pay, cash, premier, card, friendly, price, mo...","[Ah, okay, I'll pay you now then., - You can p..."


In [24]:
# Visualize the topic distribution
topic_model.visualize_barchart(top_n_topics=160)

In [25]:
# Group by Video Id and get the unique topics for each Video Id
video_topics = topic_df.groupby('Video Id')['Topic'].unique()

# Display the topics for each Video Id
for video_id, topics in video_topics.items():
    print(f"Transcript Video {video_id}:")
    print(f"Topics Present: {', '.join(map(str, topics))}\n")

Transcript Video --8n6A8Q6M0:
Topics Present: 109, 123, 16, 52, -1, 0, 113, 114, 43, 34, 8, 97, 64, 51, 21, 120, 137, 135, 4, 66, 85, 37, 92, 45, 30, 131, 76, 84, 17, 83, 2, 23, 50, 122, 60, 19, 102, 59, 61, 24, 78, 55, 152, 1, 136, 33, 98, 80, 58, 142, 39, 143, 46, 10, 144, 44, 32, 88, 126, 87, 90, 127, 15, 154, 35, 73, 147, 49, 134, 129, 119, 141, 145, 117, 18, 14, 91, 74, 138, 70, 146, 106, 82, 118, 11, 107, 38, 108, 75, 81, 110, 28, 5, 9, 86, 31, 41, 62, 116, 150, 104, 7, 63, 99, 128, 105

Transcript Video 0IMWasj76yU:
Topics Present: 0

Transcript Video 1kErCqgIVMk:
Topics Present: -1

Transcript Video 2TmagN6RhkI:
Topics Present: 1

Transcript Video 2ftG8JuMzz4:
Topics Present: 3, 0, 139, 45, -1, 99, 89, 15, 2, 7, 27, 94, 13, 84, 111, 6, 149, 39

Transcript Video 3n1nspLgfd4:
Topics Present: 3

Transcript Video 401ZvQ11euA:
Topics Present: 12, -1, 36, 80, 76, 125, 0, 156, 37, 50, 4, 51, 147, 140, 5, 139

Transcript Video 4OiOwDh5lKE:
Topics Present: 3

Transcript Video 6ftiWoCJ4d

In [26]:
# Retrieve topic names from BERTopic
topic_info = topic_model.get_topic_info()
topic_mapping = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

# Count topic occurrences per Video Id
topic_counts = topic_df.groupby(["Video Id", "Topic"]).size().reset_index(name="Count")

# Calculate percentage per Video Id
topic_counts["Percentage"] = topic_counts.groupby("Video Id")["Count"].transform(lambda x: (x / x.sum()) * 100)

# Map topic names
topic_counts["Topic Name"] = topic_counts["Topic"].map(topic_mapping)

# Aggregate topics into a single row per Video Id with newline separation
video_topics = topic_counts.groupby("Video Id").apply(
    lambda group: "\n".join(
        [f"{row['Topic Name']} ({row['Percentage']:.2f}%)" for _, row in group.iterrows()]
    )
).reset_index(name="List of Topics with %")

# Display the final DataFrame
print(video_topics)

       Video Id                              List of Topics with %
0   --8n6A8Q6M0  -1_minutes_time_guys_house (30.01%)\n0_philipp...
1   0IMWasj76yU      0_philippines_island_filipino_super (100.00%)
2   1kErCqgIVMk               -1_minutes_time_guys_house (100.00%)
3   2TmagN6RhkI             1_voice_singer_media_singing (100.00%)
4   2ftG8JuMzz4  -1_minutes_time_guys_house (30.14%)\n0_philipp...
..          ...                                                ...
95  y9iCFcSaD58  -1_minutes_time_guys_house (38.21%)\n0_philipp...
96  y9xOKcZD0xE  -1_minutes_time_guys_house (44.00%)\n0_philipp...
97  yR6a9-ufsZc                89_soup_broth_bun_instant (100.00%)
98  zTcCSyucYqI               -1_minutes_time_guys_house (100.00%)
99  zwOJjQuL4i4  -1_minutes_time_guys_house (14.29%)\n1_voice_s...

[100 rows x 2 columns]


  video_topics = topic_counts.groupby("Video Id").apply(


In [27]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Prepare the documents (segments) for gensim coherence model
documents = [segment.split() for segment in segments]  # Assuming `segments` is your list of sentences

# Create the dictionary from the segments
dictionary = Dictionary(documents)

# Extract the topics and top words
topics = topic_model.get_topics()  # Get topics and their words

# Prepare a list of coherence scores for each topic
topic_coherence_scores = []

for topic, words in topics.items():
    # Convert words to token IDs (ignore words not in dictionary)
    top_words_ids = [dictionary.token2id[word] for word, _ in words if word in dictionary.token2id]
    
    if top_words_ids:
        # Compute the coherence for the current topic
        coherence_model = CoherenceModel(
            topics=[top_words_ids],  # List of token IDs for this topic
            texts=documents,         # Segments as tokenized documents
            dictionary=dictionary,   # Dictionary for token mapping
            coherence='c_v'          # You can change this to other methods like 'u_mass', 'c_uci', or 'c_npmi'
        )
        
        # Append the coherence score for the current topic
        topic_coherence_scores.append(coherence_model.get_coherence())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [28]:
# Print coherence scores per topic
for idx, score in enumerate(topic_coherence_scores):
    print(f"Coherence Score for Topic {idx + 1}: {score}")

Coherence Score for Topic 1: 0.27385364826818354
Coherence Score for Topic 2: 0.30252207122579805
Coherence Score for Topic 3: 0.32149341130734466
Coherence Score for Topic 4: 0.403843741295689
Coherence Score for Topic 5: 0.35391818002587927
Coherence Score for Topic 6: 0.4440252326017638
Coherence Score for Topic 7: 0.24150241739411116
Coherence Score for Topic 8: 0.5029134533084241
Coherence Score for Topic 9: 0.3959964662861908
Coherence Score for Topic 10: 0.31145266962150375
Coherence Score for Topic 11: 0.475907438202612
Coherence Score for Topic 12: 1.0
Coherence Score for Topic 13: 0.45419410428167695
Coherence Score for Topic 14: 0.3991319460811908
Coherence Score for Topic 15: 0.34363453391108956
Coherence Score for Topic 16: 0.2722986457913402
Coherence Score for Topic 17: 0.21196798302974268
Coherence Score for Topic 18: 0.2830750550371901
Coherence Score for Topic 19: 1.0
Coherence Score for Topic 20: 0.4562597638865066
Coherence Score for Topic 21: 0.36511383984242274
Co

In [27]:
topic_info = topic_model.get_topic_info()

topic_info.to_csv("WEEK6_BERTopic_info.csv", index=False)

In [28]:
# Extract Video Id from the 'Link' column in topics_per_video_df
topics_per_video_df["Video Id"] = topics_per_video_df["Link"].str.extract(r"v=([a-zA-Z0-9_-]+)")

# Create a mapping of Video Id to its BERTopic topics
video_topic_mapping = dict(zip(video_topics["Video Id"], video_topics["List of Topics with %"]))

# Update the 'BERTopic Topics' column based on matching Video Ids
topics_per_video_df["BERTopic Topics"] = topics_per_video_df["Video Id"].map(video_topic_mapping)

# Drop the temporary 'Video Id' column (keeping all other columns intact)
topics_per_video_df.drop(columns=["Video Id"], inplace=True)


In [29]:
# Display updated DataFrame
topics_per_video_df

Unnamed: 0_level_0,Link,LDA1 Topics,LDA2 Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=MBtWxCM7NYg,,,84_cheap_games_cheaper_oreo (50.00%)\n97_sushi...,"yesterday, camera, shopping, coffee, intestine..."
TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO - (Tagalog with English Subtitles),https://www.youtube.com/watch?v=O0cIJww_amw,,,-1_minutes_time_guys_house (24.39%)\n0_philipp...,[]
Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=R8-LyHgcl5I,,,0_philippines_island_filipino_super (100.00%),"yesterday, camera, shopping, coffee, intestine..."
"First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places",https://www.youtube.com/watch?v=Gl2RpA1xS6M,,,0_philippines_island_filipino_super (50.00%)\n...,"ice, deep, pour, storm, breakfast, super-super..."
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,,,3_food_filipino_pork_chili (100.00%),"singer, voice, performance, bell, regine-velas..."
...,...,...,...,...,...
STRANGEST Things Found In The Philippines!,https://www.youtube.com/watch?v=vwIIeiOhTJ0,,,-1_minutes_time_guys_house (45.45%)\n0_philipp...,"davao, davao-city, island-hop, lagoon, siargao..."
Filipinas Do Not Show Up LATE - Dating A Foreigner,https://www.youtube.com/watch?v=8DYPxsr7kDI,,,-1_minutes_time_guys_house (50.00%)\n39_video_...,"charge, funny, hold, notice, mango, kuala-lump..."
Vocal Coach REACTS to TNT Boys BANG BANG + 5 songs #Yourfacesoundsfamiliar Lucia Sinatra,https://www.youtube.com/watch?v=S4bNEgM483k,,,-1_minutes_time_guys_house (11.11%)\n1_voice_s...,"davao, davao-city, island-hop, lagoon, siargao..."
After Spending Years With Filipino's My English Sounds...,https://www.youtube.com/watch?v=NdLjKxqzDbA,,,0_philippines_island_filipino_super (100.00%),"photo, session, goodness, hair : Level 2 (1.00..."


In [30]:
topics_per_video_df.to_csv("topics_per_video.csv")

### LDA

In [31]:
import os
import random
import requests
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import FastText,  CoherenceModel
import numpy as np
import spacy
import string
from wordcloud import WordCloud
import unicodedata
import matplotlib.pyplot as plt
from nltk.util import ngrams
from collections import defaultdict, Counter
import itertools

In [32]:
# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                    'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                    'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow',
                    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
                    'journalist', 'headline', 'reporter', 'current events', 'special report', 
                    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
                    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
                    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
                    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
                    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
                    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
                    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
                    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
                    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
                    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
                    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
                    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
                    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
                    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
                    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
                    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
                    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
                    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
                    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
                    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
                    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommend', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
                    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo', 'philippines', 'philippine', 'british', 'filipino', 'video', 
                    'http', 'korea', 'korean', 'youtube', 'google', 'united', 'america', 'american', 'kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 
                    'seventeen', 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
stop_words.update(custom_stop_words, github_stopwords)

lemmatizer = WordNetLemmatizer()

In [33]:
# Helper function to check if a word contains only Latin script characters
def is_latin_script(word):
    for char in word:
        if not ('LATIN' in unicodedata.name(char, '') or char.isdigit()):
            return False
    return True

def preprocess_text(doc, video_id, ngram_range=(1, 2), tag_weight=1, ngram_weight_factor=1):
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)
    tokens_with_pos = pos_tag(tokens)

    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tokens_with_pos]

    # Generate n-grams
    ngram_tokens = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngram_tokens.extend([' '.join(gram) for gram in ngrams(lemmatized_tokens, n)])

    # Count n-gram frequency
    ngram_frequencies = Counter(ngram_tokens)

    # Filter stopwords and non-useful words
    filtered_tokens = []
    for word, pos in tokens_with_pos:
        if not word.isalpha() or word in stop_words:
            continue
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            continue  # Remove verbs and adjectives
        filtered_tokens.append(word)

    # Assign weight based on n-grams 
    weighted_tokens = []
    for token in filtered_tokens:
        token_weight = 1 + ngram_frequencies.get(token, 0) * ngram_weight_factor
        weighted_tokens.extend([token] * int(token_weight))

    # Include video tags
    if video_id in video_tags:
        tags = video_tags[video_id]
        for tag in tags:
            if tag.isalpha():
                tag_weight_multiplier = tag_weight
                weighted_tokens.extend([tag] * int(tag_weight_multiplier))
    return ' '.join(weighted_tokens)

def get_wordnet_pos(treebank_tag):
    """Convert POS tag to WordNet format for lemmatization."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun
    
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
        else:
            video_tags[video_id] = []  # Default to empty list if no tags
    return video_tags

video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

all_documents = []
for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        if len(content.split()) >= 100: 
            processed_text = preprocess_text(content, video_id)  
            all_documents.append(processed_text)

In [34]:
# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary([doc.split() for doc in all_documents])
corpus = [dictionary.doc2bow(doc.split()) for doc in all_documents]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)

# Compute coherence score
coherence_model = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score}")

Coherence Score: -4.817815508960809


In [35]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    print(f"Video {video_id}: {topic_distribution}")

topics = lda_model.print_topics(num_words=20) 
for topic_id, topic_words in topics:
    print(f"Topic {topic_id}: {topic_words}")


Video MBtWxCM7NYg_Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­: [(0, 0.07273847), (1, 0.9272083), (2, 5.3206466e-05)]
Video O0cIJww_amw_TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO -  (Tagalog with English Subtitles): [(0, 0.00010113662), (1, 3.9200815e-05), (2, 0.9998597)]
Video R8-LyHgcl5I_Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­: [(0, 0.99994284), (1, 1.6544383e-05), (2, 4.0604347e-05)]
Video Gl2RpA1xS6M_First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places: [(0, 0.9385495), (1, 0.0614065), (2, 4.4030952e-05)]
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: [(0, 0.9997113), (1, 8.358531e-05), (2, 0.00020514076)]
Video bKCr62Dk7Us_Is This The BEST Filipino Restaurant in Malaysia! ðŸ‡µðŸ‡­ (Kuala Lumpur Mukbang ðŸ‡²ðŸ‡¾): [(0, 0.99996525), (1, 1.00783445e-05), (2, 2.4734956e-05)]
Video j7u0a_WH7hQ_Foreigners biggest SURPRISE in the P

In [36]:
for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0]  # Highest probability
    print(f"Video {video_id}: Dominant Topic {dominant_topic[0]} with probability {dominant_topic[1]:.4f}")


Video MBtWxCM7NYg_Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­: Dominant Topic 1 with probability 0.9272
Video O0cIJww_amw_TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO -  (Tagalog with English Subtitles): Dominant Topic 2 with probability 0.9999
Video R8-LyHgcl5I_Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­: Dominant Topic 0 with probability 0.9999
Video Gl2RpA1xS6M_First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places: Dominant Topic 0 with probability 0.9385
Video 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­: Dominant Topic 0 with probability 0.9997
Video bKCr62Dk7Us_Is This The BEST Filipino Restaurant in Malaysia! ðŸ‡µðŸ‡­ (Kuala Lumpur Mukbang ðŸ‡²ðŸ‡¾): Dominant Topic 0 with probability 1.0000
Video j7u0a_WH7hQ_Foreigners biggest SURPRISE in the Philippines (random Street interviews): Dominant Topic 2 with probability 0.9229
Video --8n6

In [37]:
from collections import defaultdict

# Dictionary to store the count of videos per topic
topic_video_count = defaultdict(int)

# Mapping of video ID to its dominant topic
video_topic_mapping = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get the most dominant topic (highest probability)
    dominant_topic = sorted(topic_distribution, key=lambda x: x[1], reverse=True)[0][0]
    
    # Store the mapping
    video_topic_mapping[video_id] = dominant_topic
    
    # Increase count for that topic
    topic_video_count[dominant_topic] += 1

# Print number of videos assigned to each topic
print("\nNumber of Videos per Topic:")
for topic, count in sorted(topic_video_count.items()):
    print(f"Topic {topic}: {count} videos")



Number of Videos per Topic:
Topic 0: 40 videos
Topic 1: 15 videos
Topic 2: 45 videos


In [38]:
topic_to_videos = defaultdict(list)

video_topic_mapping = {}

# probability threshold for assigning multiple topics
prob_threshold = 0.2

# Dictionary to store topic words for each video
video_topic_words_LDA1 = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]  # Get video ID
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)

    # Get topics where probability is above threshold
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]
    video_topic_mapping[video_id] = assigned_topics  # Store assigned topics per video

    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)

    # Get the representative words for each assigned topic
    topic_words = []
    for topic in assigned_topics:
        words = [word for word, _ in lda_model.show_topic(topic, topn=10)]  # Get top 10 words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_LDA1[video_id] = "; ".join(topic_words)  # Separate topics with `;`

# Count occurrences of each topic
topic_counts = Counter()

for topics in video_topic_mapping.values():
    for topic in topics:
        topic_counts[topic] += 1

# Print the number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts.items()):
    print(f"Topic {topic}: {count} videos")

# Print topics assigned per video
print("\nTopics Assigned Per Video:")
for video_id, topics in video_topic_mapping.items():
    topic_list = ', '.join(map(str, topics)) if topics else "No dominant topic"
    print(f"Video ID: {video_id} → Topics: {topic_list}")

# Print videos per topic
print("\nTop Words Per Topic:")
num_words = 30  

for topic_id in sorted(topic_to_videos.keys()): 
    top_words = lda_model.show_topic(topic_id, num_words)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")



Number of Videos Per Topic:
Topic 0: 45 videos
Topic 1: 16 videos
Topic 2: 48 videos

Topics Assigned Per Video:
Video ID: MBtWxCM7NYg_Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­ → Topics: 1
Video ID: O0cIJww_amw_TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO -  (Tagalog with English Subtitles) → Topics: 2
Video ID: R8-LyHgcl5I_Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­ → Topics: 0
Video ID: Gl2RpA1xS6M_First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places → Topics: 0
Video ID: 4OiOwDh5lKE_Chef Kuya Joey Is Flying To The Philippines!  ðŸ‡µðŸ‡­ → Topics: 0
Video ID: bKCr62Dk7Us_Is This The BEST Filipino Restaurant in Malaysia! ðŸ‡µðŸ‡­ (Kuala Lumpur Mukbang ðŸ‡²ðŸ‡¾) → Topics: 0
Video ID: j7u0a_WH7hQ_Foreigners biggest SURPRISE in the Philippines (random Street interviews) → Topics: 2
Video ID: --8n6A8Q6M0_$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­ → T

In [39]:
# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Load video tags
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  
        else:
            video_tags[video_id] = []  
    return video_tags

# Load transcripts
video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)

video_topics = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        
        if len(content.split()) >= 100:  
            processed_text = preprocess_text(content, video_id)
            
            # Create Dictionary and Corpus
            tokens = processed_text.split()
            corpus = [dictionary.doc2bow(tokens)]
            
            # Train LDA Model
            num_topics = 3  # Limit topics to at most 5 or vocab size
            lda_model_2 = LdaModel(corpus, num_topics=3, id2word=dictionary, alpha='auto', eta='symmetric', passes=100)
            
            # Compute Coherence Score
            coherence_model = CoherenceModel(model=lda_model_2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
            coherence_score = coherence_model.get_coherence()
            
            # Get topic distribution for the video
            topic_distribution = lda_model_2.get_document_topics(corpus[0], minimum_probability=0)
            assigned_topics = {topic: prob for topic, prob in topic_distribution if prob > 0.5}  # Threshold for multiple topics
            
            # Store video topic info
            video_topics[video_id] = {
                "topics": assigned_topics,
                "top_words": {i: lda_model_2.show_topic(i, 20) for i in range(num_topics)},
                "coherence": coherence_score
            }

# Dictionary to store LDA2 topic words for each video
video_topic_words_lda2 = {}

for video_id, data in video_topics.items():
    topic_words = []
    
    for topic_id, prob in data["topics"].items():
        words = [word for word, _ in data["top_words"][topic_id]]  # Extract top words
        topic_words.append(", ".join(words))  # Convert list to string

    # Store the topic words as a string
    video_topic_words_lda2[video_id] = "; ".join(topic_words)  # Separate topics with `;`
    
# Print topics per video
for video_id, data in video_topics.items():
    print(f"\nVideo ID: {video_id}")
    print(f"Coherence Score: {data['coherence']:.4f}")
    print("Assigned Topics:")
    for topic_id, prob in data["topics"].items():
        print(f"  Topic {topic_id} ({prob:.2f} probability): {[word for word, _ in data['top_words'][topic_id]]}")


Video ID: MBtWxCM7NYg_Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­
Coherence Score: -4.2174
Assigned Topics:
  Topic 1 (1.00 probability): ['sushi', 'mall', 'phone', 'fun', 'coffee', 'peso', 'wheel', 'guess', 'oreo', 'shop', 'shopping', 'chicken', 'walk', 'rain', 'storm', 'street', 'ferris', 'water', 'pesos', 'game']

Video ID: O0cIJww_amw_TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO -  (Tagalog with English Subtitles)
Coherence Score: -2.4722
Assigned Topics:
  Topic 2 (1.00 probability): ['strategy', 'language', 'tagalog', 'talk', 'accent', 'book', 'filipina', 'heart', 'series', 'strategies', 'kenyan', 'challenge', 'highschool', 'idea', 'swahili', 'languages', 'natives', 'ate', 'pranks', 'nolonger']

Video ID: R8-LyHgcl5I_Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­
Coherence Score: -6.2049
Assigned Topics:
  Topic 2 (1.00 probability): ['island', 'water', 'boracay', 'ocean', 'beach', 'boat', 'birthday', '

In [40]:
import re

def extract_video_id(url):
    """Extracts the YouTube video ID from various URL formats."""
    match = re.search(
        r"(?:v=|\/(?:embed|shorts|v)\/|youtu\.be\/)([0-9A-Za-z_-]{11})", url
    )
    return match.group(1) if match else None

# Apply the function to extract the Video ID
topics_per_video_df["Video ID"] = topics_per_video_df["Link"].apply(extract_video_id)

# Apply mapping
topics_per_video_df["LDA1 Topics"] = topics_per_video_df["Video ID"].map(video_topic_words_LDA1)
topics_per_video_df["LDA2 Topics"] = topics_per_video_df["Video ID"].map(video_topic_words_lda2)

In [41]:
pd.set_option("display.max_rows", None) 
pd.set_option("display.max_columns", None) 
pd.set_option("display.max_colwidth", 100)  
pd.set_option("display.width", 1000)  

topics_per_video_df 

Unnamed: 0_level_0,Link,LDA1 Topics,LDA2 Topics,BERTopic Topics,HLTA Topics,Video ID
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Is this Asia's BIGGEST mall in Manila THE PHILIPPINES! (didnâ€™t go to plan) ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=MBtWxCM7NYg,,,84_cheap_games_cheaper_oreo (50.00%)\n97_sushi_mall_shopping_centre (50.00%),"yesterday, camera, shopping, coffee, intestine, option, main-street : Level 2 (1.00)\ndavao, dav...",MBtWxCM7NYg
TOP 6 METHODS I USED TO ACQUIRE TAGALOGFILIPINO - (Tagalog with English Subtitles),https://www.youtube.com/watch?v=O0cIJww_amw,,,-1_minutes_time_guys_house (24.39%)\n0_philippines_island_filipino_super (12.20%)\n1_voice_singe...,[],O0cIJww_amw
Boracay is PARADISE in The Philippines! (canâ€™t believe I did this..)ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=R8-LyHgcl5I,,,0_philippines_island_filipino_super (100.00%),"yesterday, camera, shopping, coffee, intestine, option, main-street : Level 2 (0.87)\nice, deep,...",R8-LyHgcl5I
"First Time in Manila, Philippines! Eating Jollibee, taking the metro, and visiting tourist places",https://www.youtube.com/watch?v=Gl2RpA1xS6M,,,0_philippines_island_filipino_super (50.00%)\n104_coffee_statue_patty_lots (50.00%),"ice, deep, pour, storm, breakfast, super-super, happen : Level 2 (0.51)\nadobo, food, insect, at...",Gl2RpA1xS6M
Chef Kuya Joey Is Flying To The Philippines! ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=4OiOwDh5lKE,,,3_food_filipino_pork_chili (100.00%),"singer, voice, performance, bell, regine-velasquez, morissette-amon, morissette : Level 2 (1.00)...",4OiOwDh5lKE
Is This The BEST Filipino Restaurant in Malaysia! ðŸ‡µðŸ‡­ (Kuala Lumpur Mukbang ðŸ‡²ðŸ‡¾),https://www.youtube.com/watch?v=bKCr62Dk7Us,,,-1_minutes_time_guys_house (9.09%)\n3_food_filipino_pork_chili (36.36%)\n7_pork_ribs_shoulder_me...,[],bKCr62Dk7Us
Foreigners biggest SURPRISE in the Philippines (random Street interviews),https://www.youtube.com/watch?v=j7u0a_WH7hQ,,,0_philippines_island_filipino_super (100.00%),"ceremony, dress, trade, sponsor, special, chain, pocket : Level 2 (1.00)\ntown, short, character...",j7u0a_WH7hQ
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,,,-1_minutes_time_guys_house (30.01%)\n0_philippines_island_filipino_super (0.73%)\n1_voice_singer...,"swim, arrive, typhoon, rent, nature, coconut, cuisine : Level 2 (0.72)\npeach, italian, choice, ...",--8n6A8Q6M0
HOW THE PHILIPPINES CHANGED OUR LIVES FOREVER ðŸ‡µðŸ‡­ AMERICANS REACT,https://www.youtube.com/watch?v=e2W9UaVhsX0,,,-1_minutes_time_guys_house (50.00%)\n0_philippines_island_filipino_super (50.00%),[],e2W9UaVhsX0
PHILIPPINES STREET FOOD Tour - INSANELY DELICIOUS FIRST TIME Trying JOLLIBEE!,https://www.youtube.com/watch?v=y9iCFcSaD58,,,-1_minutes_time_guys_house (38.21%)\n0_philippines_island_filipino_super (1.07%)\n1_voice_singer...,"english, tagalog, accent, proper, earlier, walk-hill, sunday : Level 2 (1.00)\neye, son, traditi...",y9iCFcSaD58


In [None]:
topics_per_video_df.to_csv("topics_per_video.csv")
