In [1]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./Previous_THS-ST2_Files/standard_dataset_old"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in sorted(os.listdir(dataset_folder)): 
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "BERTopic Topics": None
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df


Unnamed: 0_level_0,Link,BERTopic Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,
Philippines Army vs Thailand Army,https://www.youtube.com/watch?v=0IMWasj76yU,
Tour of The House We Built in The Philippines,https://www.youtube.com/watch?v=1kErCqgIVMk,
VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi,https://www.youtube.com/watch?v=2TmagN6RhkI,
Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!,https://www.youtube.com/watch?v=2ftG8JuMzz4,
...,...,...
PHILIPPINES STREET FOOD Tour - INSANELY DELICIOUS FIRST TIME Trying JOLLIBEE!,https://www.youtube.com/watch?v=y9iCFcSaD58,
We arrived in Philippines. Manila. Towards The Sun by Hitchhiking 68 (ENG SUBS),https://www.youtube.com/watch?v=y9xOKcZD0xE,
"ChowKing for the first time - FilipinoChinese food CEBU, PHILIPPINES",https://www.youtube.com/watch?v=yR6a9-ufsZc,
"WILD WINGS DAVAO - BEST CHICKEN WINGS, BILAO FILIPINO FOOD FEAST - DAVAO FOOD - ANUJ GABA",https://www.youtube.com/watch?v=zTcCSyucYqI,


### Get the dataset 

In [2]:
# Initialize list to store data
experiment_records = []

# Iterate through transcript files
for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, _ = match.groups()  # Extract Video ID
            file_path = os.path.join(dataset_folder, file)

            # Read transcript content
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read().strip()  # Remove extra spaces/newlines

            # Append data to list
            experiment_records.append({
                "Video Id": video_id,
                "Transcript": transcript
            })

# Convert to DataFrame
experiment_df = pd.DataFrame(experiment_records)

# Display the first few rows to verify
experiment_df

Unnamed: 0,Video Id,Transcript
0,MBtWxCM7NYg,this did not go to plan when i was like it's \...
1,O0cIJww_amw,"I once talked to a filipina. Then she said, wh..."
2,R8-LyHgcl5I,hi my name is josie and this is \nboracay isla...
3,Gl2RpA1xS6M,good morning everyone we're Nicole and Mico \n...
4,4OiOwDh5lKE,today is the day we're going to the philippine...
...,...,...
95,vwIIeiOhTJ0,"From dogs riding motorcycles, to inmates danci..."
96,8DYPxsr7kDI,[Music]\ntoday's video is going to primarily b...
97,S4bNEgM483k,Hello and welcome back to my brand new channel...
98,NdLjKxqzDbA,welcome welcome back to Pinoy Planet I'm \nexc...


### Initial Pre-procssing (Minimal)

removal of timestamps, numbers, contents inside parentheses, leading & trailing whitespaces, square brackets etc.

In [3]:
# initial pre-processing (minimal)

# Define a minimal preprocessing function
def minimal_preprocessing(text):
    # Remove timestamps and other non-verbal elements (like "(music)")
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in square brackets (e.g., music)
    text = re.sub(r'\(.*?\)', '', text)  # Remove content in parentheses (e.g., [music], (child speaking))
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = re.sub(r'\s+', ' ', text)    # Replace multiple spaces with one
    text = text.strip()                 # Remove leading and trailing whitespace
    return text

# Apply the preprocessing to the 'Transcript' column
experiment_df['cleaned_transcript'] = experiment_df['Transcript'].apply(minimal_preprocessing)

# Display the cleaned data
experiment_df[['Transcript', 'cleaned_transcript']]

Unnamed: 0,Transcript,cleaned_transcript
0,this did not go to plan when i was like it's \...,this did not go to plan when i was like it's g...
1,"I once talked to a filipina. Then she said, wh...","I once talked to a filipina. Then she said, wh..."
2,hi my name is josie and this is \nboracay isla...,hi my name is josie and this is boracay island...
3,good morning everyone we're Nicole and Mico \n...,good morning everyone we're Nicole and Mico we...
4,today is the day we're going to the philippine...,today is the day we're going to the philippine...
...,...,...
95,"From dogs riding motorcycles, to inmates danci...","From dogs riding motorcycles, to inmates danci..."
96,[Music]\ntoday's video is going to primarily b...,today's video is going to primarily be towards...
97,Hello and welcome back to my brand new channel...,Hello and welcome back to my brand new channel...
98,welcome welcome back to Pinoy Planet I'm \nexc...,welcome welcome back to Pinoy Planet I'm excit...


### Transcript Segmentation

In [4]:
import torch
import nltk
import swifter
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize

# Load mBERT tokenizer and model
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use Apple's Metal (MPS) for acceleration if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [5]:
def encode_sentences(sentences, batch_size=16):
    """Encodes sentences into vector representations using mBERT."""
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**encoded_inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu()  # Move to CPU for further processing
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

In [6]:
def compute_cosine_similarities(embeddings):
    """Computes cosine similarity efficiently in batch."""
    return cosine_similarity(embeddings[:-1], embeddings[1:]).diagonal()

In [7]:
def segment_transcript(transcript, threshold=0.7, min_length=30):
    """Segments transcript into meaningful chunks based on sentence similarity."""
    sentences = nltk.sent_tokenize(transcript)
    sentences = [s.strip() for s in sentences if s.strip()]

    if len(sentences) < 2:
        return [transcript]  # Return the whole text if it's too short

    # Encode sentences
    sentence_embeddings = encode_sentences(sentences)

    # Compute cosine similarity between consecutive sentences
    similarities = compute_cosine_similarities(sentence_embeddings)

    # Identify segment boundaries where similarity drops below threshold
    boundaries = [0] + [i + 1 for i, sim in enumerate(similarities) if sim < threshold] + [len(sentences)]

    # Create segments
    segments = [" ".join(sentences[boundaries[i]:boundaries[i + 1]]) for i in range(len(boundaries) - 1)]

    # Merge small segments
    processed_segments, temp_segment = [], ""
    for segment in segments:
        if len(segment) + len(temp_segment) < min_length:
            temp_segment += " " + segment
        else:
            if temp_segment:
                processed_segments.append(temp_segment.strip())
            temp_segment = segment
    if temp_segment:
        processed_segments.append(temp_segment.strip())

    # Ensure at least two segments
    if len(processed_segments) == 1:
        mid = len(sentences) // 2
        processed_segments = [" ".join(sentences[:mid]), " ".join(sentences[mid:])]

    return processed_segments


Segment

In [8]:
# Apply segmentation using parallel processing
experiment_df['Segments'] = experiment_df['cleaned_transcript'].swifter.apply(segment_transcript)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Pandas Apply:   0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
# Convert to list of dictionaries
segmented_transcripts = [
    {"Video Id": row["Video Id"], "Segment #": i + 1, "Segment": segment}
    for _, row in experiment_df.iterrows()
    for i, segment in enumerate(row["Segments"])
]

In [10]:
# Convert to DataFrame
segmented_df = pd.DataFrame(segmented_transcripts)

# Display segmented transcripts
print(segmented_df)

         Video Id  Segment #  \
0     MBtWxCM7NYg          1   
1     MBtWxCM7NYg          2   
2     O0cIJww_amw          1   
3     O0cIJww_amw          2   
4     O0cIJww_amw          3   
...           ...        ...   
6696  REvaHqT3uro         24   
6697  REvaHqT3uro         25   
6698  REvaHqT3uro         26   
6699  REvaHqT3uro         27   
6700  REvaHqT3uro         28   

                                                Segment  
0     this did not go to plan when i was like it's g...  
1     in australia so pretty expensive not the cheap...  
2                          I once talked to a filipina.  
3     Then she said, why are you like that? Then I a...  
4     She said, because of your accent. Why is your ...  
...                                                 ...  
6696  liters per person and basically put so we almo...  
6697                                                Oh!  
6698  it looks like they're building a brand new Spa...  
6699                               

In [11]:
# Display segmented transcripts in a structured way
for video_id, group in segmented_df.groupby("Video Id"):
    print(f"\n==== Transcript for Video Id: {video_id} ====")
    for index, row in group.iterrows():
        print(f"\nSegment {row['Segment #']}:\n{row['Segment']}")
    print("\n" + "=" * 50)  # Separator for clarity
    
    
# txt


==== Transcript for Video Id: --8n6A8Q6M0 ====

Segment 1:
Oh, my God.

Segment 2:
It is just the perfect recipe for a good day.

Segment 3:
Want some beer? Sure.

Segment 4:
Oh, it's big bottle. Oh my goodness.

Segment 5:
Three Tarzan?

Segment 6:
Good morning from Bohol here in the Philippines.

Segment 7:
Everybody guys, this look how gorgeous this place is.

Segment 8:
Crystal clear water.

Segment 9:
It's just honestly such a vibe, perfect temperature to be spending the day by the beach. And that is exactly what we're gonna do today because I'm staying about meters up that way at a $ per night hotel on there for the next couple of nights.

Segment 10:
And I arrived in the, uh, last night.

Segment 11:
Uh, it was raining.

Segment 12:
It was kind of a little bit miserable. So I was like, you know what?

Segment 13:
We'll wait until the weather is good and boy, or boy did the weather turn out good today.

Segment 14:
This water guys, if you know me, I'm Australian.

Segment 15:
I 

### Feed segments to BERTopic

In [12]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = nltk.WordNetLemmatizer()

def preprocess_segments(text):
    """Apply basic text preprocessing steps."""
    text = text.lower()  # Lowercasing
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation & special characters
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Remove stopwords & lemmatize
    return ' '.join(words)

# Apply preprocessing directly to 'Segment' column
segmented_df["Segment"] = segmented_df["Segment"].apply(preprocess_segments)

segmented_df


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/josephmiguela.maristela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/josephmiguela.maristela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/josephmiguela.maristela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Video Id,Segment #,Segment
0,MBtWxCM7NYg,1,go plan like going get real bad got real bad t...
1,MBtWxCM7NYg,2,australia pretty expensive cheapest option pro...
2,O0cIJww_amw,1,talked filipina
3,O0cIJww_amw,2,said like asked
4,O0cIJww_amw,3,said accent accent like
...,...,...,...
6696,REvaHqT3uro,24,liter per person basically put almost got two ...
6697,REvaHqT3uro,25,oh
6698,REvaHqT3uro,26,look like building brand new spa right seems l...
6699,REvaHqT3uro,27,hello el nido


In [13]:
# BERTopic on Segmented Transcripts

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from bertopic.representation import KeyBERTInspired

# Load multilingual embedding model
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# KeyBERT
keybert_model = KeyBERTInspired()

# Initialize BERTopic with the multilingual embedding model
topic_model = BERTopic(
    embedding_model=embedding_model, 
    language="multilingual", 
    representation_model=keybert_model,  # Improve topic descriptions
)

# Extract segments and their corresponding Video Ids
segments = []
video_ids = []

for idx, row in segmented_df.iterrows():
    segments.append(row['Segment'])
    video_ids.append(row['Video Id'])

# Fit the BERTopic model
topics, _ = topic_model.fit_transform(segments)



In [14]:
# INITIAL

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1526,-1_think_kinda_go_gon,"[think, kinda, go, gon, along, like, going, go...",[like would tell like understands learning swa...
1,0,145,0_philippine_filipino_go_come,"[philippine, filipino, go, come, good, hello, ...",[hello everybody doug today coming cebu city p...
2,1,139,1_gun_shotgun_bum_buchi,"[gun, shotgun, bum, buchi, budol, busker, cami...",[uh travesty bum gun australia like normalized...
3,2,116,2____,"[, , , , , , , , , ]","[plain, sample, reminded]"
4,3,99,3_philippine_filipino_manila_pakistan,"[philippine, filipino, manila, pakistan, archi...",[people would often guess filipino american co...
...,...,...,...,...,...
187,186,10,186_comment_think_review_please,"[comment, think, review, please, tell, let, , ...","[leave comment, let u know comment, let u know..."
188,187,10,187_yeah_uh_oh_kinda,"[yeah, uh, oh, kinda, really, , , , , ]","[yeah yeah yeah, yeah yeah yeah, yeah yeah]"
189,188,10,188_cabbage_vegetable_onion_salad,"[cabbage, vegetable, onion, salad, dish, cooke...","[know good vegetable dish, vegetable get green..."
190,189,10,189_distance_road_kilometre_traveled,"[distance, road, kilometre, traveled, kilomete...","[ah okay depending distance, longer distance m..."


### Update Resulting Topics

In [15]:
# Reduce outliers
new_topics = topic_model.reduce_outliers(segments, topics)

topic_model.update_topics(segments, topics=new_topics)

topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,27,-1_parokya_ai_honk_buff,"[parokya, ai, honk, buff, blindfold, category,...",[like would tell like understands learning swa...
1,0,186,0_like_got_right_go,"[like, got, right, go, know, uh, yeah, get, fo...",[hello everybody doug today coming cebu city p...
2,1,142,1_gun_bum_pambyahe_sapin,"[gun, bum, pambyahe, sapin, isa, ucieczkadoraj...",[uh travesty bum gun australia like normalized...
3,2,117,2_least_mission_reminded_sample,"[least, mission, reminded, sample, buying, jun...","[plain, sample, reminded]"
4,3,109,3_philippine_american_filipino_native,"[philippine, american, filipino, native, popul...",[people would often guess filipino american co...
...,...,...,...,...,...
187,186,18,186_comment_section_brush_leave,"[comment, section, brush, leave, click, please...","[leave comment, let u know comment, let u know..."
188,187,31,187_kinda_yeah_government_clean,"[kinda, yeah, government, clean, halo, replace...","[yeah yeah yeah, yeah yeah yeah, yeah yeah]"
189,188,23,188_vegetable_salad_eggplant_carroty,"[vegetable, salad, eggplant, carroty, opening,...","[know good vegetable dish, vegetable get green..."
190,189,19,189_far_kilometre_distance_road,"[far, kilometre, distance, road, destination, ...","[ah okay depending distance, longer distance m..."


In [16]:
# UPDATE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from bertopic.representation import PartOfSpeech

import requests

# Custom stopwords list
custom_stopwords = ['like', 'yeah', 'yes', 'let', 'okay', 'gonna', 'ok', 'bye', 'just', 'got', 'right', 'mmm', 'oh', 'ah', 'gosh', 'guys', 'nice']

# Fetch stopwords from GitHub
github_stopwords_url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt"
github_stopwords = set(requests.get(github_stopwords_url).text.splitlines())

# Append GitHub stopwords to the custom stopwords set
custom_stopwords = set(custom_stopwords)  
custom_stopwords.update(github_stopwords)  

# Combine predefined stopwords (ENGLISH_STOP_WORDS) with custom stopwords
combined_stopwords = list(ENGLISH_STOP_WORDS.union(set(custom_stopwords)))

# Create a CountVectorizer with combined stopwords as a list
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=combined_stopwords)

representation_model = KeyBERTInspired()

In [17]:
topic_model.update_topics(segments, vectorizer_model=vectorizer_model, representation_model=representation_model)

In [18]:
# UPDATED

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,27,-1_parokya parokya_whoooooooooooa wayhay_delas_ey,"[parokya parokya, whoooooooooooa wayhay, delas...",[like would tell like understands learning swa...
1,0,186,0_philippine_filipino_manila_local,"[philippine, filipino, manila, local, hey, fun...",[hello everybody doug today coming cebu city p...
2,1,142,1_bum gun_fist bump fist_bump fist_bump fist bump,"[bum gun, fist bump fist, bump fist, bump fist...",[uh travesty bum gun australia like normalized...
3,2,117,2____,"[, , , , , , , , , ]","[plain, sample, reminded]"
4,3,109,3_philippine_philippine philippine_filipino ph...,"[philippine, philippine philippine, filipino p...",[people would often guess filipino american co...
...,...,...,...,...,...
187,186,18,186_comment comment comment_comment comment_wr...,"[comment comment comment, comment comment, wri...","[leave comment, let u know comment, let u know..."
188,187,31,187_outdoor bit kinda_built kinda mesh_kinda m...,"[outdoor bit kinda, built kinda mesh, kinda me...","[yeah yeah yeah, yeah yeah yeah, yeah yeah]"
189,188,23,188_salad vegetable_vegetable dish_salad veget...,"[salad vegetable, vegetable dish, salad vegeta...","[know good vegetable dish, vegetable get green..."
190,189,19,189_kilometre walked_destination trip alright_...,"[kilometre walked, destination trip alright, k...","[ah okay depending distance, longer distance m..."


In [19]:
# Get topic information
topic_info_df = topic_model.get_topic_info()

# Save to CSV
topic_info_df.to_csv("NewUpdated2-ILW-BERTopic_info.csv", index=False)

print("Topic info saved as topic_info.csv")

Topic info saved as topic_info.csv


# Visualization

In [None]:
# Get topic representations (top words for each topic)
topic_labels = {topic: ", ".join([word for word, _ in words]) 
                for topic, words in topic_model.get_topics().items()}


# Create a DataFrame to track topics with Video Ids and Segment numbers
topic_data = {
    "Video Id": video_ids,
    "Segment": segments,
    "Topic": new_topics,
    "Topic Name": [topic_labels.get(topic, "No Topic") for topic in new_topics]
}

In [25]:
# Create a DataFrame to organize the results
topic_df = pd.DataFrame(topic_data)

topic_df

Unnamed: 0,Video Id,Segment,Topic,Topic Name
0,MBtWxCM7NYg,go plan like going get real bad got real bad t...,0,"philippine, filipino, manila, local, hey, fun,..."
1,MBtWxCM7NYg,australia pretty expensive cheapest option pro...,0,"philippine, filipino, manila, local, hey, fun,..."
2,O0cIJww_amw,talked filipina,3,"philippine, philippine philippine, filipino ph..."
3,O0cIJww_amw,said like asked,28,"parokya parokya, whoooooooooooa wayhay, delas,..."
4,O0cIJww_amw,said accent accent like,0,"parokya parokya, whoooooooooooa wayhay, delas,..."
...,...,...,...,...
6696,REvaHqT3uro,liter per person basically put almost got two ...,0,"philippine, filipino, manila, local, hey, fun,..."
6697,REvaHqT3uro,oh,29,"hahaha aaaaah, hahaha aaaaah aaaaah, aaaa, hah..."
6698,REvaHqT3uro,look like building brand new spa right seems l...,0,"philippine, filipino, manila, local, hey, fun,..."
6699,REvaHqT3uro,hello el nido,56,"parokya parokya, whoooooooooooa wayhay, delas,..."


## Videos under each Topic

In [26]:
# Display topics along with their corresponding Video Ids and Segment Numbers
for topic in sorted(set(topics)):  # Sort to display topics in order
    topic_name = topic_labels.get(topic, "No Topic")
    print(f"\n==== Topic #{topic} - {topic_name} ====\n")  # Format the header
    
    # Filter segments that belong to the current topic
    topic_segments = topic_df[topic_df['Topic'] == topic]

    # Print Video Id and Segment text for each row
    for _, row in topic_segments.iterrows():
        print(f"Video Id: {row['Video Id']}, Segment: {row['Segment']}\n")
    
    print("=" * 50)  # Separator for readability


==== Topic #-1 - parokya parokya, whoooooooooooa wayhay, delas, ey, ala pokwang, delas ala, whoooooooooooa, ala pokwang buff, ala, honk ====

Video Id: --8n6A8Q6M0, Segment: honk honk

Video Id: --8n6A8Q6M0, Segment: kawazaki

Video Id: MKBogPOM3Ao, Segment: nooo

Video Id: MKBogPOM3Ao, Segment: sayote

Video Id: MKBogPOM3Ao, Segment: advocate

Video Id: mcWEmkXF4ik, Segment: powdery

Video Id: mcWEmkXF4ik, Segment: wasted

Video Id: mcWEmkXF4ik, Segment: whoooooooooooa wayhay

Video Id: mcWEmkXF4ik, Segment: category

Video Id: mcWEmkXF4ik, Segment: tattooed

Video Id: mcWEmkXF4ik, Segment: parokya parokya

Video Id: mcWEmkXF4ik, Segment: barking

Video Id: mcWEmkXF4ik, Segment: farmland valley

Video Id: mcWEmkXF4ik, Segment: privately owned

Video Id: dGM9kDXdWcQ, Segment: b

Video Id: VoUdWRHkDwA, Segment: sunscreen

Video Id: lLQcFpgbMOk, Segment: impromptu

Video Id: fA0m-LaA_7c, Segment: blindfold ey

Video Id: fA0m-LaA_7c, Segment: blindfold

Video Id: fA0m-LaA_7c, Segment: gl

In [28]:
# Display topics along with their corresponding Video Ids and Segment Numbers
for topic in sorted(set(topics)):  # Sort to display topics in order
    topic_name = topic_labels.get(topic, "No Topic")
    print(f"\n==== Topic #{topic} - {topic_name} ====\n")  # Format the header
    
    # Filter segments that belong to the current topic
    topic_segments = topic_df[topic_df['Topic'] == topic]
    
    # Group by Video Id and count the number of segments assigned to this topic
    video_segment_count = topic_segments.groupby('Video Id').size()
    
    # Print the details for each Video Id and the number of segments assigned to this topic
    for video_id, count in video_segment_count.items():
        print(f"Video Id: {video_id}, Total Segments: {count}")
    
    print("=" * 50)  # Separator for readability



==== Topic #-1 - parokya parokya, whoooooooooooa wayhay, delas, ey, ala pokwang, delas ala, whoooooooooooa, ala pokwang buff, ala, honk ====

Video Id: --8n6A8Q6M0, Total Segments: 2
Video Id: BjaKfnWjX9Y, Total Segments: 4
Video Id: MKBogPOM3Ao, Total Segments: 3
Video Id: VoUdWRHkDwA, Total Segments: 1
Video Id: dGM9kDXdWcQ, Total Segments: 1
Video Id: fA0m-LaA_7c, Total Segments: 4
Video Id: lLQcFpgbMOk, Total Segments: 1
Video Id: mcWEmkXF4ik, Total Segments: 9
Video Id: nCbla8yws6k, Total Segments: 1
Video Id: wpnWOgl3MVc, Total Segments: 1

==== Topic #0 - philippine, filipino, manila, local, hey, fun, street, city, spot, gon ====

Video Id: --8n6A8Q6M0, Total Segments: 8
Video Id: 0IMWasj76yU, Total Segments: 1
Video Id: 1kErCqgIVMk, Total Segments: 1
Video Id: 2TmagN6RhkI, Total Segments: 1
Video Id: 2ftG8JuMzz4, Total Segments: 1
Video Id: 3n1nspLgfd4, Total Segments: 1
Video Id: 4OiOwDh5lKE, Total Segments: 1
Video Id: 6ftiWoCJ4dM, Total Segments: 1
Video Id: 7jDrzdX2Ucc, To

## Topic Distribution per Transcript

In [33]:
# Group by Video Id and get the unique topics for each Video Id
video_topics = topic_df.groupby('Video Id')['Topic'].unique()

# Display the topics for each Video Id
for video_id, topics in video_topics.items():
    print(f"Transcript Video {video_id}:")
    print(f"Topics Present: {', '.join(map(str, topics))}\n")

Transcript Video --8n6A8Q6M0:
Topics Present: 124, 125, 17, 63, 3, 140, 35, 66, 143, 121, 0, 167, 42, 4, 95, 43, 39, 52, 172, 1, 30, 41, 61, 83, 179, 78, 141, 40, 18, 185, 101, 142, 85, 33, 96, 99, 46, 13, 80, 31, 113, 134, 111, 98, 12, 10, 190, 76, 84, 110, 145, 44, 14, 157, 115, 81, 182, 114, 126, 178, 62, 86, 70, 112, 131, 158, 2, 160, 5, 94, 71, 58, 138, 103, 183, 20, 7, 180, 130, 117, 28, 189, 25, 29, 19, 73, 15, 135, 120, 139, 23, 65, 56, 87, 177, 154, 174, 16, 82, 68, -1, 171, 51, 122, 165, 147, 11, 38, 102, 93, 159, 107, 149, 74, 164, 57, 69, 67, 187, 148, 129, 136, 75, 127, 119, 88, 156, 137, 36, 79, 50, 6, 175, 166, 34, 89, 53, 123, 118, 161, 186, 162, 128, 152, 163, 132

Transcript Video 0IMWasj76yU:
Topics Present: 0

Transcript Video 1kErCqgIVMk:
Topics Present: 0

Transcript Video 2TmagN6RhkI:
Topics Present: 0

Transcript Video 2ftG8JuMzz4:
Topics Present: 21, 106, 96, 17, 1, 163, 52, 22, 92, 0, 19, 2, 15, 23, 36, 97, 168, 127, 157, 9, 173, 80, 35, 8, 111, 38, 37, 149, 1

In [None]:
# Retrieve topic names from BERTopic
topic_info = topic_model.get_topic_info()
topic_mapping = {row["Topic"]: row["Name"] for _, row in topic_info.iterrows()}

# Count topic occurrences per Video Id
topic_counts = topic_df.groupby(["Video Id", "Topic"]).size().reset_index(name="Count")

# Calculate percentage per Video Id
topic_counts["Percentage"] = topic_counts.groupby("Video Id")["Count"].transform(lambda x: (x / x.sum()) * 100)

# Map topic names
topic_counts["Topic Name"] = topic_counts["Topic"].map(topic_mapping)

# Aggregate topics into a single row per Video Id with newline separation
video_topics = topic_counts.groupby("Video Id").apply(
    lambda group: "\n".join(
        [f"{row['Topic Name']} ({row['Percentage']:.2f}%)" for _, row in group.iterrows()]
    )
).reset_index(name="List of Topics with %")

# Display the final DataFrame
print(video_topics)

       Video Id                              List of Topics with %
0   --8n6A8Q6M0  -1_parokya parokya_whoooooooooooa wayhay_delas...
1   0IMWasj76yU       0_philippine_filipino_manila_local (100.00%)
2   1kErCqgIVMk       0_philippine_filipino_manila_local (100.00%)
3   2TmagN6RhkI       0_philippine_filipino_manila_local (100.00%)
4   2ftG8JuMzz4  0_philippine_filipino_manila_local (1.37%)\n1_...
..          ...                                                ...
95  y9iCFcSaD58  0_philippine_filipino_manila_local (0.71%)\n1_...
96  y9xOKcZD0xE  0_philippine_filipino_manila_local (28.00%)\n2...
97  yR6a9-ufsZc       0_philippine_filipino_manila_local (100.00%)
98  zTcCSyucYqI       0_philippine_filipino_manila_local (100.00%)
99  zwOJjQuL4i4  0_philippine_filipino_manila_local (7.14%)\n6_...

[100 rows x 2 columns]


  video_topics = topic_counts.groupby("Video Id").apply(


In [38]:
video_topics.to_csv("video_topics_distribution.csv", index=False)

## Given a video, what are the topics under it?

In [37]:
# Step 1: Take user input
video_id_input = input("Enter Video Id: ").strip()

# Step 2: Retrieve Video Title from topics_per_video_df
video_title_row = topics_per_video_df.reset_index()
video_title = video_title_row.loc[video_title_row["Link"].str.contains(video_id_input, case=False, na=False), "Video Title"].values

if len(video_title) == 0:
    print("\nVideo Id not found in dataset.")
else:
    video_title = video_title[0]  # Extract single value
    print(f"\nResults for {video_id_input}.txt\n")
    print(f"Video Title: {video_title}\n")

    # Step 3: Display full initial transcript from experiment_df
    transcript = experiment_df.loc[experiment_df["Video Id"] == video_id_input, "Transcript"].values
    if len(transcript) > 0:
        print("<Transcript File>\n")
        print(transcript[0])  # Print the full transcript
    else:
        print("Transcript not found.")

    # Step 4: Retrieve topics for this Video Id from video_topics
    topics_present = video_topics.loc[video_topics["Video Id"] == video_id_input, "List of Topics with %"].values
    if len(topics_present) > 0:
        print("\nTopics Present:\n")
        print(topics_present[0])  # Display topic names and percentages
    else:
        print("No topics found.")

    # Step 5: Retrieve and display segmented transcript with topics
    print("\nSegmented Transcript and Assigned Topics:\n")

    video_segments = topic_df.loc[topic_df["Video Id"] == video_id_input, ["Segment", "Topic", "Topic Name"]]

    if not video_segments.empty:
        for idx, row in video_segments.iterrows():
            print(f"Segment {idx + 1}:")
            print(f"{row['Segment']}\n")
            print(f"➡ Assigned Topic: {row['Topic Name']} (Topic #{row['Topic']})\n")
            print("-" * 50)  # Separator for clarity
    else:
        print("No segments found for this Video Id.")




Results for y9iCFcSaD58.txt

Video Title: PHILIPPINES STREET FOOD Tour - INSANELY DELICIOUS  FIRST TIME Trying JOLLIBEE!

<Transcript File>

After wrapping up our time in El Nido, we have made it all the way over to Boracay, 
and unfortunately, the weather has not been on our side on this beautiful island,
but good news we have our friends Greg and Jumi who have made it all the way over from Cebu with us,  
and with this bad weather we thought it would be the perfect time to bring them back into  
our lives and have another amazing opportunity 
to try some local Filipino food.
So we're super excited about that because we're gonna see the non-touristic side of Boracay with them  
We're gonna start with a fried quail egg called Kwek Kwek,
hope Jumi's gonna correct me if I mispronounced it
but this is one of those things where even like the local taxi drivers are coming up grabbing one as a snack, and...
let's go get eating!
Who's ordering? Am I ordering or you're ordering?
Well, you sho