In [67]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "./standard_dataset"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "LDA Topics": None,
                "BERTopic Topics": None,
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df


Unnamed: 0_level_0,Link,LDA Topics,BERTopic Topics,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
$200 Luxury Beach Hotel in The Philippines ðŸ‡µðŸ‡­,https://www.youtube.com/watch?v=--8n6A8Q6M0,,,[]
Philippines Army vs Thailand Army,https://www.youtube.com/watch?v=0IMWasj76yU,,,[]
Tour of The House We Built in The Philippines,https://www.youtube.com/watch?v=1kErCqgIVMk,,,[]
Top 5 Exotic Foods in the Philippines The BRAVE MUST TRY!,https://www.youtube.com/watch?v=2ftG8JuMzz4,,,[]
VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi,https://www.youtube.com/watch?v=2TmagN6RhkI,,,[]
...,...,...,...,...
"Vi & I Visit the Roxas Night Market, Davao",https://www.youtube.com/watch?v=Z9Uz-NUaCG8,,,[]
Filipina British Life in UK May pa request si mister!ðŸ¤,https://www.youtube.com/watch?v=ZFpnR0xpdRI,,,[]
"WILD WINGS DAVAO - BEST CHICKEN WINGS, BILAO FILIPINO FOOD FEAST - DAVAO FOOD - ANUJ GABA",https://www.youtube.com/watch?v=zTcCSyucYqI,,,[]
(ENG) SB19 Dance Cover BTS Boy With Luv & Idol - Indonesian Reaction,https://www.youtube.com/watch?v=zwOJjQuL4i4,,,[]


### HLTA

In [68]:
# Store results as a flat list
import json

nodes_filename = 'standard_test_1.nodes.json'
nodes_filepath = './HLTM/output-jsons/' + nodes_filename

with open(nodes_filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

print(flat_list)

[{'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'public', 'luke'], 'level': 2}, {'id': 'Z120', 'texts': ['boil', 'vegetable', 'waste', 'original', 'largest', 'decide', 'guest'], 'level': 2}, {'id': 'Z116', 'texts': ['sing', 'singing', 'vocal', 'stage', 'tnt-boy', 'note', 'male'], 'level': 2}, {'id': 'Z119', 'texts': ['primate', 'tarsier', 'peace', 'animal', 'legend', 'banana-ketchup', 'band'], 'level': 2}, {'id': 'Z23', 'texts': ['sing', 'singer', 'voice', 'singing', 'performance', 'vocal', 'bell'], 'level': 1}, {'id': 'Z117', 'texts': ['singer', 'voice', 'performance', 'bell', 'regine-velasquez', 'morissette-amon', 'morissette'], 'level': 2}, {'id': 'Z118', 'texts': ['world', 'talent', 'record', 'shoot', 'personal', 'pu

In [69]:
from collections import defaultdict

topic_map_filepath = './HLTM/output-jsons/' + 'standard_test_1.topics.json'

with open(topic_map_filepath, 'r') as f:
    topic_mapping_data = json.load(f)

topic_ids_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
# doc id serves as the grouping key
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [70]:
# link the top words to the video instead of just the topic id
topic_to_words = {entry["id"]: (", ".join(entry["texts"]), entry["level"]) for entry in flat_list}


# Step 2: Convert topic_mapping_data into topics_per_vid_mapping
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))

# Step 3: Store transformed data in a new dictionary
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        if t in topic_to_words:
            topic_words, level = topic_to_words[t]
            topic_words_per_vid_mapping[doc].append((topic_words, level, p))
        else:
            topic_words_per_vid_mapping[doc].append((t, None, p))  # Handle case where topic ID is not found

# Step 4: Print the results
for doc, topic_list in topic_words_per_vid_mapping.items():
    topics_str = ", ".join(f"{t} (Level {lvl}): {p:.2f}" if lvl is not None else f"{t}: {p:.2f}" for t, lvl, p in topic_list)
    print(f"Document {doc} - {topics_str}")

Document 34 - yesterday, camera, shopping, coffee, intestine, option, main-street (Level 2): 1.00, beach, explore, weather, road, ride, hotel, car (Level 2): 1.00, photo, session, goodness, hair (Level 2): 0.97, private, wake, entrance, cheap, south, cafe, build (Level 2): 1.00, colt, brooklyn, bgc, month, baby, late, carry (Level 2): 0.98, pay, drive, tourist, chocolate-hill, boat, ticket, real (Level 2): 1.00, chicken-chicken, tasty, singapore, mushroom, dish, honest, mix (Level 2): 1.00, swim, arrive, typhoon, rent, nature, coconut, cuisine (Level 2): 0.98, remind, fried, bite, roasted, bit, wrong, pie (Level 2): 0.98, quick, busy, bed, close, toilet, kitchen, bedroom (Level 2): 1.00, pizza, foot, scared, tricycle, wave, scary, pasta (Level 2): 1.00, random, extra, finally, taxi, fridge, driver, milk (Level 2): 1.00, peach, italian, choice, garlic-rice, laughter, dinner, juicy (Level 2): 1.00, previous, climb, cooky, hmmm, surprise, request, maja (Level 2): 0.54, airport, chill, fli

In [73]:
# append into main dataframe
for doc, topic_list in topic_words_per_vid_mapping.items():
    t = ""
    for topic in topic_list:
       t += topic[0] + " : Level " + str(topic[1]) + " ({:.2f})".format(topic[2]) + '\n'

    topics_per_video_df.iloc[int(doc), 3] = t

topics_per_video_df
topics_per_video_df.to_csv('topics_per_video.csv')


### BERTopic

### LDA