In [2]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
# Each "<video id>.txt" file in dataset_folder holds the title of one video;
# the file stem is used to build the YouTube link.
dataset_folder = "../../video_titles"
data_records = []

# Regex pattern to extract the video id (the file stem) from the filename
filename_pattern = re.compile(r"^(.*?)\.txt$")

# sorted() makes the row order deterministic; rows are later addressed by
# position (iloc), so the order must be reproducible across runs.
for filename in sorted(os.listdir(dataset_folder)):
    # The pattern is the single source of truth for "is this a .txt file";
    # anything that does not fully match is skipped instead of crashing.
    match = filename_pattern.fullmatch(filename)
    if match is None:
        continue
    video_id = match.group(1)

    # Use a dedicated name for the handle so the loop variable is not shadowed.
    with open(os.path.join(dataset_folder, filename), 'r', encoding='utf-8') as title_file:
        video_title = title_file.read()

    # Append data to the list
    data_records.append({
        "Video Title": video_title,
        "Link": f"https://www.youtube.com/watch?v={video_id}",
        "HLTA Topics": []
    })

# Convert to DataFrame. Explicit columns keep the frame well-formed (and
# set_index working) even when the folder contains no matching files.
topics_per_video_df = pd.DataFrame(
    data_records, columns=["Video Title", "Link", "HLTA Topics"]
).set_index("Video Title")

topics_per_video_df

Unnamed: 0_level_0,Link,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Marine reacts to the Philippine Light Reaction Regiment (LRR),https://www.youtube.com/watch?v=-1B7cVoZr1c,[]
SHOWING MY SISTER SB19 'GENTO' Music Video,https://www.youtube.com/watch?v=-9bfDHHneyU,[]
10 Reasons/Do not Retire TO the Philippines/Moving to the Philippines/Philippine/Dumaguete,https://www.youtube.com/watch?v=-C5iB25BRsA,[]
Christmas is NEXT LEVEL in the Philippines | Latinos react to Viral Filipino Singing TikToks,https://www.youtube.com/watch?v=-GVbt3qdq70,[]
RAPSTAR REACTION! FLOW G is the FILIPINO DRAKE!,https://www.youtube.com/watch?v=-JTkbbE_KP4,[]
...,...,...
"Foreigners Find SECRET LOCAL Cold Springs In BOHOL Philippines, No Tourists!",https://www.youtube.com/watch?v=zl071FhqyJI,[]
PHILIPPINES PROVINCE LIFE IS SO DIFFERENT FROM LIFE IN THE UK 🇵🇭 Foreigner and Filipina VLOG,https://www.youtube.com/watch?v=zm_8N4vFnZw,[]
"🇰🇷Koreans React to P-pop Idol Group | Lovey Dovey, Dash, Salamat by Hori7on",https://www.youtube.com/watch?v=zqvDHfgxWh8,[]
FOREIGNER LIVING IN YHE PHILIPPINES WEEKLY FOOD BUDGET IS IT ENOUGH OR NOT ??,https://www.youtube.com/watch?v=zvTP6wl9sTU,[]


In [3]:
# Store results as a flat list
import json

# Nodes JSON export; its top-level entries are walked as tree roots below.
nodes_filename = '1800.nodes.json'
nodes_filepath = f"../output-jsons/{nodes_filename}"

# Parse the whole file into memory once.
with open(nodes_filepath, mode='r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    """Recursively flatten a tree structure into a list (pre-order).

    Args:
        node: Mapping with an "id" key, a whitespace-separated "text" key and
            an optional "children" list of nodes of the same shape.
        depth: Level assigned to ``node``; children get ``depth + 1``.
        result: Optional list to append to. When omitted, a fresh list is
            created for this call, so separate calls never share state.

    Returns:
        The list (``result`` itself when one was passed) holding one dict per
        visited node with the keys "id", "texts" (the split text) and "level".
    """
    # A mutable default ([]) would be created once and shared by every call,
    # leaking nodes from earlier calls into later results.
    if result is None:
        result = []

    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })

    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)

    return result

# Process each root node in the JSON
# flat_list is passed as the explicit accumulator so every node is recorded
# exactly once; relying on flatten_tree's default list and extending with its
# return value would re-add nodes of earlier roots for each later root.
flat_list = []
for root in data:
    flatten_tree(root, result=flat_list)

print(flat_list)

[{'id': 'Z41', 'texts': ['island', 'peso', 'walk', 'road', 'price', 'car', 'beach'], 'level': 1}, {'id': 'Z32', 'texts': ['peso', 'price', 'plan', 'month', 'property', 'cheap', 'flight'], 'level': 2}, {'id': 'Z28', 'texts': ['flight', 'airport', 'driver', 'busy', 'plane', 'short', 'clothes'], 'level': 3}, {'id': 'Z140', 'texts': ['driver', 'busy', 'journey'], 'level': 4}, {'id': 'Z142', 'texts': ['clothes', 'shirt', 'shoe', 'bunch'], 'level': 4}, {'id': 'Z139', 'texts': ['flight', 'airport', 'plane'], 'level': 4}, {'id': 'Z141', 'texts': ['short', 'luck', 'wave'], 'level': 4}, {'id': 'Z29', 'texts': ['property', 'concrete', 'land', 'plant', 'house-house', 'field', 'wall'], 'level': 3}, {'id': 'Z143', 'texts': ['property', 'concrete', 'land'], 'level': 4}, {'id': 'Z145', 'texts': ['plant', 'house-house', 'field', 'pressure', 'board', 'space'], 'level': 4}, {'id': 'Z144', 'texts': ['wall', 'pipe', 'folk', 'jay'], 'level': 4}, {'id': 'Z26', 'texts': ['plan', 'month', 'meet', 'corner', 'we

In [4]:
from collections import defaultdict

topic_map_filepath = '../output-jsons/' + '1800.topics.json'

with open(topic_map_filepath, mode='r') as f:
    topic_mapping_data = json.load(f)

# Invert the topic -> documents listing into document -> [(topic, probability)].
# The document number is the grouping key.
topic_ids_per_vid_mapping = defaultdict(list)

for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]

    # Each "doc" entry is a (document number, probability) pair.
    for doc_number, probability in topic_obj["doc"]:
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers follow the order in which the documents were fed into the HLTA model (i.e. their line position in the original text file), starting at 0 and running up to n.

In [5]:
# Attach the topic's top words (and tree level) to each video, not just the topic id.
topic_to_words = {}
for entry in flat_list:
    topic_to_words[entry["id"]] = (", ".join(entry["texts"]), entry["level"])

# Collect (words, level, probability) triples per document in a new dictionary.
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        # Unknown topic ids fall back to the raw id with no level.
        topic_words, level = topic_to_words.get(t, (t, None))
        topic_words_per_vid_mapping[doc].append((topic_words, level, p))

# Print one summary line per document.
for doc, topic_list in topic_words_per_vid_mapping.items():
    parts = []
    for t, lvl, p in topic_list:
        if lvl is None:
            parts.append(f"{t}: {p:.2f}")
        else:
            parts.append(f"{t} (Level {lvl}): {p:.2f}")
    topics_str = ", ".join(parts)
    print(f"Document {doc} - {topics_str}")

Document 452 - pizza, hungry, pasta (Level 4): 1.00, warm, dangerous, terrible, glad, forget, strong, finger (Level 3): 0.99, terrible, glad, forget (Level 4): 0.96, mango, fish, fruit, chocolate, fresh, pizza, hungry (Level 3): 0.95, dangerous, strong, finger, tooth, success (Level 4): 0.99, holiday, december, september, life (Level 4): 0.64, filipina, wife (Level 4): 1.00, taste, sauce, flavor, rice, chicken, delicious, meat (Level 2): 1.00, weird, giant, normal (Level 4): 0.85, extra, gift, trash, list, form, special, double (Level 3): 0.73, soy-sauce, oil, lechon, pig (Level 4): 1.00, rice-rice, rice (Level 4): 1.00, plan, month, corner, week, project (Level 4): 0.73, soy-sauce, oil, lechon, pig, buddy, trust, ben (Level 3): 0.99, eat, sweet, cheese, meal, mukbang, spicy-spicy (Level 4): 0.98, kilo, apple, worry (Level 4): 0.92, close, dream, strange (Level 4): 0.71, sunday, loud, science-science (Level 4): 0.61, restaurant, perfect, menu, type_, famous, honest, popular (Level 3): 

In [8]:
# append into main dataframe
# Look the target column up by name instead of hard-coding its position, so a
# change in column order fails loudly rather than overwriting another column.
hlta_col = topics_per_video_df.columns.get_loc("HLTA Topics")

for doc, topic_list in topic_words_per_vid_mapping.items():
    # Group topics by level
    level_groups = defaultdict(list)
    for t, lvl, p in topic_list:
        level_groups[lvl].append((t, p))

    # Highest level number first; topics without a level (None) sort last.
    sorted_levels = sorted(
        level_groups,
        key=lambda x: x if x is not None else -1,
        reverse=True,
    )

    # One text section per level: an optional "Level N" header followed by
    # one "<words>: (<probability>)" line per topic.
    sections = []
    for lvl in sorted_levels:
        lines = [f"Level {lvl}"] if lvl is not None else []
        lines.extend(f"{t}: ({p:.2f})" for t, p in level_groups[lvl])
        sections.append("\n".join(lines))

    # Sections are separated by a blank line.
    formatted_text = "\n\n".join(sections).strip()

    # The document number is used as the positional row index of the dataframe.
    topics_per_video_df.iloc[int(doc), hlta_col] = formatted_text

In [9]:
# Persist the per-video topic table (index included) as CSV.
output_csv_path = "1800-topics-per-vid.csv"
topics_per_video_df.to_csv(output_csv_path)