In [1]:
import os
import re
import pandas as pd

# Create Initial Dataframe with all video IDs
dataset_folder = "../../video_titles"
data_records = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*?)\.txt$")

for file in sorted(os.listdir(dataset_folder)): 
    if file.endswith(".txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        video_id = match.group(1)  # Extract Video Id and Title

        with open(os.path.join(dataset_folder, file), 'r', encoding='utf-8') as file:
            video_title = file.read()

            # Append data to the list
            data_records.append({
                "Video Title": video_title,
                "Link": f"https://www.youtube.com/watch?v={video_id}",
                "HLTA Topics":[]
            })

# Convert to DataFrame
topics_per_video_df = pd.DataFrame(data_records)
topics_per_video_df.set_index("Video Title", inplace=True)

topics_per_video_df 

Unnamed: 0_level_0,Link,HLTA Topics
Video Title,Unnamed: 1_level_1,Unnamed: 2_level_1
$200 Luxury Beach Hotel in The Philippines 🇵🇭,https://www.youtube.com/watch?v=--8n6A8Q6M0,[]
Marine reacts to the Philippine Light Reaction Regiment (LRR),https://www.youtube.com/watch?v=-1B7cVoZr1c,[]
Ultimate Filipino Food Festival In The Netherlands!,https://www.youtube.com/watch?v=-7vF5F-1btE,[]
SHOWING MY SISTER SB19 'GENTO' Music Video,https://www.youtube.com/watch?v=-9bfDHHneyU,[]
10 Reasons/Do not Retire TO the Philippines/Moving to the Philippines/Philippine/Dumaguete,https://www.youtube.com/watch?v=-C5iB25BRsA,[]
...,...,...
PHILIPPINES PROVINCE LIFE IS SO DIFFERENT FROM LIFE IN THE UK 🇵🇭 Foreigner and Filipina VLOG,https://www.youtube.com/watch?v=zm_8N4vFnZw,[]
"Starring real-life gay couple, latest Filipino gay series ""Lakan"" is nothing like the BL you've seen",https://www.youtube.com/watch?v=zpT46etTP6E,[]
"🇰🇷Koreans React to P-pop Idol Group | Lovey Dovey, Dash, Salamat by Hori7on",https://www.youtube.com/watch?v=zqvDHfgxWh8,[]
FOREIGNER LIVING IN YHE PHILIPPINES WEEKLY FOOD BUDGET IS IT ENOUGH OR NOT ??,https://www.youtube.com/watch?v=zvTP6wl9sTU,[]


In [2]:
# Store results as a flat list
import json

nodes_filename = 'T3.nodes.json'
nodes_filepath = '../output-jsons/' + nodes_filename

with open(nodes_filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

print(flat_list)

[{'id': 'Z35', 'texts': ['mango', 'fruit', 'milk', 'banana', 'vegetable', 'coconut', 'bread'], 'level': 1}, {'id': 'Z214', 'texts': ['mango', 'banana', 'pineapple', 'orange', 'apple', 'peach', 'banana-banana'], 'level': 2}, {'id': 'Z170', 'texts': ['pineapple', 'orange', 'apple', 'cut'], 'level': 3}, {'id': 'Z172', 'texts': ['banana', 'banana-banana'], 'level': 3}, {'id': 'Z171', 'texts': ['mango', 'peach'], 'level': 3}, {'id': 'Z213', 'texts': ['fruit', 'milk', 'vegetable', 'coconut', 'bread', 'bean', 'dip'], 'level': 2}, {'id': 'Z168', 'texts': ['milk', 'coconut'], 'level': 3}, {'id': 'Z167', 'texts': ['vegetable', 'fruit'], 'level': 3}, {'id': 'Z169', 'texts': ['bread', 'bean', 'tea', 'dip', 'jelly'], 'level': 3}, {'id': 'Z35', 'texts': ['mango', 'fruit', 'milk', 'banana', 'vegetable', 'coconut', 'bread'], 'level': 1}, {'id': 'Z214', 'texts': ['mango', 'banana', 'pineapple', 'orange', 'apple', 'peach', 'banana-banana'], 'level': 2}, {'id': 'Z170', 'texts': ['pineapple', 'orange', 'a

In [3]:
from collections import defaultdict

topic_map_filepath = '../output-jsons/' + 'T3.topics.json'

with open(topic_map_filepath, 'r') as f:
    topic_mapping_data = json.load(f)

topic_ids_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
# doc id serves as the grouping key
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topic_ids_per_vid_mapping[doc_number].append((topic, probability))


# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [4]:
# link the top words to the video instead of just the topic id
topic_to_words = {entry["id"]: (", ".join(entry["texts"]), entry["level"]) for entry in flat_list}

# store data in a new dictionary
topic_words_per_vid_mapping = defaultdict(list)

for doc, topic_list in topic_ids_per_vid_mapping.items():
    for t, p in topic_list:
        if t in topic_to_words:
            topic_words, level = topic_to_words[t]
            topic_words_per_vid_mapping[doc].append((topic_words, level, p))
        else:
            topic_words_per_vid_mapping[doc].append((t, None, p))  # Handle case where topic ID is not found

for doc, topic_list in topic_words_per_vid_mapping.items():
    topics_str = ", ".join(f"{t} (Level {lvl}): {p:.2f}" if lvl is not None else f"{t}: {p:.2f}" for t, lvl, p in topic_list)
    print(f"Document {doc} - {topics_str}")

Document 65 - latino, feature-feature, podcast, viral, coach, feature, freak (Level 2): 1.00, beauty, incredible, star, movie, actual, power, jesus (Level 2): 0.57, singer, song, singing, sing, voice, song-song, fish (Level 1): 1.00, fish-fish, fish (Level 3): 1.00, vocal, opm, lyric, artist, performance, karaoke, tiktok (Level 2): 1.00, brother, jay, pipe (Level 3): 0.69, vocal, opm, lyric, tiktok (Level 3): 1.00, latino, viral, coach, freak, review (Level 3): 1.00, dance, dancing (Level 3): 1.00, karaoke, machine (Level 3): 1.00, feature-feature, podcast, feature (Level 3): 1.00, competition, kick, winner (Level 3): 0.82, drink, beer, drinking, alcohol, bottle, juice, drunk (Level 2): 0.68, actual, key, holy (Level 3): 0.62, boat, tour (Level 3): 1.00, dance, competition, kick, winner, training, dancing, knee (Level 2): 0.93, singer, singing, voice, song-song (Level 3): 1.00, song, sing (Level 3): 1.00, alcohol, bottle, drunk (Level 3): 0.99, true, story, sense, heart, instagram (Lev

In [5]:
# append into main dataframe
for doc, topic_list in topic_words_per_vid_mapping.items():
    # Group topics by level
    level_groups = defaultdict(list)
    for t, lvl, p in topic_list:
        level_groups[lvl].append((t, p))

   
    sorted_levels = sorted(level_groups.keys(), reverse=True, key=lambda x: x if x is not None else -1)

  
    formatted_text = ""
    for i, lvl in enumerate(sorted_levels):
        if lvl is not None:
            formatted_text += f"Level {lvl}\n"
        for t, p in level_groups[lvl]:
            formatted_text += f"{t}: ({p:.2f})\n"
        formatted_text += "\n"  

    
    topics_per_video_df.iloc[int(doc), 1] = formatted_text.strip()  

In [6]:
topics_per_video_df.to_csv("T3-topics-per-vid.csv")