In [26]:
# Store topics as list of lists
import json
# Load the JSON file
filename = 'no-verb.nodes.json'
filepath = './output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

for item in flat_list:
    print(item)

{'id': 'Z23', 'texts': ['grill', 'ton', 'tank', 'morning', 'fit', 'doctor', 'medical'], 'level': 1}
{'id': 'Z134', 'texts': ['morissette', 'morissette-amon', 'description', 'sense', 'ferris-wheel', 'siargao', 'rib'], 'level': 2}
{'id': 'Z131', 'texts': ['doctor', 'medical', 'helpful', 'private', 'manila-metro', 'start', 'global'], 'level': 2}
{'id': 'Z143', 'texts': ['review', 'yummy', 'mushroom', 'bang', 'jame', 'concert', 'coron'], 'level': 2}
{'id': 'Z139', 'texts': ['vocal', 'voice', 'singer', 'sing', 'stage', 'tnt-boy', 'ariana'], 'level': 2}
{'id': 'Z144', 'texts': ['husband', 'wife', 'husband-wife', 'mother', 'mister', 'lucky', 'visa'], 'level': 2}
{'id': 'Z138', 'texts': ['buddy', 'closet', 'brooklyn', 'play', 'wing', 'ski', 'pan'], 'level': 2}
{'id': 'Z136', 'texts': ['sick', 'kid', 'height', 'goodness', 'paradise', 'corn', 'late'], 'level': 2}
{'id': 'Z145', 'texts': ['ramen', 'blogger', 'meal', 'sisig', 'fatty', 'sea', 'culture'], 'level': 2}
{'id': 'Z133', 'texts': ['fit', 

In [27]:
import pandas as pd

sorted_flat_list = sorted(flat_list, key=lambda x: x['level'], reverse=True)

id = [item['id'] for item in sorted_flat_list]
df = pd.DataFrame({'ID': id})

# add levels
levels = [item['level'] for item in sorted_flat_list]
df['Level'] = levels


# Extract the 'texts' portion and join them into a single string for each item
texts = [' '.join(item['texts']) for item in sorted_flat_list]

# Create a DataFrame with the texts
df['Topics'] = texts

print(df)


       ID  Level                                             Topics
0    Z134      2  morissette morissette-amon description sense f...
1    Z131      2  doctor medical helpful private manila-metro st...
2    Z143      2      review yummy mushroom bang jame concert coron
3    Z139      2       vocal voice singer sing stage tnt-boy ariana
4    Z144      2  husband wife husband-wife mother mister lucky ...
..    ...    ...                                                ...
235   Z23      1          grill ton tank morning fit doctor medical
236   Z22      1  jollibee chain jollibee-jollibee noodle strang...
237   Z25      1  coconut fresh dark explore guest chocolate-hil...
238   Z21      1  sandwich post infinity language bird ramadan s...
239   Z24      1     tourist beach boat road beach-beach tide space

[240 rows x 3 columns]


# FOR 100

In [28]:
# # Create Initial Dataframe with all video IDs
# dataset_folder = "./Previous_THS-ST2_Files/standard_dataset_old"
# data_records = []

# # Regex pattern to extract Video Id and Title from the filename
# filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

# for file in sorted(os.listdir(dataset_folder)): 
#     if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
#         match = filename_pattern.match(file)
#         if match:
#             video_id, video_title = match.groups()  # Extract Video Id and Title
#             file_path = os.path.join(dataset_folder, file)

#             # Append data to the list
#             data_records.append({
#                 "Video Title": video_title,
#                 "Link": f"https://www.youtube.com/watch?v={video_id}",
#                 "LDA1 Topics": None,
#                 "BERTopic Topics": None,
#                 "HLTA Topics":[]
#             })

# # Convert to DataFrame
# videos_per_topic_df = pd.DataFrame(data_records)
# videos_per_topic_df.set_index("Video Title", inplace=True)

# videos_per_topic_df

# FOR 1800

In [29]:

import os

document_mapping_file_path = './output-jsons/no-verb.topics.json'
with open(document_mapping_file_path, 'r') as f:
    mapping_data = json.load(f)

print(mapping_data)

video_titles = []
directory = '../video_titles'

for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            video_titles.append(file.read())

topic_videos = {}

for item in mapping_data:
    topic_id = item['topic']
    video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
    topic_videos[topic_id] = "\n".join(video_texts)

# Add the new column by mapping the topic IDs
df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# Print the updated DataFrame
print(df)

df.to_csv('videos_per_topic.csv', index=False)



[{'topic': 'Z22', 'doc': [['84', 1.0], ['79', 1.0], ['95', 1.0], ['34', 1.0], ['30', 1.0], ['25', 1.0], ['40', 1.0], ['54', 1.0], ['15', 0.98], ['97', 0.67], ['27', 0.53]]}, {'topic': 'Z158', 'doc': [['38', 1.0], ['92', 1.0], ['27', 1.0], ['13', 1.0], ['56', 0.99], ['0', 0.99], ['28', 0.99], ['35', 0.98], ['61', 0.98], ['63', 0.97], ['33', 0.94], ['26', 0.86], ['95', 0.86], ['15', 0.79], ['30', 0.79], ['79', 0.67], ['32', 0.66], ['29', 0.61]]}, {'topic': 'Z159', 'doc': [['0', 1.0], ['27', 1.0], ['95', 1.0], ['61', 1.0], ['56', 1.0], ['86', 0.99], ['29', 0.98], ['79', 0.75], ['80', 0.69]]}, {'topic': 'Z16', 'doc': [['27', 1.0], ['89', 0.99], ['15', 0.94]]}, {'topic': 'Z164', 'doc': [['27', 1.0], ['79', 1.0], ['34', 1.0], ['26', 1.0], ['95', 1.0], ['96', 1.0]]}, {'topic': 'Z117', 'doc': [['84', 1.0], ['79', 1.0], ['95', 1.0], ['54', 1.0], ['34', 1.0], ['30', 1.0], ['40', 1.0], ['25', 0.99], ['15', 0.98], ['27', 0.96], ['63', 0.72]]}, {'topic': 'Z147', 'doc': [['0', 1.0], ['47', 1.0], ['7