In [5]:
import json
import pandas as pd
import os 
import re

# Load the node-tree JSON file.
# The encoding is passed explicitly: JSON text is UTF-8 by specification,
# whereas open() without `encoding` falls back to the platform default and can
# mis-decode (or fail on) non-ASCII text on some systems.
filename = '100.nodes.json'
filepath = './output-jsons/' + filename
with open(filepath, 'r', encoding='utf-8') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    """Recursively flatten a tree of nodes into a flat, pre-order list.

    Args:
        node: Mapping with an "id" key, a "text" key holding a
            whitespace-separated string, and an optional "children" key
            holding a list of nodes with the same layout.
        depth: Level recorded for ``node``. Every child is recorded with
            ``depth + 1``.
        result: List the entries are appended to. A fresh list is created
            when it is omitted, so separate calls never share state.

    Returns:
        The list of ``{"id", "texts", "level"}`` dicts, each parent placed
        before its children. ``"texts"`` is ``node["text"]`` split on
        whitespace.

    Raises:
        KeyError: If a node has no "id" or "text" key.
    """
    if result is None:
        result = []

    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })

    # `or []` also covers an explicit `"children": null`, which would
    # otherwise raise TypeError when iterated.
    for child in node.get("children") or []:
        flatten_tree(child, depth + 1, result)

    return result

# Flatten every root tree in the JSON and gather all entries in one list
flat_list = [entry for root in data for entry in flatten_tree(root)]

# Show each flattened node on its own line
for flat_node in flat_list:
    print(flat_node)

{'id': 'Z41', 'texts': ['ariana', 'tnt-boy', 'singer', 'world', 'talent', 'male', 'record'], 'level': 1}
{'id': 'Z31', 'texts': ['ariana', 'tnt-boy', 'singer', 'world', 'talent', 'male', 'record'], 'level': 2}
{'id': 'Z23', 'texts': ['native', 'tarsier', 'female', 'book', 'catch', 'statue', 'inch'], 'level': 3}
{'id': 'Z18', 'texts': ['inch', 'cost-peso', 'animal'], 'level': 4}
{'id': 'Z111', 'texts': ['tarsier', 'government', 'ferry'], 'level': 4}
{'id': 'Z19', 'texts': ['native', 'book', 'spanish'], 'level': 4}
{'id': 'Z110', 'texts': ['female', 'catch', 'statue'], 'level': 4}
{'id': 'Z21', 'texts': ['singer', 'vocal', 'performance', 'sing', 'voice', 'celebrity', 'bell'], 'level': 3}
{'id': 'Z14', 'texts': ['bell', 'hear', 'regine-velasquez'], 'level': 4}
{'id': 'Z12', 'texts': ['singer', 'vocal', 'celebrity', 'actress'], 'level': 4}
{'id': 'Z11', 'texts': ['performance', 'sing', 'voice'], 'level': 4}
{'id': 'Z13', 'texts': ['morissette-amon', 'morissette', 'forget'], 'level': 4}
{'i

In [6]:
import pandas as pd
import os 
import re

# Order the flattened nodes deepest level first. sorted() is stable, also with
# reverse=True, so nodes on the same level keep their order from flat_list.
sorted_flat_list = sorted(flat_list, key=lambda x: x['level'], reverse=True)

# One row per node. The frame is built in a single constructor call, and no
# variable named `id` is created, so the builtin id() is not shadowed for the
# rest of the kernel session.
df = pd.DataFrame({
    'ID': [item['id'] for item in sorted_flat_list],
    'Level': [item['level'] for item in sorted_flat_list],
    # Join each node's words back into a single space-separated string
    'Topics': [' '.join(item['texts']) for item in sorted_flat_list],
})

print(df)

# Scoring and label sheet: same rows plus empty columns to be filled in
scoring_df = df.copy()
for criterion in ('Topic Label', 'Conceptual Relation', 'Filipino Context'):
    scoring_df[criterion] = None

scoring_df.to_csv('100-quali-criteria.csv', index=False)


       ID  Level                                             Topics
0     Z18      4                              inch cost-peso animal
1    Z111      4                           tarsier government ferry
2     Z19      4                                native book spanish
3    Z110      4                                female catch statue
4     Z14      4                         bell hear regine-velasquez
..    ...    ...                                                ...
343   Z46      1   airport month noise drive local plan beach-beach
344   Z42      1             ube blog store comfort bench soft ship
345   Z45      1  boat-trip boat-tour palawan spirit feed gas ma...
346   Z44      1           steel north gym color purpose body board
347   Z47      1  gravy dip combination vinegar sweet-sweet rice...

[348 rows x 3 columns]


# FOR 100: video titles per topic

In [7]:
# Load the topic -> documents mapping. The encoding is explicit because JSON
# text is UTF-8 by specification, while open() otherwise uses the platform
# default encoding.
document_mapping_file_path = './output-jsons/100.topics.json'
with open(document_mapping_file_path, 'r', encoding='utf-8') as f:
    mapping_data = json.load(f)

# Collect the video title of every caption file in the dataset folder
dataset_folder = "../Previous_THS-ST2_Files/standard_dataset_old"
data_records = []  # not used in this cell; kept in case other cells read it

# Regex pattern to extract Video Id and Title from the filename.
# The first group is greedy, so the title is only the text between the LAST
# remaining underscore and "_captions.txt".
# NOTE(review): a title that itself contains "_" would be cut short - confirm
# against the real filenames.
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

video_titles = []

# sorted() fixes the order, which matters because titles are looked up by
# position below.
for file in sorted(os.listdir(dataset_folder)):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            video_titles.append(video_title)

# Map each topic id to the newline-joined titles of its documents.
# NOTE(review): this presumes each doc id is the position of the video in the
# sorted listing above - TODO confirm against how the topics JSON was produced.
topic_videos = {}

for item in mapping_data:
    topic_id = item['topic']
    # Each entry of item['doc'] is a pair; only its first element is used.
    doc_indices = [int(doc_id) for doc_id, _ in item['doc']]
    # Skip ids outside the list. The lower bound matters: a negative id would
    # otherwise wrap around and silently pick a title from the end.
    video_texts = [
        video_titles[doc_index]
        for doc_index in doc_indices
        if 0 <= doc_index < len(video_titles)
    ]
    topic_videos[topic_id] = "\n".join(video_texts)

# Add the new column by mapping the topic IDs; topics without an entry get ""
df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# Print the updated DataFrame
print(df)

df.to_csv('100_videos_per_topic.csv', index=False)




       ID  Level                                             Topics  \
0     Z18      4                              inch cost-peso animal   
1    Z111      4                           tarsier government ferry   
2     Z19      4                                native book spanish   
3    Z110      4                                female catch statue   
4     Z14      4                         bell hear regine-velasquez   
..    ...    ...                                                ...   
343   Z46      1   airport month noise drive local plan beach-beach   
344   Z42      1             ube blog store comfort bench soft ship   
345   Z45      1  boat-trip boat-tour palawan spirit feed gas ma...   
346   Z44      1           steel north gym color purpose body board   
347   Z47      1  gravy dip combination vinegar sweet-sweet rice...   

                                          Video Titles  
0           STRANGEST Things Found In The Philippines!  
1    The Legend of The Chocolate 

# FOR 1800 (disabled: the cell below is entirely commented out)

In [8]:

# document_mapping_file_path = './output-jsons/no-verb.topics.json'
# with open(document_mapping_file_path, 'r') as f:
#     mapping_data = json.load(f)

# print(mapping_data)

# video_titles = []
# directory = '../video_titles'

# for filename in sorted(os.listdir(directory)):
#     if filename.endswith('.txt'):
#         with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
#             video_titles.append(file.read())

# topic_videos = {}

# for item in mapping_data:
#     topic_id = item['topic']
#     video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
#     topic_videos[topic_id] = "\n".join(video_texts)

# # Add the new column by mapping the topic IDs
# df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# # Print the updated DataFrame
# print(df)

# df.to_csv('1800_videos_per_topic.csv', index=False)

