In [1]:
import json
import pandas as pd
import os 
import re

# Load the JSON file
filename = 'T3.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    if result is None:
        result = []
        
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))


In [2]:
import pandas as pd
import os 
import re

sorted_flat_list = sorted(flat_list, key=lambda x: x['level'], reverse=True)

id = [item['id'] for item in sorted_flat_list]
df = pd.DataFrame({'Topic ID': id})

# add levels
levels = [item['level'] for item in sorted_flat_list]
df['Level'] = levels


# Extract the 'texts' portion and join them into a single string for each item
texts = [' '.join(item['texts']) for item in sorted_flat_list]

# Create a DataFrame with the texts
df['Topics'] = texts

print(df)

#scoring and label df
scoring_df = df.copy()

scoring_df['Topic Label'] = None
scoring_df['Conceptual Relation'] = None
scoring_df['Filipino Context'] = None

# scoring_df.to_csv('1800-quali-criteria.csv', index=False)


    Topic ID  Level                                             Topics
0       Z170      3                         pineapple orange apple cut
1       Z172      3                               banana banana-banana
2       Z171      3                                        mango peach
3       Z168      3                                       milk coconut
4       Z167      3                                    vegetable fruit
..       ...    ...                                                ...
267     Z316      1  popular beauty incredible star movie actual power
268      Z38      1      peso shop coffee dollar store cheap expensive
269     Z310      1        sauce delicious spicy dish pork meat crispy
270      Z32      1                 road walk local car trip main busy
271     Z313      1    garlic onion pepper chili ingredient cook salty

[272 rows x 3 columns]


# FOR 100

In [3]:
# document_mapping_file_path = './output-jsons/100.topics.json'
# with open(document_mapping_file_path, 'r') as f:
#     mapping_data = json.load(f)

# # Create Initial Dataframe with all video IDs
# dataset_folder = "../Previous_THS-ST2_Files/standard_dataset_old"
# data_records = []

# # Regex pattern to extract Video Id and Title from the filename
# filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

# video_titles = []

# for file in sorted(os.listdir(dataset_folder)): 
#     if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
#         match = filename_pattern.match(file)
#         if match:
#             video_id, video_title = match.groups()  # Extract Video Id and Title
#             file_path = os.path.join(dataset_folder, file)

#             video_titles.append(video_title)

# topic_videos = {}

# for item in mapping_data:
#     topic_id = item['topic']
#     video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
#     topic_videos[topic_id] = "\n".join(video_texts)

# # Add the new column by mapping the topic IDs
# df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# # Print the updated DataFrame
# print(df)

# df.to_csv('100_videos_per_topic.csv', index=False)




# FOR 1800

In [None]:

document_mapping_file_path = '../output-jsons/T3.topics.json'
with open(document_mapping_file_path, 'r') as f:
    mapping_data = json.load(f)

print(mapping_data)

video_titles = []
directory = '../../video_titles'

for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            video_titles.append(file.read())

topic_videos = {}

for item in mapping_data:
    topic_id = item['topic']
    video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
    topic_videos[topic_id] = "\n".join(video_texts)

# Add the new column by mapping the topic IDs
df["Video Titles"] = df["Topic ID"].map(topic_videos).fillna("")

# Print the updated DataFrame
print(df)

df.to_csv('T3_videos_per_topic.csv', index=False)



[{'topic': 'Z22', 'doc': [['65', 1.0], ['105', 1.0], ['321', 1.0], ['337', 1.0], ['494', 1.0], ['514', 1.0], ['536', 1.0], ['721', 1.0], ['759', 1.0], ['800', 1.0], ['910', 1.0], ['986', 1.0], ['1115', 1.0], ['1378', 1.0], ['1386', 1.0], ['1389', 1.0], ['1429', 1.0], ['1442', 1.0], ['1587', 1.0], ['1639', 1.0], ['1735', 1.0], ['1757', 1.0], ['1786', 1.0], ['1896', 1.0], ['1938', 1.0], ['2025', 1.0], ['2453', 1.0], ['2504', 1.0], ['315', 1.0], ['631', 1.0], ['853', 1.0], ['1224', 1.0], ['1242', 1.0], ['1600', 1.0], ['1637', 1.0], ['1791', 1.0], ['1844', 1.0], ['2261', 1.0], ['2267', 1.0], ['2381', 1.0], ['2581', 1.0], ['367', 1.0], ['589', 1.0], ['811', 1.0], ['1288', 1.0], ['1348', 1.0], ['1621', 1.0], ['2019', 1.0], ['2285', 1.0], ['2305', 1.0], ['2479', 1.0], ['2515', 1.0], ['78', 1.0], ['296', 1.0], ['479', 1.0], ['498', 1.0], ['1027', 1.0], ['1584', 1.0], ['1729', 1.0], ['1783', 1.0], ['1949', 1.0], ['2182', 1.0], ['2345', 1.0], ['2472', 1.0], ['2676', 1.0], ['106', 1.0], ['725', 1

In [7]:
df = df.head(3)
df.to_csv('small_videos_per_topic.csv', index=False)

In [5]:
# Statistics, how many videos under the topics

df['Video Count'] = df['Video Titles'].apply(lambda x: len(set(x.split("\n")) if x else []))

pd.set_option('display.max_rows', None)  
print(df[['Topic ID', 'Video Count']])
pd.reset_option('display.max_rows')  


   Topic ID  Video Count
0      Z170          362
1      Z172          267
2      Z171          278
3      Z168          474
4      Z167          529
5      Z169          211
6     Z1161          307
7     Z1162          287
8     Z1160           81
9     Z1170          603
10    Z1166          430
11    Z1169          353
12    Z1168          374
13    Z1167          281
14    Z1163          639
15    Z1164          180
16    Z1165          507
17     Z196          539
18     Z197          295
19     Z193          484
