In [1]:
import json
import pandas as pd
import os 
import re

# Load the JSON file
filename = 'T3.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    if result is None:
        result = []
        
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))


In [2]:
import pandas as pd
import os 
import re

sorted_flat_list = sorted(flat_list, key=lambda x: x['level'], reverse=True)

id = [item['id'] for item in sorted_flat_list]
df = pd.DataFrame({'Topic ID': id})

# add levels
levels = [item['level'] for item in sorted_flat_list]
df['Level'] = levels


# Extract the 'texts' portion and join them into a single string for each item
texts = [' '.join(item['texts']) for item in sorted_flat_list]

# Create a DataFrame with the texts
df['Topics'] = texts

print(df)

#scoring and label df
scoring_df = df.copy()

scoring_df['Topic Label'] = None
scoring_df['Conceptual Relation'] = None
scoring_df['Filipino Context'] = None

# scoring_df.to_csv('1800-quali-criteria.csv', index=False)


    Topic ID  Level                                             Topics
0       Z143      4                                   airport terminal
1       Z142      4                                 flight plane board
2       Z137      4    internet hotel wifus average haircut luke shave
3       Z141      4                          transportation public bus
4       Z134      4     business company issue reason deal owner aware
..       ...    ...                                                ...
286      Z43      1        sauce spicy delicious pork dish meat crispy
287      Z46      1     peso shop coffee market dollar cheap breakfast
288      Z42      1  singing song sing singer vocal feature-feature...
289      Z45      1              island road beach local ride car taxi
290      Z44      1         mountain wall foot river blue bathroom sky

[291 rows x 3 columns]


# FOR 100

In [3]:
# document_mapping_file_path = './output-jsons/100.topics.json'
# with open(document_mapping_file_path, 'r') as f:
#     mapping_data = json.load(f)

# # Create Initial Dataframe with all video IDs
# dataset_folder = "../Previous_THS-ST2_Files/standard_dataset_old"
# data_records = []

# # Regex pattern to extract Video Id and Title from the filename
# filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

# video_titles = []

# for file in sorted(os.listdir(dataset_folder)): 
#     if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
#         match = filename_pattern.match(file)
#         if match:
#             video_id, video_title = match.groups()  # Extract Video Id and Title
#             file_path = os.path.join(dataset_folder, file)

#             video_titles.append(video_title)

# topic_videos = {}

# for item in mapping_data:
#     topic_id = item['topic']
#     video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
#     topic_videos[topic_id] = "\n".join(video_texts)

# # Add the new column by mapping the topic IDs
# df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# # Print the updated DataFrame
# print(df)

# df.to_csv('100_videos_per_topic.csv', index=False)




# FOR 1800

In [4]:

document_mapping_file_path = '../output-jsons/T3.topics.json'
with open(document_mapping_file_path, 'r') as f:
    mapping_data = json.load(f)

print(mapping_data)

video_titles = []
directory = '../../video_titles'

for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            video_titles.append(file.read())

topic_videos = {}

for item in mapping_data:
    topic_id = item['topic']
    video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
    topic_videos[topic_id] = "\n".join(video_texts)

# Add the new column by mapping the topic IDs
df["Video Titles"] = df["Topic ID"].map(topic_videos).fillna("")

# Print the updated DataFrame
print(df)

df.to_csv('T3_videos_per_topic.csv', index=False)



[{'topic': 'Z235', 'doc': [['936', 1.0], ['1049', 1.0], ['1258', 1.0], ['2549', 1.0], ['550', 1.0], ['690', 1.0], ['838', 1.0], ['2163', 1.0], ['817', 1.0], ['934', 1.0], ['2441', 1.0], ['2600', 1.0], ['1435', 1.0], ['2017', 1.0], ['524', 1.0], ['2433', 1.0], ['600', 1.0], ['660', 1.0], ['888', 1.0], ['1092', 1.0], ['1217', 1.0], ['2521', 1.0], ['128', 1.0], ['1428', 1.0], ['1991', 1.0], ['847', 1.0], ['1723', 1.0], ['1965', 1.0], ['1409', 1.0], ['463', 1.0], ['685', 1.0], ['2153', 1.0], ['2300', 1.0], ['2213', 1.0], ['2567', 1.0], ['1596', 1.0], ['2099', 1.0], ['1816', 1.0], ['1924', 1.0], ['2461', 1.0], ['308', 1.0], ['1525', 1.0], ['1882', 1.0], ['1912', 1.0], ['1078', 1.0], ['1937', 1.0], ['97', 1.0], ['703', 1.0], ['1022', 1.0], ['70', 1.0], ['333', 1.0], ['2109', 1.0], ['351', 1.0], ['569', 1.0], ['350', 1.0], ['516', 1.0], ['2627', 1.0], ['1929', 1.0], ['2056', 1.0], ['1292', 1.0], ['1640', 1.0], ['1123', 1.0], ['1925', 1.0], ['2696', 1.0], ['2440', 1.0], ['2499', 1.0], ['1632',

In [5]:
# Statistics, how many videos under the topics

df['Video Count'] = df['Video Titles'].apply(lambda x: len(set(x.split("\n")) if x else []))

pd.set_option('display.max_rows', None)  
print(df[['Topic ID', 'Video Count']])
pd.reset_option('display.max_rows')  


    Topic ID  Video Count
0       Z143          249
1       Z142          299
2       Z137          128
3       Z141          315
4       Z134          517
5       Z140          516
6       Z135          287
7       Z136          292
8       Z138          256
9       Z139          667
10      Z132          157
11      Z133          224
12       Z15          116
13       Z17          457
14       Z14          100
15       Z16          429
16       Z18          116
17       Z12          348
18       Z13          101
19       Z11          377
20      Z115           55
21      Z116          277
22      Z117          603
23      Z112          827
24      Z111          511
25      Z113          461
26      Z114          650
27      Z110          320
28       Z19          291
29      Z121          670
30      Z123          242
31      Z125          311
32      Z127          707
33      Z120          347
34      Z124          202
35      Z122          402
36      Z126          182
37      Z118