In [11]:
import json
import pandas as pd
import os 
import re

# Load the JSON file
filename = '1800.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    if result is None:
        result = []
        
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))


In [12]:
import pandas as pd
import os 
import re

sorted_flat_list = sorted(flat_list, key=lambda x: x['level'], reverse=True)

id = [item['id'] for item in sorted_flat_list]
df = pd.DataFrame({'Topic ID': id})

# add levels
levels = [item['level'] for item in sorted_flat_list]
df['Level'] = levels


# Extract the 'texts' portion and join them into a single string for each item
texts = [' '.join(item['texts']) for item in sorted_flat_list]

# Create a DataFrame with the texts
df['Topics'] = texts

print(df)

#scoring and label df
scoring_df = df.copy()

scoring_df['Topic Label'] = None
scoring_df['Conceptual Relation'] = None
scoring_df['Filipino Context'] = None

# scoring_df.to_csv('1800-quali-criteria.csv', index=False)


    Topic ID  Level                                             Topics
0       Z140      4                                driver busy journey
1       Z142      4                           clothes shirt shoe bunch
2       Z139      4                               flight airport plane
3       Z141      4                                    short luck wave
4       Z143      4                             property concrete land
..       ...    ...                                                ...
352      Z41      1              island peso walk road price car beach
353      Z43      1    rain retire building rainy cost season province
354      Z42      1     taste sauce flavor rice chicken delicious meat
355      Z45      1  asian country war government history school ma...
356      Z44      1  singer song vocal singing sing feature-friday-...

[357 rows x 3 columns]


# FOR 100

In [13]:
# document_mapping_file_path = './output-jsons/100.topics.json'
# with open(document_mapping_file_path, 'r') as f:
#     mapping_data = json.load(f)

# # Create Initial Dataframe with all video IDs
# dataset_folder = "../Previous_THS-ST2_Files/standard_dataset_old"
# data_records = []

# # Regex pattern to extract Video Id and Title from the filename
# filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

# video_titles = []

# for file in sorted(os.listdir(dataset_folder)): 
#     if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
#         match = filename_pattern.match(file)
#         if match:
#             video_id, video_title = match.groups()  # Extract Video Id and Title
#             file_path = os.path.join(dataset_folder, file)

#             video_titles.append(video_title)

# topic_videos = {}

# for item in mapping_data:
#     topic_id = item['topic']
#     video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
#     topic_videos[topic_id] = "\n".join(video_texts)

# # Add the new column by mapping the topic IDs
# df["Video Titles"] = df["ID"].map(topic_videos).fillna("")

# # Print the updated DataFrame
# print(df)

# df.to_csv('100_videos_per_topic.csv', index=False)




# FOR 1800

In [14]:

document_mapping_file_path = '../output-jsons/1800.topics.json'
with open(document_mapping_file_path, 'r') as f:
    mapping_data = json.load(f)

print(mapping_data)

video_titles = []
directory = '../../video_titles'

for filename in sorted(os.listdir(directory)):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            video_titles.append(file.read())

topic_videos = {}

for item in mapping_data:
    topic_id = item['topic']
    video_texts = [video_titles[int(doc_id)] for doc_id, _ in item['doc'] if int(doc_id) < len(video_titles)]
    topic_videos[topic_id] = "\n".join(video_texts)

# Add the new column by mapping the topic IDs
df["Video Titles"] = df["Topic ID"].map(topic_videos).fillna("")

# Print the updated DataFrame
print(df)

df.to_csv('1800_videos_per_topic.csv', index=False)



[{'topic': 'Z1110', 'doc': [['452', 1.0], ['644', 1.0], ['699', 1.0], ['793', 1.0], ['979', 1.0], ['1210', 1.0], ['1477', 1.0], ['1620', 1.0], ['146', 0.97], ['169', 0.97], ['261', 0.97], ['447', 0.97], ['539', 0.97], ['669', 0.97], ['1056', 0.97], ['1059', 0.97], ['1145', 0.97], ['1472', 0.97], ['1595', 0.97], ['50', 0.94], ['158', 0.94], ['197', 0.94], ['290', 0.94], ['304', 0.94], ['322', 0.94], ['334', 0.94], ['397', 0.94], ['505', 0.94], ['511', 0.94], ['550', 0.94], ['756', 0.94], ['848', 0.94], ['895', 0.94], ['966', 0.94], ['985', 0.94], ['1034', 0.94], ['1072', 0.94], ['1073', 0.94], ['1130', 0.94], ['1153', 0.94], ['1214', 0.94], ['1238', 0.94], ['1245', 0.94], ['1258', 0.94], ['1277', 0.94], ['1330', 0.94], ['1341', 0.94], ['1377', 0.94], ['1503', 0.94], ['1665', 0.94], ['1674', 0.94], ['1706', 0.94], ['1711', 0.94], ['1748', 0.94], ['1753', 0.94], ['206', 0.72], ['266', 0.72], ['394', 0.72], ['757', 0.72], ['835', 0.72], ['856', 0.72], ['944', 0.72], ['1195', 0.72], ['1406'

In [15]:
# Statistics, how many videos under the topics

df['Video Count'] = df['Video Titles'].apply(lambda x: len(set(x.split("\n")) if x else []))

pd.set_option('display.max_rows', None)  
print(df[['Topic ID', 'Video Count']])
pd.reset_option('display.max_rows')  


    Topic ID  Video Count
0       Z140          144
1       Z142           66
2       Z139           76
3       Z141          271
4       Z143           64
5       Z145          164
6       Z144           31
7       Z134          282
8       Z135          275
9       Z136          220
10      Z138          280
11      Z137          219
12      Z183           27
13      Z182          121
14      Z181          469
15      Z176          173
16      Z177          263
17      Z175          110
18      Z178          118
19      Z180           87
20      Z179           28
21      Z159           52
22      Z160           46
23      Z158          203
24      Z164          151
25      Z165          284
26      Z161           72
27      Z163          105
28      Z162           73
29      Z115          159
30       Z17          159
31       Z14          219
32      Z110           30
33       Z16          181
34      Z116          128
35      Z112           28
36      Z113          108
37       Z19