In [26]:
# Store topics as list of lists
import json
# Load the JSON file
filename = 'fil-1.nodes.json'
filepath = './output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

In [27]:
# Gensim attempt
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

topic_list = [item["texts"] for item in flat_list]


dictionary = Dictionary(topic_list)
corpus = [dictionary.doc2bow(text) for text in topic_list]

cm = CoherenceModel(topics=topic_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')

print("Total Coherence:", cm.get_coherence(), "\n")

individual_coherence_scores = []

# Compute coherence for each topic separately
for topic in topic_list:
    cm = CoherenceModel(topics=[topic], corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_score = cm.get_coherence()
    individual_coherence_scores.append(coherence_score)

id_list = [item["id"] for item in flat_list]
id_hierarchy = [item["level"] for item in flat_list]



Total Coherence: -0.3897177449803579 



In [28]:
# Print coherence scores for each topic
for i, score in enumerate(individual_coherence_scores):
    print(f"{id_list[i]}: {score}, level: {id_hierarchy[i]}")

Z41: -0.49197812725430456, level: 1
Z31: -0.49197812725430456, level: 2
Z22: -0.8195664988364186, level: 3
Z18: 1.7449997001404864e-10, level: 4
Z17: -0.3960841030026115, level: 4
Z16: -0.3300700858105812, level: 4
Z15: -0.7261541890105438, level: 4
Z21: -0.8276569291683186, level: 3
Z14: -0.8086717104933112, level: 4
Z12: -0.8317766165279717, level: 4
Z13: 1.7449997001404864e-10, level: 4
Z11: -0.6007275563166026, level: 4
Z32: -0.8949615209449598, level: 2
Z23: -0.4951051288218181, level: 3
Z110: -0.39608410301507585, level: 4
Z111: -0.39608410301507585, level: 4
Z19: -0.46209812019879687, level: 4
Z26: -0.7074916140213328, level: 3
Z119: -0.2703100719848596, level: 4
Z120: -0.4054651080209144, level: 4
Z118: 8.724998501083059e-11, level: 4
Z25: -0.7324081923013745, level: 3
Z116: -0.5780743514309079, level: 4
Z115: -0.2970630772124886, level: 4
Z117: 5.81665826598646e-11, level: 4
Z24: -0.7211907603257904, level: 3
Z114: -0.3300700858105812, level: 4
Z113: -0.4158883081876422, level

In [40]:
# Coherence score per hierarchy tier
import numpy as np
from collections import defaultdict

hierarchy_coherence = defaultdict(list)

for i, score in enumerate(individual_coherence_scores):
    hierarchy_coherence[id_hierarchy[i]].append(score)

average_coherence_scores = {}

for hierarchy_id, scores in hierarchy_coherence.items():
    average_coherence_scores[hierarchy_id] = np.mean(scores)

for hierarchy_id, avg_score in average_coherence_scores.items():
    print(f"Hierarchy {hierarchy_id}: Average Coherence Score = {avg_score}")



Hierarchy 1: Average Coherence Score = -0.719087152558066
Hierarchy 2: Average Coherence Score = -0.7580699945457431
Hierarchy 3: Average Coherence Score = -0.6430071111998164
Hierarchy 4: Average Coherence Score = -0.27636368881287476


In [None]:
# Determine topics per video
import json
from collections import defaultdict

topic_map_path = './output-jsons/fil-1.topics.json'
with open(topic_map_path, 'r') as f:
    topic_mapping_data = json.load(f)

topics_per_vid_mapping = defaultdict(list)

# use dictionary to change mapping to list topics per video
for topic_obj in topic_mapping_data:
    topic = topic_obj["topic"]
    
    for doc_entry in topic_obj["doc"]:
        doc_number, probability = doc_entry
        topics_per_vid_mapping[doc_number].append((topic, probability))

for doc, topic_list in topics_per_vid_mapping.items():
    topics_str = ", ".join(f"{t}: {p:.2f}" for t, p in topic_list)
    print(f"Document {doc} - {topics_str}")

# Document numbers are based on the order they are fed into the HLTA model or which line they are in orig text file, starts from 0 til n

In [None]:
# link the top words to the video instead of just the topic id
flat_mappipng = {entry["id"]: entry["texts"] for entry in flat_list}

In [None]:
#Save information to a file
outputpath = "./evaluation-metrics/" + filename + ".txt"

with open(outputpath, "w") as file:
    file.write(filename + "\n")
    file.write("Total Coherence: " + str(cm.get_coherence()) + "\n")

    for hierarchy_id, avg_score in average_coherence_scores.items():
        file.write(f"Hierarchy {hierarchy_id}: Average Coherence Score = {avg_score}\n")