In [19]:
# Store topics as list of lists
import json
# Load the JSON file
filename = '1800.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=[]):
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

In [20]:
# Gensim attempt
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

topic_list = [item["texts"] for item in flat_list]


dictionary = Dictionary(topic_list)
corpus = [dictionary.doc2bow(text) for text in topic_list]

cm = CoherenceModel(topics=topic_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')

print("Total Coherence (umass):", cm.get_coherence(), "\n")

individual_coherence_scores = []

# Compute coherence for each topic separately
for topic in topic_list:
    cm = CoherenceModel(topics=[topic], corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_score = cm.get_coherence()
    individual_coherence_scores.append(coherence_score)

id_list = [item["id"] for item in flat_list]
id_hierarchy = [item["level"] for item in flat_list]

print(topic_list)



Total Coherence (umass): -0.38869912285255465 

[['island', 'peso', 'walk', 'road', 'price', 'car', 'beach'], ['peso', 'price', 'plan', 'month', 'property', 'cheap', 'flight'], ['flight', 'airport', 'driver', 'busy', 'plane', 'short', 'clothes'], ['driver', 'busy', 'journey'], ['clothes', 'shirt', 'shoe', 'bunch'], ['flight', 'airport', 'plane'], ['short', 'luck', 'wave'], ['property', 'concrete', 'land', 'plant', 'house-house', 'field', 'wall'], ['property', 'concrete', 'land'], ['plant', 'house-house', 'field', 'pressure', 'board', 'space'], ['wall', 'pipe', 'folk', 'jay'], ['plan', 'month', 'meet', 'corner', 'week', 'pick', 'notice'], ['meet', 'advice', 'online'], ['plan', 'month', 'corner', 'week', 'project'], ['pick', 'start', 'notice', 'comfortable', 'process', 'difference', 'boy'], ['peso', 'price', 'cheap', 'dollar', 'cheaper'], ['cheaper', 'cheap'], ['peso', 'price', 'dollar'], ['wife', 'daniel', 'filipina', 'gas', 'electricity', 'challenge', 'internet'], ['wife', 'filipina', 

In [21]:
# Print coherence scores for each topic
for i, score in enumerate(individual_coherence_scores):
    print(f"{id_list[i]}: {score}, level: {id_hierarchy[i]}")

Z41: -0.8855805160039462, level: 1
Z32: -1.0649599925588604, level: 2
Z28: -0.6769663197006881, level: 3
Z140: -0.4620981201779635, level: 4
Z142: -0.3465735900455726, level: 4
Z139: -0.27031007195490947, level: 4
Z141: -0.4620981201388968, level: 4
Z29: -0.6109523025156648, level: 3
Z143: -0.27031007195490947, level: 4
Z145: -0.41588830812500716, level: 4
Z144: -0.3465735900455726, level: 4
Z26: -0.6551767430884733, level: 3
Z134: -0.4620981201388968, level: 4
Z135: -0.5205379369287033, level: 4
Z136: -0.2970630771540146, level: 4
Z27: -0.7742402020233582, level: 3
Z138: 1.1720002745387195e-10, level: 4
Z137: -0.4620981202756302, level: 4
Z36: -0.5455835726790205, level: 2
Z222: -0.47956955549399705, level: 3
Z183: -0.2703100719679317, level: 4
Z182: -0.7324081922630953, level: 4
Z181: 7.813327762257254e-11, level: 4
Z220: -0.49510512877047497, level: 3
Z176: 1.1720002745387195e-10, level: 4
Z177: -0.4620981201388968, level: 4
Z175: 1.1720002745387195e-10, level: 4
Z221: -0.6109523025

In [22]:
# Coherence score per hierarchy tier
import numpy as np
from collections import defaultdict

hierarchy_coherence = defaultdict(list)

for i, score in enumerate(individual_coherence_scores):
    hierarchy_coherence[id_hierarchy[i]].append(score)

average_coherence_scores = {}

for hierarchy_id, scores in hierarchy_coherence.items():
    average_coherence_scores[hierarchy_id] = np.mean(scores)

for hierarchy_id, avg_score in average_coherence_scores.items():
    print(f"Hierarchy {hierarchy_id}: Average Coherence Score = {avg_score}")



Hierarchy 1: Average Coherence Score = -0.6940108086231039
Hierarchy 2: Average Coherence Score = -0.7680175748579444
Hierarchy 3: Average Coherence Score = -0.6230515291970717
Hierarchy 4: Average Coherence Score = -0.28915433669692986


In [23]:
# Compactness
import gensim
import numpy as np
from gensim.models import KeyedVectors
from itertools import combinations

w2v_model = KeyedVectors.load_word2vec_format("../../GoogleNews-vectors-negative300.bin", binary=True)


In [25]:
def topic_compactness(topic_words, model):
    """
    Computes the compactness of a topic by averaging cosine similarity between all word pairs.

    :param topic_words: List of words in a topic
    :param model: Word2Vec model (GoogleNews)
    :return: Compactness score (average pairwise similarity)
    """
    valid_words = [word for word in topic_words if word in model]
    
    if len(valid_words) < 2:
        return 0  # Avoid division by zero if only one valid word exists

    
    similarities = [
        model.similarity(word1, word2)
        for word1, word2 in combinations(valid_words, 2)
    ]
    
    return np.mean(similarities) if similarities else np.nan


compactness_scores = [topic_compactness(topic, w2v_model) for topic in topic_list]

print("Compactness Scores:", compactness_scores)

average_compactness = np.mean([score for score in compactness_scores if not np.isnan(score)])
print("Average Compactness:", average_compactness)


Compactness Scores: [0.14891918, 0.103850015, 0.17024547, 0.07486392, 0.26767072, 0.58259666, 0.0981718, 0.17114286, 0.29909515, 0.083104335, 0.13600712, 0.13987996, 0.061470956, 0.18669066, 0.10386133, 0.31632435, 0.6376089, 0.38540682, 0.069852166, 0.09711573, 0.15191023, 0.045166235, 0, 0.15371507, 0.13137142, 0.16718502, 0.33183512, 0.033596966, 0.06846443, 0.072908975, 0, 0.35421813, 0.10892762, 0.10851393, 0.13918598, 0.14235492, 0.40416414, 0.57564783, 0.21905595, 0.27827424, 0, 0.26465753, 0.17240135, 0.16986284, 0.22971019, 0.15250184, 0.39055955, 0.1411922, 0.12966265, 0.112780295, 0.14484526, 0.0721798, 0.118949376, 0.116874225, 0.7584654, 0.12717633, 0.15280716, 0.073152475, 0.08230693, 0.22942348, 0.18639886, 0.14768803, 0.28287482, 0.12977934, 0.15927587, 0.12147424, 0.021367343, 0.25561404, 0.38808572, 0.3703831, 0.16606234, 0.2967972, 0.2524555, 0.7011443, 0.24254316, 0.3631526, 0.16198798, 0.40551257, 0.14161885, 0.12032196, 0.50515497, 0.028555341, 0.1828981, 0.081887