In [29]:
# Store topics as list of lists
import json
# Load the JSON file
filename = '1800.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    if result is None:
        result = []
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

topic_list = [item["texts"] for item in flat_list]

# Coherence

In [30]:
# Gensim attempt
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess

dictionary = Dictionary(topic_list)
corpus = [dictionary.doc2bow(text) for text in topic_list]

raw_corpus =[]
tokenized_corpus = []

# Open the 1800.txt file and store its content as raw corpus
with open('../1800.txt', 'r') as file:
    raw_corpus = file.readlines()

 # Tokenize the raw corpus
tokenized_corpus = [simple_preprocess(doc) for doc in raw_corpus]

corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]

cm_umass = CoherenceModel(topics=topic_list, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm_cv = CoherenceModel(topics=topic_list, texts=tokenized_corpus, dictionary=dictionary, coherence='c_v')
cm_npmi = CoherenceModel(topics=topic_list, texts=tokenized_corpus, dictionary=dictionary, coherence='c_npmi')

print("Total Coherence (u_mass):", cm_umass.get_coherence())
print("Total Coherence (c_v):", cm_cv.get_coherence())
print("Total Coherence (NPMI):", cm_npmi.get_coherence())

individual_coherence_scores = []

# Compute coherence for each topic separately
for topic in topic_list:
    cm = CoherenceModel(topics=[topic], corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_score = cm.get_coherence()
    individual_coherence_scores.append(coherence_score)

id_list = [item["id"] for item in flat_list]
id_hierarchy = [item["level"] for item in flat_list]



Total Coherence (u_mass): -2.8724111805887307
Total Coherence (c_v): nan
Total Coherence (NPMI): inf


In [31]:
# Print coherence scores for each topic
for i, score in enumerate(individual_coherence_scores):
    print(f"{id_list[i]}: {score}, level: {id_hierarchy[i]}")

Z41: -1.2150985311251454, level: 1
Z32: -1.6950804930766026, level: 2
Z28: -1.5875498607828813, level: 3
Z140: -1.5005387061217121, level: 4
Z142: -1.6697160944185103, level: 4
Z139: -1.048532812475656, level: 4
Z141: -2.311092074226601, level: 4
Z29: -5.74169798732148, level: 3
Z143: -1.1532630396868033, level: 4
Z145: -2.973520542532887, level: 4
Z144: -1.821253722832908, level: 4
Z26: -1.7609346251578541, level: 3
Z134: -1.8125857318879086, level: 4
Z135: -1.7349147595852272, level: 4
Z136: -1.6082182482746763, level: 4
Z27: -1.3610369138900649, level: 3
Z138: -0.7252354950715328, level: 4
Z137: -1.0811005126647095, level: 4
Z36: -2.13567232263381, level: 2
Z222: -2.262157325307606, level: 3
Z183: -1.443256754074845, level: 4
Z182: -10.225729100780095, level: 4
Z181: -1.2117097868102635, level: 4
Z220: -4.065546530071412, level: 3
Z176: -2.271376460716905, level: 4
Z177: -2.8325102299900187, level: 4
Z175: -8.792695554966281, level: 4
Z221: -10.485505921322268, level: 3
Z178: -10.22

In [32]:
# Coherence score per hierarchy tier
import numpy as np
from collections import defaultdict

hierarchy_coherence = defaultdict(list)

for i, score in enumerate(individual_coherence_scores):
    hierarchy_coherence[id_hierarchy[i]].append(score)

average_coherence_scores = {}

for hierarchy_id, scores in hierarchy_coherence.items():
    average_coherence_scores[hierarchy_id] = np.mean(scores)

for hierarchy_id, avg_score in average_coherence_scores.items():
    print(f"Hierarchy {hierarchy_id}: Average Coherence Score = {avg_score}")



Hierarchy 1: Average Coherence Score = -3.4375756803945032
Hierarchy 2: Average Coherence Score = -3.985504392547007
Hierarchy 3: Average Coherence Score = -3.120432742060405
Hierarchy 4: Average Coherence Score = -2.7056418341577406


# Compactness

In [33]:
# Compactness
import gensim
import numpy as np
from gensim.models import KeyedVectors
from itertools import combinations

w2v_model = KeyedVectors.load_word2vec_format("../../GoogleNews-vectors-negative300.bin", binary=True)


In [34]:
def topic_compactness(topic_words, model):
    """
    Computes the compactness of a topic by averaging cosine similarity between all word pairs.

    :param topic_words: List of words in a topic
    :param model: Word2Vec model (GoogleNews)
    :return: Compactness score (average pairwise similarity)
    """
    valid_words = [word for word in topic_words if word in model]
    
    if len(valid_words) < 2:
        return 0  # Avoid division by zero if only one valid word exists

    
    similarities = [
        model.similarity(word1, word2)
        for word1, word2 in combinations(valid_words, 2)
    ]
    
    return np.mean(similarities) if similarities else np.nan


compactness_scores = [topic_compactness(topic, w2v_model) for topic in topic_list]

print("Compactness Scores:", compactness_scores)

average_compactness = np.mean([score for score in compactness_scores if not np.isnan(score)])
print("Average Compactness:", average_compactness)


Compactness Scores: [0.14891918, 0.103850015, 0.17024547, 0.07486392, 0.26767072, 0.58259666, 0.0981718, 0.17114286, 0.29909515, 0.083104335, 0.13600712, 0.13987996, 0.061470956, 0.18669066, 0.10386133, 0.31632435, 0.6376089, 0.38540682, 0.069852166, 0.09711573, 0.15191023, 0.045166235, 0, 0.15371507, 0.13137142, 0.16718502, 0.33183512, 0.033596966, 0.06846443, 0.072908975, 0, 0.35421813, 0.10892762, 0.10851393, 0.13918598, 0.14235492, 0.40416414, 0.57564783, 0.21905595, 0.27827424, 0, 0.26465753, 0.17240135, 0.16986284, 0.22971019, 0.15250184, 0.39055955, 0.1411922, 0.12966265, 0.112780295, 0.14484526, 0.0721798, 0.118949376, 0.116874225, 0.7584654, 0.12717633, 0.15280716, 0.073152475, 0.08230693, 0.22942348, 0.18639886, 0.14768803, 0.28287482, 0.12977934, 0.15927587, 0.12147424, 0.021367343, 0.25561404, 0.38808572, 0.3703831, 0.16606234, 0.2967972, 0.2524555, 0.7011443, 0.24254316, 0.3631526, 0.16198798, 0.40551257, 0.14161885, 0.12032196, 0.50515497, 0.028555341, 0.1828981, 0.081887

# Topic Diversity

In [35]:
def topic_diversity(topic_words, top_n=10):
  
    topic_words = [topic[:top_n] for topic in topic_words]  # Consider only top N words per topic
    all_words = [word for topic in topic_words for word in topic]  # Flatten list
    unique_words = set(all_words)  # Get unique words
    
    return 1 - len(unique_words) / len(all_words) if len(all_words) > 0 else 0

diversity_score = topic_diversity(topic_list)
print("Topic Diversity:", diversity_score)

Topic Diversity: 0.4141104294478528


# Jaccard Similarity

In [36]:
from itertools import combinations

def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def average_jaccard_similarity(topic_words):

    topic_sets = [set(topic) for topic in topic_words]
    jaccard_scores = [
        jaccard_similarity(set1, set2)
        for set1, set2 in combinations(topic_sets, 2)
    ]
    
    return sum(jaccard_scores) / len(jaccard_scores) if jaccard_scores else 0


avg_jaccard = average_jaccard_similarity(topic_list)
print("Average Jaccard Similarity:", avg_jaccard)

Average Jaccard Similarity: 0.0016929637503709878
