In [2]:
# Store topics as list of lists
import json
# Load the JSON file
filename = 'T3.nodes.json'
filepath = '../output-jsons/' + filename
with open(filepath, 'r') as f:
    data = json.load(f)

def flatten_tree(node, depth=1, result=None):
    if result is None:
        result = []
    """Recursively flattens a tree structure into a list."""
    result.append({
        "id": node["id"],
        "texts": node["text"].split(),
        "level": depth
    })
    
    for child in node.get("children", []):
        flatten_tree(child, depth + 1, result)
    
    return result

# Process each root node in the JSON
flat_list = []
for root in data:
    flat_list.extend(flatten_tree(root))

topic_list = [item["texts"] for item in flat_list]

# Coherence

In [12]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess
import pandas as pd

# Load and preprocess text
with open('../T3-new-stopwords.txt', 'r') as file:
    raw_text = file.readlines()

tokenized = [simple_preprocess(doc) for doc in raw_text]

# Create dictionary and filter extremes
dictionary = Dictionary(tokenized)
dictionary.filter_extremes(no_below=5, no_above=0.5)  # Adjust these values

# Create corpus
corpus = [dictionary.doc2bow(text) for text in tokenized]

# Filter topics to only include words that exist in dictionary
def filter_topic_words(topics, dictionary):
    filtered_topics = []
    for topic in topics:
        filtered_topic = [word for word in topic if dictionary.token2id.get(word)]
        if len(filtered_topic) >= 2:  # Need at least 2 words for coherence
            filtered_topics.append(filtered_topic)
    return filtered_topics

filtered_topics = filter_topic_words(topic_list, dictionary)

cm_umass = None
cm_cv = None
cm_npmi = None

# Calculate coherence scores
if filtered_topics:
    cm_umass = CoherenceModel(topics=filtered_topics, corpus=corpus, 
                            dictionary=dictionary, coherence='u_mass')
    cm_cv = CoherenceModel(topics=filtered_topics, texts=tokenized, 
                          corpus=corpus, dictionary=dictionary, coherence='c_v')
    cm_npmi = CoherenceModel(topics=filtered_topics, texts=tokenized,
                            corpus=corpus, dictionary=dictionary, coherence='c_npmi')

    print("Total Coherence (u_mass):", cm_umass.get_coherence())
    print("Total Coherence (c_v):", cm_cv.get_coherence())
    print("Total Coherence (NPMI):", cm_npmi.get_coherence())
else:
    print("No valid topics after filtering")


metrics = ['u_mass', 'c_v', 'c_npmi']
scores = [
    cm_umass.get_coherence() if filtered_topics else None,
    cm_cv.get_coherence() if filtered_topics else None,
    cm_npmi.get_coherence() if filtered_topics else None
]

df_metrics = pd.DataFrame({'metric': metrics, 'score': scores})
print(df_metrics)

df_metrics.to_csv('hlta-coherence.csv', index=False)



Total Coherence (u_mass): -1.7523811352417202
Total Coherence (c_v): 0.7137993820988936
Total Coherence (NPMI): 0.14104984019578662
   metric     score
0  u_mass -1.752381
1     c_v  0.713799
2  c_npmi  0.141050


In [4]:
individual_coherence_scores = []

# Compute coherence for each topic separately
for topic in topic_list:
    cm = CoherenceModel(topics=[topic], corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_score = cm.get_coherence()
    individual_coherence_scores.append(coherence_score)

id_list = [item["id"] for item in flat_list]
id_hierarchy = [item["level"] for item in flat_list]

# Print coherence scores for each topic
for i, score in enumerate(individual_coherence_scores):
    print(f"{id_list[i]}: {score}, level: {id_hierarchy[i]}")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Z35: -1.3394718185692296, level: 1
Z214: -1.9960790624660827, level: 2
Z170: -1.5712775056772192, level: 3
Z172: nan, level: 3
Z171: -1.866420997934191, level: 3
Z213: -1.5625176762786361, level: 2
Z168: -0.8799230876791372, level: 3
Z167: -0.894172926599411, level: 3
Z169: -1.8654768547500442, level: 3
Z315: -2.062337597923038, level: 1
Z242: -2.1004900395368957, level: 2
Z1161: -1.6239664560142961, level: 3
Z1162: -1.613595922526431, level: 3
Z1160: -1.8538133427593961, level: 3
Z244: -2.0852669116665106, level: 2
Z1170: -2.366594992401294, level: 3
Z1166: -2.160966358635213, level: 3
Z1169: -2.639117236529277, level: 3
Z1168: -1.364315454361927, level: 3
Z1167: -2.3178629665621853, level: 3
Z243: -1.9580669156184929, level: 2
Z1163: -1.2143190963584483, level: 3
Z1164: -1.987481292526801, level: 3
Z1165: -1.5287777239395437, level: 3
Z37: -1.3175623238225431, level: 1
Z222: -0.7272821256558711, level: 2
Z196: -0.9334023723331503, level: 3
Z197: -0.5921248027384535, level: 3
Z221: -1

In [5]:
# Coherence score per hierarchy tier
import numpy as np
from collections import defaultdict

hierarchy_coherence = defaultdict(list)

for i, score in enumerate(individual_coherence_scores):
    hierarchy_coherence[id_hierarchy[i]].append(score)

average_coherence_scores = {}

for hierarchy_id, scores in hierarchy_coherence.items():
    average_coherence_scores[hierarchy_id] = np.mean(scores)

for hierarchy_id, avg_score in average_coherence_scores.items():
    print(f"Hierarchy {hierarchy_id}: Average Coherence Score = {avg_score}")



Hierarchy 1: Average Coherence Score = -1.3625529019358922
Hierarchy 2: Average Coherence Score = -1.6265689558866905
Hierarchy 3: Average Coherence Score = nan


# Compactness

In [6]:
# Compactness
import gensim
import numpy as np
from gensim.models import KeyedVectors
from itertools import combinations

w2v_model = KeyedVectors.load_word2vec_format("../../GoogleNews-vectors-negative300.bin", binary=True)


In [7]:
def topic_compactness(topic_words, model):
    """
    Computes the compactness of a topic by averaging cosine similarity between all word pairs.

    :param topic_words: List of words in a topic
    :param model: Word2Vec model (GoogleNews)
    :return: Compactness score (average pairwise similarity)
    """
    valid_words = [word for word in topic_words if word in model]
    
    if len(valid_words) < 2:
        return 0  # Avoid division by zero if only one valid word exists

    
    similarities = [
        model.similarity(word1, word2)
        for word1, word2 in combinations(valid_words, 2)
    ]
    
    return np.mean(similarities) if similarities else np.nan


compactness_scores = [topic_compactness(topic, w2v_model) for topic in topic_list]

print("Compactness Scores:", compactness_scores)

average_compactness = np.mean([score for score in compactness_scores if not np.isnan(score)])
print("Average Compactness:", average_compactness)


Compactness Scores: [0.46513158, 0.50431436, 0.26193258, 0, 0.56328726, 0.3424141, 0.3594669, 0.56144077, 0.25687903, 0.26695612, 0.17316696, 0.24573433, 0.8380803, 0.150697, 0.40481082, 0.15352595, 0.07050461, 0.099383324, 0.79014826, 0.24635366, 0.30890188, 0.650118, 0.17483066, 0.43139413, 0.32695115, 0.46912456, 0.60849714, 0.34237242, 0.44998875, 0, 0.3208941, 0.74550235, 0.40958264, 0.2725931, 0.1818581, 0.36734998, 0.40958264, 0.33303618, 0, 0.13891512, 0.15955617, 0.24887152, 0.18298838, 0.10908553, 0.022208909, 0.08576179, 0.09230634, 0.13980578, 0.102254905, 0.27704936, 0.054357916, 0.12766588, 0.14172262, 0.13399203, 0.14844787, 0.14643963, 0.09840547, 0, 0.17158733, 0.34517944, 0.21190906, 0.31247514, 0.35909358, 0.2801892, 0.5316955, 0.18829194, 0.14472704, 0.2152567, 0.44952935, 0.48205543, 0.44354463, 0.6130097, 0.28039038, 0.099986024, 0, 0.36954978, 0.21232674, 0.22701798, 0.36518723, 0.11120105, 0.19079971, 0.3604807, 0.16500725, 0.08611638, -0.007503092, 0.4709963, 0

# Topic Diversity

In [8]:
def topic_diversity(topic_words, top_n=10):
  
    topic_words = [topic[:top_n] for topic in topic_words]  # Consider only top N words per topic
    all_words = [word for topic in topic_words for word in topic]  # Flatten list
    unique_words = set(all_words)  # Get unique words
    
    return 1 - len(unique_words) / len(all_words) if len(all_words) > 0 else 0

diversity_score = topic_diversity(topic_list)
print("Topic Diversity:", diversity_score)

Topic Diversity: 0.34453124999999996


# Jaccard Similarity

In [9]:
from itertools import combinations

def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

def average_jaccard_similarity(topic_words):

    topic_sets = [set(topic) for topic in topic_words]
    jaccard_scores = [
        jaccard_similarity(set1, set2)
        for set1, set2 in combinations(topic_sets, 2)
    ]
    
    return sum(jaccard_scores) / len(jaccard_scores) if jaccard_scores else 0


avg_jaccard = average_jaccard_similarity(topic_list)
print("Average Jaccard Similarity:", avg_jaccard)

Average Jaccard Similarity: 0.0018793736410949375
