In [2]:
import os
import json
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess

# Input locations: the folder of topic-tree JSON files to score and the
# reference text corpus they are all scored against.
json_folder = 'hyper-param-testing-outputs/'
text_file_path = 'T3-new-stopwords.txt'  # Assuming same text corpus is used for all

# Load the raw corpus; every line of the text file is treated as one document.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (e.g. cp1252 on Windows).
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open(text_file_path, 'r', encoding='utf-8') as file:
    raw_corpus = file.readlines()

# Tokenize each document with gensim's simple_preprocess (one token list per line).
tokenized_corpus = [simple_preprocess(doc) for doc in raw_corpus]

def flatten_tree(node, depth=1, result=None):
    """Flatten a topic tree into a pre-order list of node records.

    Args:
        node: Mapping with an "id", a whitespace-separated "text" string and
            an optional "children" list of nodes of the same shape.
        depth: Level assigned to ``node``; each generation of children adds 1.
        result: Optional list the records are appended to; a new list is
            created when it is None.

    Returns:
        The list of ``{"id", "texts", "level"}`` dicts, every parent placed
        before its children and siblings kept in their original order.

    Raises:
        KeyError: If a node lacks "id" or "text".
    """
    if result is None:
        result = []
    # An explicit stack replaces recursion so that very deep trees cannot
    # run into the interpreter's recursion limit.
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        result.append({
            "id": current["id"],
            "texts": current["text"].split(),
            "level": level
        })
        # A missing or null "children" entry means the node is a leaf.
        children = current.get("children") or []
        # Push in reverse so the first child is popped (and emitted) first.
        pending.extend((child, level + 1) for child in reversed(children))
    return result


# Score every topic-tree JSON file in the folder against the shared corpus.
# Sorted so the report order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(json_folder)):
    if not filename.endswith('.json'):
        continue

    filepath = os.path.join(json_folder, filename)
    print(f"\nProcessing: {filename}")

    # Load the JSON file
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The file normally holds a list of root nodes; a single root object is
    # accepted too and treated as a one-tree forest.
    roots = [data] if isinstance(data, dict) else data

    # Flatten all trees of the file into one list of node records
    flat_list = []
    for root in roots:
        flatten_tree(root, result=flat_list)

    # Every node of the tree (at any level) is scored as one topic.
    # NOTE(review): topic words come from a plain str.split(), while the corpus
    # is tokenized by simple_preprocess -- confirm the two normalizations agree.
    topic_list = [item["texts"] for item in flat_list]
    if not topic_list:
        print("No topics found; skipping.")
        continue

    # The dictionary holds only the topic vocabulary; the corpus is projected
    # onto it, so corpus words that appear in no topic are dropped.
    dictionary = Dictionary(topic_list)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]

    # Compute total u_mass coherence (aggregated over all topics of the file)
    cm_umass = CoherenceModel(
        topics=topic_list,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass'
    )
    print("Total Coherence (u_mass):", cm_umass.get_coherence())

    # Optionally: per-topic scores are available from the same model, without
    # building a separate CoherenceModel for every topic. Uncomment if needed:
    # print("Individual topic coherence scores:", cm_umass.get_coherence_per_topic())



Processing: 2-15-25-none.nodes.json
Total Coherence (u_mass): -3.2702709524036218

Processing: 2-5-20-3.nodes.json
Total Coherence (u_mass): -4.137619198533649

Processing: 3-15-30-5.nodes.json
Total Coherence (u_mass): -3.582806663280821

Processing: base.nodes.json
Total Coherence (u_mass): -3.074685605006344
