In [None]:
import json
from bertopic import BERTopic

# Load the pre-processed documents.
# The encoding is pinned to UTF-8 (the standard JSON interchange encoding)
# instead of relying on the platform default, which is not UTF-8 everywhere
# and can fail or mis-decode non-ASCII text.
# NOTE(review): assumes the file holds a flat list of document strings — confirm.
with open('../pre_processed_data_non_english_removed.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize BERTopic and ask it to reduce the fitted topics to 17.
# (Pass nr_topics='auto' instead to let the library decide how far to merge.)
model = BERTopic(nr_topics=17)

# Fit the model and assign a topic to every document
topics, probs = model.fit_transform(data)

# Check the number of topics after merging.
# NOTE(review): get_topics() presumably also lists the outlier topic (-1)
# when it exists, so this count may include it — confirm.
final_topics = set(model.get_topics().keys())
print(f"Final number of topics: {len(final_topics)}")

In [None]:
# Topic ids that were actually assigned to documents, without the outlier topic (-1)
unique_topics = set(topics) - {-1}

# Resolve topic names through the "Topic" column rather than by row position.
# The previous `topic_names[topic + 1]` lookup only worked when the outlier
# topic occupied row 0 and the remaining rows were ordered by topic id; with
# no outlier row (or a different row order) it printed the wrong name or
# raised KeyError.
topic_info_df = model.get_topic_info()
topic_names = topic_info_df.Name
name_by_topic = dict(zip(topic_info_df.Topic, topic_names))

# Walk the topics in ascending id order so the output is deterministic
for topic in sorted(unique_topics):
    topic_words = model.get_topic(topic)

    # get_topic() yields (word, score) pairs; a falsy result means no keywords
    if topic_words:
        keywords = [word for word, _ in topic_words]

        print(f"Topic {name_by_topic[topic]}:")
        print("Keywords:", keywords)
        print("----------")
    else:
        print(f"Topic {topic} has no keywords.")

In [None]:
# Display the topic frequencies reported by the fitted model (shown as the cell output)
model.get_topic_freq()

In [None]:
# Display the fitted model's topic overview table (shown as the cell output);
# later cells read its Name and Count columns.
model.get_topic_info()

In [None]:
# Build a topic-id -> name lookup from the "Topic" column. Indexing the Name
# column with `topic + 1` silently depended on the outlier topic (-1) being
# row 0 and on the rows being ordered by topic id, which mislabels documents
# (or raises KeyError) whenever that layout does not hold.
topic_info_df = model.get_topic_info()
topic_names = topic_info_df.Name
name_by_topic = dict(zip(topic_info_df.Topic, topic_names))

# Print every document next to the name of the topic it was assigned to
for doc, topic in zip(data, topics):
    print("Document:", doc)
    print("Assigned Topic:", name_by_topic[topic])
    print("----------")

In [None]:
import csv

# Export settings
OUTPUT_CSV = 'non_english/top_info_counts_per_topic.csv'
DOCS_PER_TOPIC = 10  # number of document rows written for each topic

# Retrieve topic information once and key it by topic id. Looking names and
# counts up with `topic + 1` assumed the outlier topic (-1) was row 0 and that
# rows were ordered by topic id; keying on the "Topic" column removes that
# assumption.
topic_info = model.get_topic_info()
topic_names = topic_info.Name
counts = topic_info['Count']
name_by_topic = dict(zip(topic_info.Topic, topic_names))
count_by_topic = dict(zip(topic_info.Topic, counts))

# Topic ids that were actually assigned to documents, without the outlier topic (-1)
unique_topics = set(topics) - {-1}

# Group the documents by assigned topic in a single pass instead of
# rescanning the whole topic list once per topic.
docs_by_topic = {}
for doc, assigned_topic in zip(data, topics):
    docs_by_topic.setdefault(assigned_topic, []).append(doc)

# Open a CSV file to save the data
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Topic", "Topic Count", "Keywords", "Document"])

    # Iterate through the topics in ascending id order for a stable row order
    for topic in sorted(unique_topics):
        topic_words = model.get_topic(topic)

        # get_topic() yields (word, score) pairs; skip topics without keywords
        if not topic_words:
            print(f"Topic {topic} has no keywords.")
            continue

        keywords = [word for word, _ in topic_words]
        print(f"Topic {name_by_topic[topic]}:")
        print("Keywords:", keywords)
        print("----------")

        documents = docs_by_topic.get(topic, [])
        if not documents:
            # Nothing to repeat; the old doubling loop would never terminate here
            continue

        # Topics with fewer than DOCS_PER_TOPIC documents are padded by cycling
        # through their documents again, so every topic contributes exactly
        # DOCS_PER_TOPIC rows (same rows the previous doubling loop produced).
        repeats = -(-DOCS_PER_TOPIC // len(documents))  # ceiling division
        for doc in (documents * repeats)[:DOCS_PER_TOPIC]:
            writer.writerow([name_by_topic[topic], count_by_topic[topic], keywords, doc])

# Report the path that was actually written
print(f"Data saved to {OUTPUT_CSV}")