In [1]:
import json
from bertopic import BERTopic

# Configuration: where the pre-processed prompts live and how many topics
# BERTopic should reduce to after its initial clustering.
DATA_PATH = '../pre_processed_data_first_prompts.json'
NR_TOPICS = 17

# Load the documents. The encoding is given explicitly so the file is decoded
# the same way on every platform instead of using the locale default.
# NOTE(review): assumes the JSON holds a flat list of document strings, which
# is what fit_transform is given below - confirm against the pre-processing step.
with open(DATA_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize BERTopic with a fixed target number of topics (NR_TOPICS), not 'auto'
model = BERTopic(nr_topics=NR_TOPICS)

# Fit the model and transform the documents to topic assignments
topics, probs = model.fit_transform(data)

# Check the number of topics after merging.
# NOTE(review): the key set appears to include the outlier topic -1 when it is
# present, so this figure may be one higher than the number of real topics - confirm.
final_topics = set(model.get_topics().keys())
print(f"Final number of topics: {len(final_topics)}")

  from .autonotebook import tqdm as notebook_tqdm


Final number of topics: 17


### Print keywords for each topic

In [2]:
# Print the display name and keyword list of every non-outlier topic.
topic_info_df = model.get_topic_info()

# Positional Series of names, retained unchanged in case another cell reads it.
topic_names = topic_info_df.Name

# Look names up by topic id. Indexing the Name column at position `topic + 1`
# is only correct when the outlier row (-1) exists and topic ids are contiguous.
name_by_topic = dict(zip(topic_info_df["Topic"], topic_info_df["Name"]))

# Exclude -1 if it's there (it's the outlier topic); sort for a stable print order
unique_topics = sorted(set(topics) - {-1})

# Iterate through each unique topic and get its keywords
for topic in unique_topics:
    topic_info = model.get_topic(topic)

    # get_topic yields (word, score) pairs; a falsy result means no keywords
    if not topic_info:
        print(f"Topic {topic} has no keywords.")
        continue

    keywords = [word for word, _ in topic_info]

    # Fall back to the bare id if the info table has no row for this topic
    print(f"Topic {name_by_topic.get(topic, topic)}:")
    print("Keywords:", keywords)
    print("----------")

Topic 0_const_import_from_the:
Keywords: ['const', 'import', 'from', 'the', 'files', 'task', 'export', 'set', 'and', 'example']
----------
Topic 1_the_from_files_const:
Keywords: ['the', 'from', 'files', 'const', 'import', 'task', 'at', 'and', 'await', 'to']
----------
Topic 2_the_junior_and_to:
Keywords: ['the', 'junior', 'and', 'to', 'for', 'with', 'ai', 'is', 'in', 'project']
----------
Topic 3_string_defaultvalue_type_scratchargumenttypestring:
Keywords: ['string', 'defaultvalue', 'type', 'scratchargumenttypestring', 'return', 'argsstring', 'blocktype', 'opcode', 'typescript', 'case']
----------
Topic 4_int_float_device_constant:
Keywords: ['int', 'float', 'device', 'constant', 'const', 'int64t', 'for', 'sumith', '0f', 'uint8t']
----------
Topic 5_def_str_none_if:
Keywords: ['def', 'str', 'none', 'if', 'john', 'in', 'jeff', 'action', 'for', 'raise']
----------
Topic 6_to_the_image_and:
Keywords: ['to', 'the', 'image', 'and', 'files', 'images', 'using', 'echo', 'script', 'task']
---

In [3]:
# Display the per-topic summary table (Topic id, Count, Name, Representation,
# Representative_Docs); the outlier topic is listed as -1.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,552,-1_the_to_and_of,"[the, to, and, of, in, is, for, you, not, if]",[# Working set README.md: ``` Warn: This READM...
1,0,331,0_const_import_from_the,"[const, import, from, the, files, task, export...","[You are AI Junior, you code like Donald Knuth..."
2,1,153,1_the_from_files_const,"[the, from, files, const, import, task, at, an...","[You are AI Junior, you code like Donald Knuth..."
3,2,39,2_the_junior_and_to,"[the, junior, and, to, for, with, ai, is, in, ...","[You are Junior, an AI system aiding developer..."
4,3,38,3_string_defaultvalue_type_scratchargumenttype...,"[string, defaultvalue, type, scratchargumentty...",[I have an array of type ({ something: string ...
5,4,28,4_int_float_device_constant,"[int, float, device, constant, const, int64t, ...",[This is a quantitation implementations using ...
6,5,27,5_def_str_none_if,"[def, str, none, if, john, in, jeff, action, f...",[import re import requests from typing import ...
7,6,27,6_to_the_image_and,"[to, the, image, and, files, images, using, ec...",[# Working set docs/roadmap.md: ``` # Roadmap ...
8,7,26,7_var_is_what_youtube,"[var, is, what, youtube, data, wini, and, nd, ...",[The total length of the content that I want t...
9,8,25,8_string_player_param_the,"[string, player, param, the, otp, number, algo...",[I got an error when I start my test in spring...


In [None]:
import csv

# Export, for every non-outlier topic, its name, document count, keywords and
# exactly DOCS_PER_TOPIC example documents (one CSV row per document).
OUTPUT_CSV = 'top_info_counts_per_topic.csv'
DOCS_PER_TOPIC = 10

# Fetch the summary table once and reuse it
topic_info = model.get_topic_info()

# Positional Series, retained unchanged in case another cell reads them.
topic_names = topic_info.Name
counts = topic_info['Count']

# Look up name and count by topic id. Indexing at position `topic + 1` is only
# correct when the outlier row (-1) exists and topic ids are contiguous.
name_by_topic = dict(zip(topic_info['Topic'], topic_info['Name']))
count_by_topic = dict(zip(topic_info['Topic'], topic_info['Count']))

# Exclude -1 if it's there (it's the outlier topic); sort for a stable row order
unique_topics = sorted(set(topics) - {-1})

# Group documents by assigned topic in a single pass, preserving input order
docs_by_topic = {}
for doc, assigned_topic in zip(data, topics):
    docs_by_topic.setdefault(assigned_topic, []).append(doc)

with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Topic", "Topic Count", "Keywords", "Document"])

    for topic in unique_topics:
        topic_words = model.get_topic(topic)
        if not topic_words:
            # No keyword representation for this topic: nothing to export
            continue
        keywords = [word for word, _ in topic_words]

        documents = docs_by_topic.get(topic, [])
        if not documents:
            # Guard: padding an empty list could never reach DOCS_PER_TOPIC
            continue

        # If fewer than DOCS_PER_TOPIC documents, cycle through them until there
        # are enough; otherwise keep the first DOCS_PER_TOPIC.
        repeats = -(-DOCS_PER_TOPIC // len(documents))  # ceiling division
        selected = (documents * repeats)[:DOCS_PER_TOPIC]

        # Write exactly DOCS_PER_TOPIC rows for this topic (with repeats if padded)
        for doc in selected:
            writer.writerow([name_by_topic[topic], count_by_topic[topic], keywords, doc])

print(f"Data saved to {OUTPUT_CSV}")