In [2]:
import json
from bertopic import BERTopic

# Load the pre-processed documents.
# JSON is UTF-8 by specification, so the encoding is given explicitly instead
# of relying on the platform default (the CSV export in this notebook also
# uses utf-8).
# NOTE(review): assumes the file holds a JSON list of strings, since it is
# passed straight to fit_transform -- confirm.
with open('../pre_processed_data_first_prompts.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Target number of topics after BERTopic's topic reduction. This is a fixed
# number, not nr_topics='auto'.
NR_TOPICS = 17
model = BERTopic(nr_topics=NR_TOPICS)

# Fit the model and assign a topic id (and probability) to every document.
topics, probs = model.fit_transform(data)

# Report how many topics the fitted model holds.
# NOTE(review): if get_topics() also returns the outlier topic -1, it is
# included in this count -- confirm.
final_topics = set(model.get_topics().keys())
print(f"Final number of topics: {len(final_topics)}")

Final number of topics: 17


In [8]:
# Topic ids that were actually assigned to documents, minus -1 (the outlier topic).
unique_topics = set(topics) - {-1}

# Look names up by topic id rather than by row position.
# Indexing the Name column with `topic + 1` only works while the outlier row
# (-1) occupies position 0 of get_topic_info(); if the model produced no
# outliers, every name would be shifted by one.
topic_info_df = model.get_topic_info()
topic_names = topic_info_df.Name  # kept for cells that still read this Series
name_by_topic = dict(zip(topic_info_df["Topic"], topic_info_df["Name"]))

# Print the keyword list of every topic, in ascending topic-id order
# (set iteration order is not guaranteed, so sort explicitly).
for topic in sorted(unique_topics):
    topic_words = model.get_topic(topic)

    # get_topic yields (word, score) pairs; a falsy result means no keywords.
    if topic_words:
        keywords = [word for word, _ in topic_words]

        print(f"Topic {name_by_topic[topic]}:")
        print("Keywords:", keywords)
        print("----------")
    else:
        print(f"Topic {topic} has no keywords.")

Topic 0_import_from_const_the:
Keywords: ['import', 'from', 'const', 'the', 'files', 'task', 'export', 'example', 'set', 'and']
----------
Topic 1_in_if_for_int:
Keywords: ['in', 'if', 'for', 'int', 'def', 'the', 'is', 'to', 'return', 'none']
----------
Topic 2_the_from_files_import:
Keywords: ['the', 'from', 'files', 'import', 'const', 'task', 'and', 'prompt', 'await', 'to']
----------
Topic 3_const_files_the_filteredlist:
Keywords: ['const', 'files', 'the', 'filteredlist', 'import', 'from', 'task', 'not', 'multiselect', 'resultset']
----------
Topic 4_the_junior_to_and:
Keywords: ['the', 'junior', 'to', 'and', 'with', 'for', 'ai', 'in', 'of', 'your']
----------
Topic 5_string_defaultvalue_type_scratchargumenttypestring:
Keywords: ['string', 'defaultvalue', 'type', 'scratchargumenttypestring', 'return', 'arguments', 'argsstring', 'blocktype', 'opcode', 'typescript']
----------
Topic 6_to_the_and_image:
Keywords: ['to', 'the', 'and', 'image', 'files', 'echo', 'using', 'script', 'images

In [3]:
# Display the number of documents per topic (Topic / Count columns);
# topic -1 is the outlier bucket.
model.get_topic_freq()

Unnamed: 0,Topic,Count
3,-1,461
1,0,327
5,1,124
4,2,116
6,3,51
7,4,48
14,5,42
8,6,33
10,7,24
12,8,22


In [14]:
# Display the per-topic summary table: Topic id, Count, Name,
# Representation (keywords) and Representative_Docs.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,461,-1_the_to_and_of,"[the, to, and, of, in, is, for, you, not, from]","[You are AI Junior, you code like Donald Knuth..."
1,0,327,0_import_from_const_the,"[import, from, const, the, files, task, export...","[You are AI Junior, you code like Donald Knuth..."
2,1,124,1_in_if_for_int,"[in, if, for, int, def, the, is, to, return, n...",[This is a quantitation implementations using ...
3,2,116,2_the_from_files_import,"[the, from, files, import, const, task, and, p...","[You are AI Junior, you code like Donald Knuth..."
4,3,51,3_const_files_the_filteredlist,"[const, files, the, filteredlist, import, from...","[You are AI Junior, you code like Donald Knuth..."
5,4,48,4_the_junior_to_and,"[the, junior, to, and, with, for, ai, in, of, ...","[You are Junior, an AI system aiding developer..."
6,5,42,5_string_defaultvalue_type_scratchargumenttype...,"[string, defaultvalue, type, scratchargumentty...",[You are an agent in a gridworld. The environm...
7,6,33,6_to_the_and_image,"[to, the, and, image, files, echo, using, scri...",[# Working set docs/roadmap.md: ``` # Roadmap ...
8,7,24,7_player_public_game_string,"[player, public, game, string, the, class, mov...","[How using this example, public class Main { ..."
9,8,22,8_table_sql_null_primary,"[table, sql, null, primary, rows, integer, it,...",[I have a sqlite database. Here's the SQL for ...


In [5]:
# Resolve topic names by topic id instead of by row position.
# `Name[topic + 1]` silently mislabels every document when the outlier
# row (-1) is absent from get_topic_info().
topic_info_df = model.get_topic_info()
topic_names = topic_info_df.Name  # kept for cells that still read this Series
name_by_topic = dict(zip(topic_info_df["Topic"], topic_info_df["Name"]))

# Print each document next to the name of the topic it was assigned to.
# `data` and `topics` are parallel: topics[i] is the topic of data[i].
# NOTE: this prints the whole corpus; slice `data`/`topics` to limit output.
for doc, topic in zip(data, topics):
    print("Document:", doc)
    print("Assigned Topic:", name_by_topic[topic])
    print("----------")

Document:  button Button       ::-webkit-scrollbar {         display: none !important;       }       html,       textarea {         background: lightgoldenrodyellow;       }       html,       body,       #container {         height: 100%;         width: 100%;         overflow-x: hidden;       }       #writebox {         font-size: large;         padding: 20px;         width: 100%;         height: 100%;         border: none;         letter-spacing: 2px;         color: rgb(27, 77, 63);         font-family: serif;         font-weight: bold;         line-height: 1.69;         border: none;         outline: none;       }       #clearbutton {         position: absolute;         right: 10px;         bottom: 10px;         width: 70px;         height: 70px;         background-color: pink;         border-radius: 50%;       }       @media (max-width: 600px) {         #clearbutton {           display: block;         }       }       @media (min-width: 601px) {         #clearbutton {           displ

In [15]:
import csv

# Destination file and number of document rows written per topic.
OUTPUT_CSV = 'top_info_counts_per_topic.csv'
DOCS_PER_TOPIC = 10

# Fetch the topic summary table once and index it by topic id.
# Positional lookups (`Series[topic + 1]`) are only correct while the outlier
# row (-1) is present at position 0, so names and counts are keyed by the
# Topic column instead.
topic_info = model.get_topic_info()
topic_names = topic_info.Name
counts = topic_info['Count']
name_by_topic = dict(zip(topic_info['Topic'], topic_info['Name']))
count_by_topic = dict(zip(topic_info['Topic'], topic_info['Count']))

# Topic ids that were actually assigned to documents, minus -1 (the outlier topic).
unique_topics = set(topics) - {-1}

# Write one row per (topic, document) pair: DOCS_PER_TOPIC rows for every topic.
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Topic", "Topic Count", "Keywords", "Document"])

    # Ascending topic-id order keeps the CSV layout deterministic.
    for topic in sorted(unique_topics):
        # (word, score) pairs for this topic; kept in its own name so the
        # summary DataFrame above is not overwritten.
        topic_words = model.get_topic(topic)
        count = count_by_topic[topic]

        # Skip topics without keywords (nothing useful to export).
        if topic_words:
            keywords = [word for word, _ in topic_words]
            print(f"Topic {name_by_topic[topic]}:")
            print("Keywords:", keywords)
            print("----------")
        else:
            print(f"Topic {topic} has no keywords.")
            continue

        # Documents assigned to this topic, in corpus order (not ranked).
        documents = [doc for doc, assigned in zip(data, topics) if assigned == topic]

        # Take the first DOCS_PER_TOPIC documents; when a topic has fewer,
        # cycle through them so every topic still contributes exactly
        # DOCS_PER_TOPIC rows. `topic` comes from `topics`, so `documents`
        # is never empty here. Unlike a doubling while-loop, this cannot spin
        # forever.
        selected = [documents[i % len(documents)] for i in range(DOCS_PER_TOPIC)]

        # Keywords are written as the string form of the Python list.
        for doc in selected:
            writer.writerow([name_by_topic[topic], count, keywords, doc])

# Report the file that was actually written.
print(f"Data saved to {OUTPUT_CSV}")

Topic 0_import_from_const_the:
Keywords: ['import', 'from', 'const', 'the', 'files', 'task', 'export', 'example', 'set', 'and']
----------
Topic 1_in_if_for_int:
Keywords: ['in', 'if', 'for', 'int', 'def', 'the', 'is', 'to', 'return', 'none']
----------
Topic 2_the_from_files_import:
Keywords: ['the', 'from', 'files', 'import', 'const', 'task', 'and', 'prompt', 'await', 'to']
----------
Topic 3_const_files_the_filteredlist:
Keywords: ['const', 'files', 'the', 'filteredlist', 'import', 'from', 'task', 'not', 'multiselect', 'resultset']
----------
Topic 4_the_junior_to_and:
Keywords: ['the', 'junior', 'to', 'and', 'with', 'for', 'ai', 'in', 'of', 'your']
----------
Topic 5_string_defaultvalue_type_scratchargumenttypestring:
Keywords: ['string', 'defaultvalue', 'type', 'scratchargumenttypestring', 'return', 'arguments', 'argsstring', 'blocktype', 'opcode', 'typescript']
----------
Topic 6_to_the_and_image:
Keywords: ['to', 'the', 'and', 'image', 'files', 'echo', 'using', 'script', 'images