In [72]:
import json
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

# Load the pre-processed documents that are passed to BERTopic below.
# The encoding is explicit so the result does not depend on the platform default.
with open('../pre_processed_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


# Reuse a previously saved model when it can be loaded; otherwise fit a new
# one on `data` and save it so the next run can skip the fitting step.
try:
    model = BERTopic.load("bertopic_model")
    print("Model loaded")

    # Per-document topic assignments, i.e. the same kind of value that
    # fit_transform() yields in the fallback branch. (get_topics() returns a
    # {topic_id: keywords} mapping instead, which made `topics` mean two
    # different things depending on which branch ran.)
    topics = model.topics_
except Exception as load_error:
    # Deliberate best-effort: any load failure falls back to fitting from
    # scratch. `Exception` (rather than a bare except) lets KeyboardInterrupt
    # and SystemExit propagate, and the reason is reported instead of hidden.
    print(f"Could not load saved model ({load_error!r}); fitting a new one")

    representation_model = KeyBERTInspired()
    model = BERTopic(representation_model=representation_model)

    topics, _ = model.fit_transform(data)

    model.save("bertopic_model")

# Report how many topic ids the model holds (the keys of get_topics())
final_topics = set(model.get_topics().keys())
print(f"Final number of topics: {len(final_topics)}")

2023-12-02 21:36:56,687 - BERTopic - Transformed documents to Embeddings
2023-12-02 21:36:58,679 - BERTopic - Reduced dimensionality
2023-12-02 21:36:58,697 - BERTopic - Clustered reduced embeddings


Final number of topics: 18


In [73]:
# Collect the ids of all real topics; -1 is the outlier topic and is excluded
unique_topics = set(topics) - {-1}
topic_summary = model.get_topic_info()
topic_names = topic_summary.Name

# Look names up by topic id instead of by row position. The positional form
# `topic_names[topic + 1]` is only right when an outlier row (-1) occupies
# position 0; without outliers every name would be shifted by one.
name_by_topic = dict(zip(topic_summary.Topic, topic_summary.Name))

# Print each topic's keywords in ascending topic-id order
for topic in sorted(unique_topics):
    topic_words = model.get_topic(topic)

    # get_topic() may yield nothing usable for an id; report that instead of failing
    if not topic_words:
        print(f"Topic {topic} has no keywords.")
        continue

    # Each entry is a (word, score) pair; only the word is shown
    keywords = [word for word, _ in topic_words]

    print(f"Topic {name_by_topic.get(topic, topic)}:")
    print("Keywords:", keywords)
    print("----------")

Topic 0_localstoragesetitemthisuser_localstoragesetitemuser_gamejs_javascript:
Keywords: ['localstoragesetitemthisuser', 'localstoragesetitemuser', 'gamejs', 'javascript', 'getuser', 'userelement', 'banzuke', 'rikishi', 'reactdomdevelopmentjs20279', 'reactdomdevelopmentjs21794']
----------
Topic 1_dict_optionalstr_python_elif:
Keywords: ['dict', 'optionalstr', 'python', 'elif', 'extract', 'qr', 'validnountags', 'import', 'possiblecorefvalues', 'valueerrorfinvalid']
----------
Topic 2_kernel_uint32t_qelemsize_uint64t:
Keywords: ['kernel', 'uint32t', 'qelemsize', 'uint64t', 'blockq3k', 'int64t', 'floatx', 'uint2', 'uint8t', 'yl32']
----------
Topic 3_github_git_commits_repository:
Keywords: ['github', 'git', 'commits', 'repository', 'commit', 'repo', 'branch', 'ubuntulatest', 'stabledocs', 'versioned']
----------
Topic 4_images_image_pictures_files:
Keywords: ['images', 'image', 'pictures', 'files', 'file', 'folder', 'zoom', 'uri', 'script', 'tailwind']
----------
Topic 5_react_reactcomp

In [74]:
from scipy.cluster import hierarchy as sch

# Build the topic hierarchy and plot it
def linkage_function(arr):
    """Run scipy's single-linkage clustering with optimal leaf ordering."""
    return sch.linkage(arr, method='single', optimal_ordering=True)


hierarchical_topics = model.hierarchical_topics(
    data,
    linkage_function=linkage_function,
)

model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 16/16 [00:01<00:00, 12.72it/s]


In [75]:
# Turn the hierarchical topic structure into an indented text tree and print it;
# leaf lines carry the topic id (e.g. "── Topic: 16").
tree = model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──numbers_sum_maximum_grid_24 ── Topic: 16
└─code_write_error_string_await
     ├─code_const_write_await_int64t
     │    ├─code_const_await_create_write
     │    │    ├─■──java_tomcat_objectmapper_myservlet_systemoutprintlnplayergetopponentgetusername ── Topic: 9
     │    │    └─const_code_await_return_function
     │    │         ├─code_function_const_await_return
     │    │         │    ├─javascript_code_function_const_await
     │    │         │    │    ├─■──react_reactcomponent_reactinfinitescrollcomponent_reactapprewired_reactrouterdom ── Topic: 5
     │    │         │    │    └─javascript_const_code_await_function
     │    │         │    │         ├─javascript_code_await_var_const
     │    │         │    │         │    ├─■──kernel_uint32t_qelemsize_uint64t_blockq3k ── Topic: 2
     │    │         │    │         │    └─javascript_var_code_display_await
     │    │         │    │         │         ├─localstoragesetitemthisuser_localstoragesetitemuser_gamejs_javascript_ge

In [76]:
# Display BERTopic's built-in topic visualization for the current model
# (last expression in the cell, so the returned figure is rendered as output).
model.visualize_topics()

In [7]:
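# Disabled debugging aid (kept for reference): when uncommented, it prints every
# document together with the name of the topic it was assigned to.
# topic_names = model.get_topic_info().Name

# for doc, topic in zip(data, topics):
#     print("Document:", doc)
#     print("Assigned Topic:", topic_names[topic + 1])
#     print("----------")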

In [77]:
import csv
import os
import pandas as pd

# Destination of the per-document export; also used for the final status message
OUTPUT_CSV = './output/top_info_counts_per_topic.csv'

# Retrieve topic information once and derive the name/count columns from it
topic_info = model.get_topic_info()
topic_names = topic_info.Name

# Ids of all real topics; -1 is the outlier topic and is excluded
unique_topics = set(topics) - {-1}
counts = topic_info['Count']

# Key names and counts by topic id. Positional lookups such as
# `counts[topic + 1]` are only right when an outlier row (-1) sits at
# position 0, and are shifted by one (or fail on the last topic) otherwise.
name_by_topic = dict(zip(topic_info['Topic'], topic_names))
count_by_topic = dict(zip(topic_info['Topic'], counts))


# Prepare the documents in the dataframe layout passed to BERTopic below
documents = pd.DataFrame({"Document": data,
                          "ID": range(len(data)),
                          "Topic": model.topics_})

# Extract the top 10 representative documents for each topic.
# NOTE(review): _extract_representative_docs is a private BERTopic method, so
# its arguments and 4-tuple return value may differ between releases — confirm
# against the installed version.
repr_docs, _, _, _ = model._extract_representative_docs(c_tf_idf=model.c_tf_idf_,
                                                        documents=documents,
                                                        topics=model.topic_representations_,
                                                        nr_repr_docs=10)

# Make sure the output directory exists so open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# Write one CSV row per (topic, representative document) pair
with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as file:

    writer = csv.writer(file)
    writer.writerow(["Topic", "Topic Count", "Keywords", "Document"])

    # Iterate through each topic and its representative documents
    for topic, most_rep_docs in repr_docs.items():
        if topic == -1:
            continue  # skip the outlier topic

        topic_words = model.get_topic(topic)
        if not topic_words:
            print(f"Topic {topic} has no keywords.")
            continue

        # Each entry is a (word, score) pair; only the word is exported
        keywords = [word for word, _ in topic_words]
        count = count_by_topic[topic]

        print(f"Topic {name_by_topic[topic]}:")
        print("Keywords:", keywords)
        # These are the document texts themselves, not ids
        print("Representative docs:", most_rep_docs)
        print("----------")

        for doc in most_rep_docs:
            # Topic id, topic size and keywords are repeated on every row of the topic
            writer.writerow([topic, count, keywords, doc])

print(f"Data saved to {OUTPUT_CSV}")


Topic 0_localstoragesetitemthisuser_localstoragesetitemuser_gamejs_javascript:
Keywords: ['localstoragesetitemthisuser', 'localstoragesetitemuser', 'gamejs', 'javascript', 'getuser', 'userelement', 'banzuke', 'rikishi', 'reactdomdevelopmentjs20279', 'reactdomdevelopmentjs21794']
Doc ids: ['I want this game to rely on local storage to remember who I am and who my picks were in previous contests. A contest is January, March, May, July, September, or November of a given year. The current contest is July 2023. We will assume I am in admin mode and I can switch users to record everyone\'s picks (which are visible to everyone) and backfill old results. Please add at least one new test.  index.html        Banzuke Surfing Game                     -->       Welcome to Banzuke Surfing Game!     Select your Rikishi and start playing!              Rikishi 1         Rikishi 2                   Start Playing              game.js function startPlaying() {     var rikishi = $(\'#rikishi\').val();     

In [78]:
def _first_ten(docs):
    """Return at most the first ten entries of a grouped column as a list."""
    return list(docs)[:10]


# Read the per-document export back in
with open('./output/top_info_counts_per_topic.csv', 'r', encoding='utf-8') as csv_handle:
    topics_data = pd.read_csv(csv_handle)

# Collapse to one row per topic: count and keywords are identical on every row
# of a topic, so the first value is enough; documents are gathered into a list.
per_topic_aggregation = {
    'Topic Count': 'first',
    'Keywords': 'first',
    'Document': _first_ten,
}
grouped_topics = (
    topics_data
    .groupby('Topic')
    .agg(per_topic_aggregation)
    .reset_index()
)

# Persist the per-topic summary
grouped_topics.to_csv('./output/keywords_and_10_most_representative_sentences.csv', index=False)

In [79]:
# Bar charts of keyword scores: up to 17 topics, 10 words per topic.
# NOTE(review): 17 presumably equals the 18 topics reported earlier minus the
# outlier topic (-1) — confirm, and update if the topic count changes.
model.visualize_barchart(top_n_topics = 17, n_words = 10)

In [90]:
import os

# Directory that receives one prompt file per topic
PROMPT_DIR = './prompts'


def _build_labeling_prompt(docs, keywords):
    """Assemble the labeling prompt text for one topic.

    Args:
        docs: Representative documents of the topic; each one is wrapped in
            <sentenceN>...</sentenceN> tags, numbered from 0.
        keywords: Keyword strings describing the topic.

    Returns:
        The complete prompt as a single string.
    """
    tagged_docs = "".join(
        f"<sentence{i}>{doc}</sentence{i}>\n" for i, doc in enumerate(docs)
    )
    # Join with ", " so the list no longer ends in a dangling comma
    keyword_line = "The topic is described by the following keywords: " + ", ".join(keywords)

    return (
        "I have a topic that contains the following most representative documents:\n"
        + tagged_docs
        + "\n\n"
        + keyword_line
        + "\n"
        + "\n"
        + 'Based on the information about the topic above, \n'
        + '1) Analyze the keywords and come up with a general label. Explain why? \n'
        + '2) cluster the most representative sentences to come up with sub-labels. Explain why? \n\n'
        + 'DO NOT FORGET TO include the first 10 character of that sentences that you use to come up with that sub-category, so that I can read the sentences you have clustered and see if you come up with correct label for that. \n'
        + 'Take your time and think, then come up with the best, precise, and meaningful label and sub-labels.\n\n'
        + 'Now Take a deep breath and start'
    )


# Make sure the target directory exists so open() does not fail on a fresh checkout
os.makedirs(PROMPT_DIR, exist_ok=True)

# Write one prompt file per topic from its representative documents and keywords
for topic, most_rep_docs in repr_docs.items():
    if topic == -1:
        continue  # skip the outlier topic

    topic_words = model.get_topic(topic)
    if not topic_words:
        continue  # nothing to describe the topic with

    # Each entry is a (word, score) pair; only the word goes into the prompt
    keywords = [word for word, _ in topic_words]

    # newline='' keeps the "\n" characters exactly as written on every platform
    with open(f'{PROMPT_DIR}/prompts_{topic}.txt', 'w', newline='', encoding='utf-8') as file:
        file.write(_build_labeling_prompt(most_rep_docs, keywords))

    
