In [62]:
import json
from bertopic import BERTopic

# Load your JSON data
with open('../pre_processed_data_non_english_removed.json', 'r') as file:
    data = json.load(file)

# Initialize BERTopic with nr_topics set to 'auto'
model = BERTopic(nr_topics="auto")

# Fit the model and transform your data to topics
topics, probs = model.fit_transform(data)

# Check the number of topics after merging
final_topics = set(model.get_topics().keys())
print(f"Final number of topics: {len(final_topics)}")

Final number of topics: 10


In [63]:
from scipy.cluster import hierarchy as sch

# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = model.hierarchical_topics(data, linkage_function=linkage_function)

100%|██████████| 8/8 [00:00<00:00, 318.79it/s]


In [64]:
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [65]:
doc_info = model.get_document_info(data)

doc_info = doc_info[doc_info['Topic'] != -1]

In [67]:
doc_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,server.js\n// Required libraries\nimport cors ...,1,1_the_const_rikishi_to,"[the, const, rikishi, to, picks, var, user, fr...",[None of the localStorage stuff renders on the...,the - const - rikishi - to - picks - var - use...,0.810934,False
2,"i have a pr for merging `develop` to `main`, w...",1,1_the_const_rikishi_to,"[the, const, rikishi, to, picks, var, user, fr...",[None of the localStorage stuff renders on the...,the - const - rikishi - to - picks - var - use...,1.000000,False
3,i got \n\n\n\nfrom github action but i got \n\...,1,1_the_const_rikishi_to,"[the, const, rikishi, to, picks, var, user, fr...",[None of the localStorage stuff renders on the...,the - const - rikishi - to - picks - var - use...,0.963899,False
4,Today when i check the github desktop of my we...,1,1_the_const_rikishi_to,"[the, const, rikishi, to, picks, var, user, fr...",[None of the localStorage stuff renders on the...,the - const - rikishi - to - picks - var - use...,0.768012,False
6,"Give me an list of User in python, \n\nUser is...",0,0_the_in_if_for,"[the, in, if, for, is, to, const, int, you, and]",[You are an agent in a gridworld.\nThe environ...,the - in - if - for - is - to - const - int - ...,0.834373,False
...,...,...,...,...,...,...,...,...
709,I am writing a data methods section where I de...,2,2_and_the_data_of,"[and, the, data, of, to, for, analysis, task, ...","[As a user, I will ask questions related to ac...",and - the - data - of - to - for - analysis - ...,0.810084,False
713,Create a Fourier series fit to the time-series...,0,0_the_in_if_for,"[the, in, if, for, is, to, const, int, you, and]",[You are an agent in a gridworld.\nThe environ...,the - in - if - for - is - to - const - int - ...,0.830004,False
715,How is it called if all bits are set to zero?,0,0_the_in_if_for,"[the, in, if, for, is, to, const, int, you, and]",[You are an agent in a gridworld.\nThe environ...,the - in - if - for - is - to - const - int - ...,0.806649,False
716,"what's differents of frontend: Dialog ,Readlin...",0,0_the_in_if_for,"[the, in, if, for, is, to, const, int, you, and]",[You are an agent in a gridworld.\nThe environ...,the - in - if - for - is - to - const - int - ...,1.000000,False


In [60]:
doc_info.sort_values(by='Topic', inplace=True)

doc_info.to_csv('doc_info_non_eng.csv', index=False)

In [2]:
# Get the number of unique topics
unique_topics = set(topics) - {-1}  # Exclude -1 if it's there (it's the outlier topic)
topic_names = model.get_topic_info().Name

# Iterate through each unique topic and get its keywords
for topic in unique_topics:
    topic_info = model.get_topic(topic)
    
    # Check if topic_info is not None and extract keywords
    if topic_info:
        keywords = [word for word, _ in topic_info]

        print(f"Topic {topic_names[topic + 1]}:")
        print("Keywords:", keywords)
        print("----------")
    else:
        print(f"Topic {topic} has no keywords.")

Topic 0_player_the_public_return:
Keywords: ['player', 'the', 'public', 'return', 'class', 'game', 'moves', 'err', 'move', 'string']
----------
Topic 1_object_the_you_are:
Keywords: ['object', 'the', 'you', 'are', 'an', 'of', 'is', 'that', 'in', 'to']
----------
Topic 2_to_available_please_resource:
Keywords: ['to', 'available', 'please', 'resource', 'required', 'your', 'community', 'tab', 'you', 'added']
----------
Topic 3_const_the_to_rikishi:
Keywords: ['const', 'the', 'to', 'rikishi', 'user', 'picks', 'var', 'from', 'function', 'and']
----------
Topic 4_github_git_to_the:
Keywords: ['github', 'git', 'to', 'the', 'that', 'if', 'writeoutput', 'branch', 'is', 'then']
----------
Topic 5_hflasite_install_from_docker:
Keywords: ['hflasite', 'install', 'from', 'docker', 'serverport', 'thisserverport', 'thisfscopytpl', 'pip', 'shell', 'mlflow']
----------
Topic 6_at_no_such_file:
Keywords: ['at', 'no', 'such', 'file', 'to', 'opam', 'is', 'prover', 'version', 'submission']
----------
Topic 

In [3]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
1,-1,214
0,3,148
3,11,57
6,1,52
5,9,43
2,4,37
9,10,37
10,0,30
4,7,25
7,8,19


In [4]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,214,-1_the_to_of_and,"[the, to, of, and, in, you, for, is, some, if]",[synovial cell SubClassOf Nothing\nsynovial ce...
1,0,30,0_player_the_public_return,"[player, the, public, return, class, game, mov...","[func (e *Db) Update(ctx context.Context, req ..."
2,1,52,1_object_the_you_are,"[object, the, you, are, an, of, is, that, in, to]",[I have a JS function `countToken(str)` that r...
3,2,13,2_to_available_please_resource,"[to, available, please, resource, required, yo...","[I got this command line script, can you write..."
4,3,148,3_const_the_to_rikishi,"[const, the, to, rikishi, user, picks, var, fr...",[server.js\n// Required libraries\nimport cors...
5,4,37,4_github_git_to_the,"[github, git, to, the, that, if, writeoutput, ...",[Can you convert the solution below to bash/re...
6,5,16,5_hflasite_install_from_docker,"[hflasite, install, from, docker, serverport, ...",[generate missing code in the below dockerfile...
7,6,13,6_at_no_such_file,"[at, no, such, file, to, opam, is, prover, ver...",[translate to somali\nNo images to download.\n...
8,7,25,7_table_sql_that_to,"[table, sql, that, to, rows, the, primary, cre...","[Create a table dogs with id, species, name co..."
9,8,19,8_var_youtube_to_on,"[var, youtube, to, on, wini, and, is, how, vid...","[output audio of the following sentence;\n\n""D..."


In [5]:
topic_names = model.get_topic_info().Name

for doc, topic in zip(data, topics):
    print("Document:", doc)
    print("Assigned Topic:", topic_names[topic + 1])
    print("----------")

Document: server.js
// Required libraries
import cors from 'cors';
import axios from 'axios';
import fs from 'fs';
import express from 'express';
import  from '

// Define HTTPS credentials using the File System (fs) to read the key and certificate files
const options = {
  key: fs.readFileSync('/opt/bitnami/apache/conf/mindfulai.equalreality.com.key'),   // Path to private key
  cert: fs.readFileSync('/opt/bitnami/apache/conf/mindfulai.equalreality.com.crt')   // Path to certificate file
};

// Create an instance of an Express application
const app = express();

let promptResponse = {};

//API's
import PromptGPT from './PromptGPT.js';
import { Speak, ResetCache } from './ElevenLabsServer.js'; 
import Transcribe from './WhisperTranscriberServer.js';


// Use cors middleware for handling Cross-Origin Resource Sharing
app.use(cors());

// Tell Express to parse JSON in the body of incoming requests.
app.use(express.json());

// Log all incoming requests
app.use(function(req, res, next) {


In [6]:
import csv

# Retrieve topic information
topic_info = model.get_topic_info()
topic_names = model.get_topic_info().Name

# Get the number of unique topics
unique_topics = set(topics) - {-1}  # Exclude -1 if it's there (it's the outlier topic)
counts = model.get_topic_info()['Count']

# Open a CSV file to save the data
with open('./output/non_english/ne_top_info_per_topic.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Topic", "Topic Count", "Keywords", "Document"])
    
    # Iterate through each topic
    for topic in unique_topics:
        topic_info = model.get_topic(topic)
        count = counts[topic + 1]
    
        # Check if topic_info is not None and extract keywords
        if topic_info:
            keywords = [word for word, _ in topic_info]
            print(f"Topic {topic_names[topic + 1]}:")
            print("Keywords:", keywords)
            print("----------")
        else:
            print(f"Topic {topic} has no keywords.")
            continue

        # Get documents for each topic
        doc_indices = [i for i, t in enumerate(topics) if t == topic]
        documents = [data[i] for i in doc_indices]

        # If fewer than 10 documents, repeat them until we have 10
        while len(documents) < 10:
            documents.extend(documents)

        # Write the top 10 (or fewer if not available) documents to CSV
        for doc in documents[:10]:
            writer.writerow([topic_names[topic + 1], count, keywords, doc])

print("Data saved to ne_top_info_per_topic.csv")

Topic 0_player_the_public_return:
Keywords: ['player', 'the', 'public', 'return', 'class', 'game', 'moves', 'err', 'move', 'string']
----------
Topic 1_object_the_you_are:
Keywords: ['object', 'the', 'you', 'are', 'an', 'of', 'is', 'that', 'in', 'to']
----------
Topic 2_to_available_please_resource:
Keywords: ['to', 'available', 'please', 'resource', 'required', 'your', 'community', 'tab', 'you', 'added']
----------
Topic 3_const_the_to_rikishi:
Keywords: ['const', 'the', 'to', 'rikishi', 'user', 'picks', 'var', 'from', 'function', 'and']
----------
Topic 4_github_git_to_the:
Keywords: ['github', 'git', 'to', 'the', 'that', 'if', 'writeoutput', 'branch', 'is', 'then']
----------
Topic 5_hflasite_install_from_docker:
Keywords: ['hflasite', 'install', 'from', 'docker', 'serverport', 'thisserverport', 'thisfscopytpl', 'pip', 'shell', 'mlflow']
----------
Topic 6_at_no_such_file:
Keywords: ['at', 'no', 'such', 'file', 'to', 'opam', 'is', 'prover', 'version', 'submission']
----------
Topic 