In [1]:
import nltk
nltk.download('stopwords')

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\esagdic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:

# Import Dataset
df = pd.read_json('../pre_processed_data_non_english_removed.json')

data = df.values.tolist()

for row in data[:10]:
    print(row)

['server.js\n// Required libraries\nimport cors from \'cors\';\nimport axios from \'axios\';\nimport fs from \'fs\';\nimport express from \'express\';\nimport  from \'\n\n// Define HTTPS credentials using the File System (fs) to read the key and certificate files\nconst options = {\n  key: fs.readFileSync(\'/opt/bitnami/apache/conf/mindfulai.equalreality.com.key\'),   // Path to private key\n  cert: fs.readFileSync(\'/opt/bitnami/apache/conf/mindfulai.equalreality.com.crt\')   // Path to certificate file\n};\n\n// Create an instance of an Express application\nconst app = express();\n\nlet promptResponse = {};\n\n//API\'s\nimport PromptGPT from \'./PromptGPT.js\';\nimport { Speak, ResetCache } from \'./ElevenLabsServer.js\'; \nimport Transcribe from \'./WhisperTranscriberServer.js\';\n\n\n// Use cors middleware for handling Cross-Origin Resource Sharing\napp.use(cors());\n\n// Tell Express to parse JSON in the body of incoming requests.\napp.use(express.json());\n\n// Log all incoming r

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


data_words = list(sent_to_words(data))

for row in data_words[:10]:
    print(row)

['server', 'js', 'required', 'libraries', 'nimport', 'cors', 'from', 'cors', 'nimport', 'axios', 'from', 'axios', 'nimport', 'fs', 'from', 'fs', 'nimport', 'express', 'from', 'express', 'nimport', 'from', 'define', 'https', 'credentials', 'using', 'the', 'file', 'system', 'fs', 'to', 'read', 'the', 'key', 'and', 'certificate', 'files', 'nconst', 'options', 'key', 'fs', 'readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'key', 'path', 'to', 'private', 'key', 'cert', 'fs', 'readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'crt', 'path', 'to', 'certificate', 'file', 'create', 'an', 'instance', 'of', 'an', 'express', 'application', 'nconst', 'app', 'express', 'nlet', 'promptresponse', 'api', 'nimport', 'promptgpt', 'from', 'promptgpt', 'js', 'nimport', 'speak', 'resetcache', 'from', 'js', 'nimport', 'transcribe', 'from', 'js', 'use', 'cors', 'middleware', 'for', 'handling', 'cross', 'origin', 'resource', 'sharing', 

In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
for trigram in trigram_mod[bigram_mod[data_words[:10]]]:
    print(trigram)

['server', 'js', 'required', 'libraries', 'nimport', 'cors', 'from', 'cors', 'nimport', 'axios', 'from', 'axios', 'nimport', 'fs', 'from', 'fs', 'nimport', 'express', 'from', 'express', 'nimport', 'from', 'define', 'https', 'credentials', 'using', 'the', 'file', 'system', 'fs', 'to', 'read', 'the', 'key', 'and', 'certificate', 'files', 'nconst', 'options', 'key', 'fs_readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'key', 'path', 'to', 'private', 'key', 'cert', 'fs_readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'crt', 'path', 'to', 'certificate', 'file', 'create', 'an', 'instance', 'of', 'an', 'express', 'application', 'nconst', 'app', 'express', 'nlet', 'promptresponse', 'api', 'nimport', 'promptgpt', 'from', 'promptgpt', 'js', 'nimport', 'speak', 'resetcache', 'from', 'js', 'nimport', 'transcribe', 'from', 'js', 'use', 'cors', 'middleware', 'for', 'handling', 'cross', 'origin', 'resource', 'sharing', 'napp'

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


In [7]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# print
for row in data_words_bigrams[:10]:
    print(row)

['server', 'js', 'required', 'libraries', 'nimport', 'cors', 'cors', 'nimport', 'axios', 'axios', 'nimport', 'fs', 'fs', 'nimport', 'express', 'express', 'nimport', 'define', 'https', 'credentials', 'using', 'file', 'system', 'fs', 'read', 'key', 'certificate', 'files', 'nconst', 'options', 'key', 'fs_readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'key', 'path', 'private', 'key', 'cert', 'fs_readfilesync', 'opt', 'bitnami', 'apache', 'conf', 'mindfulai', 'equalreality', 'com', 'crt', 'path', 'certificate', 'file', 'create', 'instance', 'express', 'application', 'nconst', 'app', 'express', 'nlet', 'promptresponse', 'api', 'nimport', 'promptgpt', 'promptgpt', 'js', 'nimport', 'speak', 'resetcache', 'js', 'nimport', 'transcribe', 'js', 'cors', 'middleware', 'handling', 'cross', 'origin', 'resource', 'sharing', 'napp', 'cors', 'tell', 'express', 'parse', 'json', 'body', 'incoming', 'requests', 'napp', 'express', 'json', 'log', 'incoming', 'requests',

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 682.7 kB/s eta 0:00:19
     --------------------------------------- 0.1/12.8 MB 825.8 kB/s eta 0:00:16
      --------------------------------------- 0.2/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.4/12.8 MB 3.1 MB/s eta 0:00:05
     - -------------------------------------- 0.6/12.8 MB 3.6 MB/s eta 0:00:04
     -- ------------------------------------- 0.8/12.8 MB 4.0 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.3 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.8 MB 4.7 MB/s eta 0:00:03
     ---- ----------------------------------- 1.6/12.8 MB 5.0 MB/s eta 0:00:03
     ----- ----------------------------



In [9]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

for row in data_lemmatized[:10]:
    print(row)

['server', 'require', 'library', 'nimport', 'cor', 'cor', 'nimport', 'axio', 'axios', 'nimport', 'nimport', 'express', 'express', 'nimport', 'define', 'https', 'credential', 'use', 'file', 'system', 'read', 'key', 'certificate', 'file', 'nconst', 'option', 'key', 'key', 'path', 'private', 'key', 'cert', 'crt', 'path', 'certificate', 'file', 'create', 'instance', 'express', 'application', 'transcribe', 'cor', 'middleware', 'handle', 'cross', 'origin', 'resource', 'share', 'napp', 'cor', 'tell', 'express', 'parse', 'body', 'incoming', 'request', 'napp', 'log', 'incoming', 'request', 'napp', 'function', 'req_re', 'next', 'method', 'request', 'next', 'control', 'next', 'middleware', 'function', 'speak', 'function', 'lab', 'speak', 'speak', 'transcribe', 'function', 'route', 'handler', 'whisper', 'transcribe', 'transcribe', 'restart', 'server', 'get', 'function', 'req_re', 'gpt', 'old', 'version', 'judgegpt', 'askgpt', 'function', 'req_re', 'log', 'body', 'request', 'body', 'extract', 'yout

In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

corpus = [doc for doc in corpus if len(doc)>1]

for row in corpus[:10]:
    print(row)

[(0, 1), (1, 2), (2, 1), (3, 6), (4, 1), (5, 2), (6, 3), (7, 1), (8, 3), (9, 3), (10, 12), (11, 2), (12, 2), (13, 3), (14, 2), (15, 1), (16, 1), (17, 1), (18, 2), (19, 12), (20, 2), (21, 5), (22, 1), (23, 1), (24, 10), (25, 5), (26, 1), (27, 2), (28, 1), (29, 1), (30, 2), (31, 3), (32, 5), (33, 9), (34, 1), (35, 1), (36, 1), (37, 1), (38, 4), (39, 6), (40, 1), (41, 1), (42, 1), (43, 2), (44, 15), (45, 1), (46, 3), (47, 3), (48, 2), (49, 3), (50, 2), (51, 17), (52, 4), (53, 2), (54, 1), (55, 1), (56, 4), (57, 8), (58, 1), (59, 2), (60, 2), (61, 5), (62, 1), (63, 8), (64, 2), (65, 5), (66, 3), (67, 1), (68, 1), (69, 3), (70, 2), (71, 5), (72, 4), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 10), (79, 1), (80, 2), (81, 3), (82, 1), (83, 7), (84, 1), (85, 1), (86, 1), (87, 1), (88, 5), (89, 2), (90, 1), (91, 1), (92, 2), (93, 4), (94, 2), (95, 4), (96, 3), (97, 1), (98, 1), (99, 1), (100, 3), (101, 1), (102, 2), (103, 1), (104, 8), (105, 2), (106, 3), (107, 9), (108, 1), (109, 1), (11

In [12]:
# Human readable format of corpus (term-frequency)
for row in [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]:
    print(row)

[('accept', 1), ('add', 2), ('addcallback', 1), ('api', 6), ('app', 1), ('append', 2), ('application', 3), ('arraybuffer', 1), ('askgpt', 3), ('async', 3), ('audio', 12), ('audiofile', 2), ('authorization', 2), ('await', 3), ('axio', 2), ('axios', 1), ('back', 1), ('base', 1), ('binary', 2), ('body', 12), ('buffer', 2), ('cache', 5), ('cachekey', 1), ('call', 1), ('callback', 10), ('catch', 5), ('cert', 1), ('certificate', 2), ('choice', 1), ('clear', 1), ('completetime', 2), ('console', 3), ('console_log', 5), ('const', 9), ('constructor', 1), ('content', 1), ('contenttype', 1), ('control', 1), ('cor', 4), ('create', 6), ('credential', 1), ('cross', 1), ('crt', 1), ('date', 2), ('datum', 15), ('debugging', 1), ('default', 3), ('define', 3), ('else', 2), ('env', 3), ('err', 2), ('error', 17), ('express', 4), ('extract', 2), ('fail', 1), ('false', 1), ('fetch', 4), ('file', 8), ('filename', 1), ('finish', 2), ('form', 2), ('formdata', 5), ('fs', 1), ('function', 8), ('generate', 2), ('g

In [13]:
# # Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=20, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)

# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=16,  # Updated number of topics
#                                            random_state=100,  # Keeping as it is
#                                            update_every=1,  # Keeping as it is
#                                            chunksize=100,  # Keeping as it is
#                                            passes=2000,  # Updated, but you might want to tweak this based on your corpus
#                                            alpha=50/16,  # Updated alpha
#                                            eta=0.01,  # Updated beta, using eta as the parameter name
#                                            per_word_topics=True)  # Keeping as it is

In [14]:
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

In [15]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis

In [16]:
import importlib
import ldamallet
importlib.reload(ldamallet)
import os

In [17]:
mallet_dir =  os.path.join(os.getcwd(), 'mallet-2.0.8')
mallet_path =  os.path.join(mallet_dir, 'bin', 'mallet') # update this path
# mallet_path = 'C:\\Users\\esagdic\\Desktop\\swe_research_mining_challenge\\mallet-2.0.8\\bin\\mallet.bat'

In [18]:
os.environ['MALLET_HOME'] = mallet_dir

In [19]:
#print MALLET_HOME
print(os.environ['MALLET_HOME'])

c:\Users\esagdic\Desktop\swe_research_mining_challenge\lda_mallet\mallet-2.0.8


In [20]:
# Initializing the LdaMallet model
ldamalletmodel = ldamallet.LdaMallet(mallet_path,
                    corpus=corpus,
                    num_topics=16,  # Setting number of topics as 16 (K=16)
                    id2word=id2word,
                    alpha=50/16,  # Setting alpha as 50/K
                    iterations=2000)  # Setting iterations as 2000 (Ir=2000)

# Show Topics
pprint(ldamalletmodel.show_topics(formatted=False))

[('9: async',
  [('file', 0.019753086419753086),
   ('model', 0.018518518518518517),
   ('build', 0.015637860082304528),
   ('version', 0.015226337448559672),
   ('make', 0.013991769547325103),
   ('datum', 0.012757201646090535),
   ('create', 0.011522633744855968),
   ('usr_gem', 0.009876543209876543),
   ('analysis', 0.00905349794238683),
   ('data', 0.008641975308641974)]),
 ('11: audiofile',
  [('number', 0.04459524963645177),
   ('string', 0.034415899175957346),
   ('option', 0.03296170625302957),
   ('param', 0.02084343189529811),
   ('time', 0.01793504604944256),
   ('return', 0.0174503150751333),
   ('secret', 0.015511391177896268),
   ('counter', 0.015026660203587009),
   ('base', 0.015026660203587009),
   ('element', 0.014057198254968492)]),
 ('2: addcallback',
  [('player', 0.0426098535286285),
   ('return', 0.041944074567243674),
   ('system', 0.03262316910785619),
   ('move', 0.02862849533954727),
   ('game', 0.018641810918774968),
   ('string', 0.017976031957390146),
   (

In [21]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamalletmodel, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.46077713207231075


In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamallet.malletmodel2ldamodel(ldamalletmodel), corpus, id2word)
vis

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [23]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, iterations=2000):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = ldamallet.LdaMallet(mallet_path,
                            corpus=corpus,
                            num_topics=16,  # Setting number of topics as 16 (K=16)
                            id2word=id2word,
                            iterations=iterations)
        model = ldamallet.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [24]:
# import pickle

# limit=51; start=2; step=1;
# ir_values = [100, 500, 1000, 2000]

# if os.path.exists('all_model_list_non_english_removed') and os.path.exists('all_coherence_values_non_english_removed'):
#     # read all_model_list
#     with open('all_model_list_non_english_removed', 'rb') as f:
#         all_model_list = pickle.load(f)

#     # read all_coherence_values
#     with open('all_coherence_values_non_english_removed', 'rb') as f:
#         all_coherence_values = pickle.load(f)
# else:
#     all_coherence_values = []
#     all_model_list = []
#     for ir in ir_values:
#         model_list, coherence_values = compute_coherence_values(
#             dictionary=id2word, corpus=corpus, texts=data_lemmatized, 
#             start=start, limit=limit, step=step, iterations=ir)
#         all_coherence_values.append(coherence_values)
#         all_model_list.append(model_list)

#     with open('all_coherence_values_non_english_removed', 'wb') as fp:
#         pickle.dump(all_coherence_values, fp)

#     with open('all_model_list_non_english_removed', 'wb') as fp:
#         pickle.dump(all_model_list, fp)

In [25]:
# for i in range(len(ir_values)):
#     ir = ir_values[i]
#     coherence_values = all_coherence_values[i]
#     x = range(start, limit, step)
#     print("Itrations = {}".format(ir))
#     for m, cv in zip(x, all_coherence_values[i]):
#         print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
#     print()
    
#     x = range(start, limit, step)
#     plt.plot(x, coherence_values)
#     plt.xlabel("Num Topics")
#     plt.ylabel("Coherence score")
#     plt.legend(("coherence_values"), loc='best')
#     plt.title('Coherence Score vs Num Topics (step={} ir={})'.format(step, ir))
#     plt.show()

#     x_step = 6
#     x = range(start, limit, x_step)
#     plt.plot(x, coherence_values[::x_step])
#     plt.xlabel("Num Topics")
#     plt.ylabel("Coherence score")
#     plt.legend(("coherence_values"), loc='best')
#     plt.title('Coherence Score vs Num Topics (step={} ir={})'.format(x_step, ir))
#     plt.show()

#     print()

In [26]:
# Initializing the LdaMallet model
ldamalletmodel35 = ldamallet.LdaMallet(mallet_path,
                    corpus=corpus,
                    num_topics=35,  
                    id2word=id2word,
                    alpha=50/35,  
                    iterations=2000) 

# Show Topics
pprint(ldamalletmodel.show_topics(formatted=False))

[('2: addcallback',
  [('player', 0.0426098535286285),
   ('return', 0.041944074567243674),
   ('system', 0.03262316910785619),
   ('move', 0.02862849533954727),
   ('game', 0.018641810918774968),
   ('string', 0.017976031957390146),
   ('input', 0.017310252996005325),
   ('import', 0.016644474034620507),
   ('point', 0.015312916111850865),
   ('println', 0.013981358189081226)]),
 ('8: askgpt',
  [('react', 0.03425042111173498),
   ('style', 0.023582257158899493),
   ('return', 0.021897810218978103),
   ('component', 0.02133632790567097),
   ('page', 0.01909039865244245),
   ('comm', 0.017967434025828188),
   ('false', 0.014598540145985401),
   ('command', 0.01403705783267827),
   ('user', 0.01403705783267827),
   ('display', 0.01403705783267827)]),
 ('12: authorization',
  [('const', 0.04844061048440611),
   ('device', 0.0404777704047777),
   ('sum', 0.033178500331785),
   ('int', 0.0325149303251493),
   ('image', 0.03185136031851361),
   ('float', 0.029860650298606503),
   ('step', 0

In [27]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamalletmodel35, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.5043461806101851


In [28]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamallet.malletmodel2ldamodel(ldamalletmodel35), corpus, id2word)
vis

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [29]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Store the topics, contributions, and keywords for each document in lists
    topics_data = []
    
    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        if len(row_list) == 0:
            # Handle the case where a document doesn't have a topic distribution
            continue
        
        # Sort the topics by contribution and take the dominant topic
        row_list = sorted(row_list, key=lambda x: (x[1]), reverse=True)
        dominant_topic_num, dominant_prop_topic = row_list[0]
        
        # Get the topic keywords
        topic_keywords = ", ".join([word for word, prop in ldamodel.show_topic(dominant_topic_num)])
        
        # Append the data to the list
        topics_data.append((dominant_topic_num, id2word[dominant_topic_num], round(dominant_prop_topic, 4), topic_keywords))
    
    # Create a DataFrame from the collected data
    if topics_data:
        sent_topics_df = pd.DataFrame(topics_data, columns=['Dominant_Topic_Num', 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    
    # Add original text to the end of the output
    if not sent_topics_df.empty:
        sent_topics_df['Text'] = pd.Series(texts)
    
    return sent_topics_df

In [30]:
df_topic_sents_keywords = format_topics_sentences(ldamalletmodel, corpus, data)

In [31]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic_Num', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# drop document_no column
df_dominant_topic = df_dominant_topic.drop(columns=['Document_No'])
# Show
df_dominant_topic.head(10)

Unnamed: 0,Dominant_Topic_Num,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,13,await,0.8548,"error, file, create, const, function, datum, b...",[server.js\n// Required libraries\nimport cors...
1,5,append,0.4387,"type, return, response, function, error, text,...",[write a readme file for this cli:\n\nimport {...
2,1,add,0.4005,"error, run, extension, head, install, treturn,...","[i have a pr for merging `develop` to `main`, ..."
3,1,add,0.2806,"error, run, extension, head, install, treturn,...",[i got \n\n\n\nfrom github action but i got \n...
4,9,async,0.2556,"file, model, build, version, make, datum, crea...",[Today when i check the github desktop of my w...
5,8,askgpt,0.4767,"react, style, return, component, page, comm, f...","[img = np.fromfile(dph_files[0], dtype=np.uint..."
6,12,authorization,0.2774,"const, device, sum, int, image, float, step, u...","[Give me an list of User in python, \n\nUser i..."
7,11,audiofile,0.4438,"number, string, option, param, time, return, s...",[Write a function that can return the long des...
8,9,async,0.2938,"file, model, build, version, make, datum, crea...",[I have a repository for Real-world job board ...
9,6,application,0.3434,"health, literacy, long, high, reduce, people, ...",[reviews.csvSpreadsheetI want you to act as a ...


In [32]:
# get the number of unique topics
unique_topics = df_dominant_topic['Dominant_Topic_Num'].unique()
print("Number of unique topics: {}".format(len(unique_topics)))

Number of unique topics: 16


In [33]:
# save to csv
df_dominant_topic.to_csv('./output/non_english/dominant_topics_16_non_english_removed.csv')

In [34]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic_Num')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(10)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic", "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic,Topic_Perc_Contrib,Keywords,Text
0,0,accept,0.9858,"file, text, output, return, click, true, tag, ...",[I have a nice table describing a curriculum f...
1,0,accept,0.9491,"file, text, output, return, click, true, tag, ...","[On android, the app icon I have is appearing ..."
2,0,accept,0.8535,"file, text, output, return, click, true, tag, ...",[what are a list of python and tkinter tools i...
3,0,accept,0.8248,"file, text, output, return, click, true, tag, ...",[I have a list of file indexes followed by the...
4,0,accept,0.8191,"file, text, output, return, click, true, tag, ...",[Create TS types for the OSM notes API return ...
...,...,...,...,...,...
155,15,axios,0.6461,"issue, step, process, repository, project, wor...",[xy_HOLISTIC_OPENSIM.csvSpreadsheetI'm hoping ...
156,15,axios,0.5833,"issue, step, process, repository, project, wor...",[With a maven pom.xm and one dependency how pr...
157,15,axios,0.5801,"issue, step, process, repository, project, wor...",[import click \n import frontmatter \n \n fro...
158,15,axios,0.5765,"issue, step, process, repository, project, wor...",[I want to get a PNG image of some stat cards ...


In [35]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic_Num'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
topic_num_keywords = topic_num_keywords.drop_duplicates().reset_index(drop=True)
# topic_num_keywords = df_topic_sents_keywords['Dominant_Topic']
# unique_topics = topic_num_keywords['Dominant_Topic'].unique()

# Concatenate Column wise
df_dominant_topics_a = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics_a.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics_a.sort_values(by=['Dominant_Topic']).reset_index(drop=True)

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,accept,"file, text, output, return, click, true, tag, ...",10,0.014
1,add,"error, run, extension, head, install, treturn,...",23,0.0323
2,addcallback,"player, return, system, move, game, string, in...",50,0.0702
3,api,"model, label, shape, definition, disease, prec...",61,0.0857
4,app,"string, public, web, object, table, key, set, ...",57,0.0801
5,append,"type, return, response, function, error, text,...",55,0.0772
6,application,"health, literacy, long, high, reduce, people, ...",19,0.0267
7,arraybuffer,"user, pick, org_gradle, return, string, intern...",36,0.0506
8,askgpt,"react, style, return, component, page, comm, f...",37,0.052
9,async,"file, model, build, version, make, datum, crea...",30,0.0421


In [36]:
# save to csv
sent_topics_sorteddf_mallet.to_csv('./output/non_english/top10_sentences_16_non_english_removed.csv')


In [37]:
df_topic_sents_keywords = format_topics_sentences(ldamalletmodel35, corpus, data)

In [38]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic_Num', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# drop document_no column
df_dominant_topic = df_dominant_topic.drop(columns=['Document_No'])
# Show
df_dominant_topic.head(10)

Unnamed: 0,Dominant_Topic_Num,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,16,back,0.9141,"error, response, file, function, datum, requir...",[server.js\n// Required libraries\nimport cors...
1,16,back,0.4929,"error, response, file, function, datum, requir...",[write a readme file for this cli:\n\nimport {...
2,23,call,0.6851,"issue, time, project, work, list, commit, good...","[i have a pr for merging `develop` to `main`, ..."
3,29,clear,0.5666,"write, run, script, file, version, create, rep...",[i got \n\n\n\nfrom github action but i got \n...
4,23,call,0.3742,"issue, time, project, work, list, commit, good...",[Today when i check the github desktop of my w...
5,8,askgpt,0.2859,"import, append, shape, return, dtype, datetime...","[img = np.fromfile(dph_files[0], dtype=np.uint..."
6,23,call,0.539,"issue, time, project, work, list, commit, good...","[Give me an list of User in python, \n\nUser i..."
7,29,clear,0.2992,"write, run, script, file, version, create, rep...",[Write a function that can return the long des...
8,29,clear,0.4205,"write, run, script, file, version, create, rep...",[I have a repository for Real-world job board ...
9,21,cache,0.2439,"datum, analysis, learn, make, business, nissue...",[reviews.csvSpreadsheetI want you to act as a ...


In [39]:
# save in csv
df_dominant_topic.to_csv('./output/non_english/dominant_topics_35_non_english_removed.csv')

In [40]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic_Num')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(10)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic", "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic,Topic_Perc_Contrib,Keywords,Text
0,0,accept,0.8688,"type, datum, backup, message, device, interfac...",[I am working on a Quarto book \nI need to kn...
1,0,accept,0.8359,"type, datum, backup, message, device, interfac...",[Right now I got stuck on accessing files on A...
2,0,accept,0.7516,"type, datum, backup, message, device, interfac...",[import click \n import frontmatter \n \n fro...
3,0,accept,0.7045,"type, datum, backup, message, device, interfac...","[Generally speaking, how would you order the p..."
4,0,accept,0.6965,"type, datum, backup, message, device, interfac...",[in typescript is there kind of ordered dict? ...
...,...,...,...,...,...
307,34,constructor,0.7028,"file, return, click, text, true, key, main, pa...",[in the following it actually gets stuck at se...
308,34,constructor,0.6821,"file, return, click, text, true, key, main, pa...",[This is likely a very basic networking questi...
309,34,constructor,0.6543,"file, return, click, text, true, key, main, pa...",[How to run one particular spring boot applica...
310,34,constructor,0.5780,"file, return, click, text, true, key, main, pa...",[Can you write a python script to load this cs...


In [41]:
# save in csv
sent_topics_sorteddf_mallet.to_csv('./output/non_english/top10_sentences_35_non_english_removed.csv')