In [1]:
import nltk
nltk.download('stopwords')

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\esagdic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:

# Import Dataset
df = pd.read_json('../pre_processed_data_first_prompts.json')

data = df.values.tolist()

for row in data[:10]:
    print(row)

[' button Button       ::-webkit-scrollbar {         display: none !important;       }       html,       textarea {         background: lightgoldenrodyellow;       }       html,       body,       #container {         height: 100%;         width: 100%;         overflow-x: hidden;       }       #writebox {         font-size: large;         padding: 20px;         width: 100%;         height: 100%;         border: none;         letter-spacing: 2px;         color: rgb(27, 77, 63);         font-family: serif;         font-weight: bold;         line-height: 1.69;         border: none;         outline: none;       }       #clearbutton {         position: absolute;         right: 10px;         bottom: 10px;         width: 70px;         height: 70px;         background-color: pink;         border-radius: 50%;       }       @media (max-width: 600px) {         #clearbutton {           display: block;         }       }       @media (min-width: 601px) {         #clearbutton {           display: none

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


data_words = list(sent_to_words(data))

for row in data_words[:10]:
    print(row)

['button', 'button', 'webkit', 'scrollbar', 'display', 'none', 'important', 'html', 'textarea', 'background', 'html', 'body', 'container', 'height', 'width', 'overflow', 'hidden', 'writebox', 'font', 'size', 'large', 'padding', 'px', 'width', 'height', 'border', 'none', 'letter', 'spacing', 'px', 'color', 'rgb', 'font', 'family', 'serif', 'font', 'weight', 'bold', 'line', 'height', 'border', 'none', 'outline', 'none', 'clearbutton', 'position', 'absolute', 'right', 'px', 'bottom', 'px', 'width', 'px', 'height', 'px', 'background', 'color', 'pink', 'border', 'radius', 'media', 'max', 'width', 'px', 'clearbutton', 'display', 'block', 'media', 'min', 'width', 'px', 'clearbutton', 'display', 'none']
['server', 'js', 'required', 'libraries', 'import', 'cors', 'from', 'cors', 'import', 'axios', 'from', 'axios', 'import', 'fs', 'from', 'fs', 'import', 'express', 'from', 'express', 'import', 'from', 'define', 'https', 'credentials', 'using', 'the', 'file', 'system', 'fs', 'to', 'read', 'the', 

In [5]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
for trigram in trigram_mod[bigram_mod[data_words[:10]]]:
    print(trigram)

['button', 'button', 'webkit', 'scrollbar', 'display', 'none', 'important', 'html', 'textarea', 'background', 'html', 'body', 'container', 'height', 'width', 'overflow', 'hidden', 'writebox', 'font_size', 'large', 'padding', 'px', 'width', 'height', 'border', 'none', 'letter', 'spacing', 'px', 'color_rgb', 'font', 'family', 'serif', 'font_weight', 'bold', 'line', 'height', 'border', 'none', 'outline', 'none', 'clearbutton', 'position', 'absolute', 'right', 'px', 'bottom', 'px', 'width_px', 'height_px', 'background_color', 'pink', 'border_radius', 'media', 'max', 'width_px', 'clearbutton', 'display', 'block', 'media', 'min', 'width_px', 'clearbutton', 'display', 'none']
['server', 'js', 'required', 'libraries', 'import', 'cors', 'from', 'cors', 'import', 'axios', 'from', 'axios', 'import', 'fs', 'from', 'fs', 'import', 'express', 'from', 'express', 'import', 'from', 'define', 'https', 'credentials', 'using', 'the', 'file', 'system', 'fs', 'to', 'read', 'the', 'key', 'and', 'certificate'

In [6]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


In [7]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# print
for row in data_words_bigrams[:10]:
    print(row)

['button', 'button', 'webkit', 'scrollbar', 'display', 'none', 'important', 'html', 'textarea', 'background', 'html', 'body', 'container', 'height', 'width', 'overflow', 'hidden', 'writebox', 'font_size', 'large', 'padding', 'px', 'width', 'height', 'border', 'none', 'letter', 'spacing', 'px', 'color_rgb', 'font', 'family', 'serif', 'font_weight', 'bold', 'line', 'height', 'border', 'none', 'outline', 'none', 'clearbutton', 'position', 'absolute', 'right', 'px', 'bottom', 'px', 'width_px', 'height', 'px', 'background_color', 'pink', 'border_radius', 'media', 'max', 'width_px', 'clearbutton', 'display', 'block', 'media', 'min', 'width_px', 'clearbutton', 'display', 'none']
['server', 'js', 'required', 'libraries', 'import', 'cors', 'cors', 'import', 'axios', 'axios', 'import', 'fs', 'fs', 'import', 'express', 'express', 'import', 'define', 'https', 'credentials', 'using', 'file', 'system', 'fs', 'read', 'key', 'certificate', 'files', 'const', 'options', 'key', 'fs_readfilesync', 'opt', 

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 640.0 kB/s eta 0:00:20
     --------------------------------------- 0.1/12.8 MB 812.7 kB/s eta 0:00:16
      --------------------------------------- 0.2/12.8 MB 1.5 MB/s eta 0:00:09
      --------------------------------------- 0.3/12.8 MB 1.9 MB/s eta 0:00:07
     - -------------------------------------- 0.4/12.8 MB 1.9 MB/s eta 0:00:07
     - -------------------------------------- 0.5/12.8 MB 2.1 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.2 MB/s eta 0:00:06
     -- ------------------------------------- 0.8/12.8 MB 2.3 MB/s eta 0:00:06
     -- ------------------------------------- 0.9/12.8 MB 2.3 MB/s eta 0:00:06
     --- ------------------------------



In [9]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

for row in data_lemmatized[:10]:
    print(row)

['display', 'none', 'important', 'container', 'height', 'width', 'overflow', 'hidden', 'writebox', 'font_size', 'large', 'padding', 'width', 'height', 'border', 'none', 'letter', 'space', 'family', 'serif', 'font_weight', 'bold', 'line', 'height', 'border', 'none', 'outline', 'none', 'position', 'absolute', 'bottom', 'px', 'width_px', 'height', 'pink', 'medium', 'width_px', 'display', 'block', 'medium', 'min', 'width_px', 'clearbutton', 'display', 'none']
['server', 'require', 'library', 'import', 'cor', 'cor', 'import', 'axio', 'axio', 'import', 'fs', 'import', 'express', 'express', 'import', 'define', 'https', 'credential', 'use', 'file', 'system', 'read', 'key', 'certificate', 'file', 'const', 'option', 'key', 'key', 'path', 'private', 'key', 'cert', 'crt', 'path', 'certificate', 'file', 'create', 'instance', 'express', 'application', 'const', 'let', 'promptresponse', 'api', 'import', 'promptgpt', 'promptgpt', 'use', 'cor', 'middleware', 'handle', 'cross', 'origin', 'resource', 'sha

In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

corpus = [doc for doc in corpus if len(doc)>1]

for row in corpus[:10]:
    print(row)

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 3), (8, 1), (9, 1), (10, 1), (11, 4), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 5), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 3), (30, 1)]
[(13, 1), (31, 1), (32, 2), (33, 1), (34, 7), (35, 3), (36, 2), (37, 3), (38, 1), (39, 3), (40, 3), (41, 12), (42, 2), (43, 2), (44, 3), (45, 4), (46, 1), (47, 1), (48, 1), (49, 2), (50, 6), (51, 2), (52, 5), (53, 1), (54, 1), (55, 10), (56, 4), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 2), (64, 3), (65, 4), (66, 11), (67, 1), (68, 1), (69, 1), (70, 1), (71, 4), (72, 6), (73, 1), (74, 1), (75, 1), (76, 2), (77, 15), (78, 1), (79, 3), (80, 3), (81, 2), (82, 8), (83, 1), (84, 17), (85, 3), (86, 5), (87, 2), (88, 1), (89, 1), (90, 4), (91, 8), (92, 1), (93, 2), (94, 2), (95, 5), (96, 2), (97, 8), (98, 2), (99, 5), (100, 3), (101, 1), (102, 3), (103, 1), (104, 5), (105, 4), (106, 1), (107, 13), (108, 2), (10

In [12]:
# Human readable format of corpus (term-frequency)
for row in [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:10]]:
    print(row)

[('absolute', 1), ('block', 1), ('bold', 1), ('border', 2), ('bottom', 1), ('clearbutton', 1), ('container', 1), ('display', 3), ('family', 1), ('font_size', 1), ('font_weight', 1), ('height', 4), ('hidden', 1), ('important', 1), ('large', 1), ('letter', 1), ('line', 1), ('medium', 2), ('min', 1), ('none', 5), ('outline', 1), ('overflow', 1), ('padding', 1), ('pink', 1), ('position', 1), ('px', 1), ('serif', 1), ('space', 1), ('width', 2), ('width_px', 3), ('writebox', 1)]
[('important', 1), ('accept', 1), ('add', 2), ('addcallback', 1), ('api', 7), ('app', 3), ('append', 2), ('application', 3), ('arraybuffer', 1), ('askgpt', 3), ('async', 3), ('audio', 12), ('audiofile', 2), ('authorization', 2), ('await', 3), ('axio', 4), ('axios', 1), ('back', 1), ('base', 1), ('binary', 2), ('body', 6), ('buffer', 2), ('cache', 5), ('cachekey', 1), ('call', 1), ('callback', 10), ('catch', 4), ('catch_err', 1), ('cert', 1), ('certificate', 2), ('choice', 1), ('class', 1), ('clear', 1), ('completetim

In [13]:
import os

def train_or_load_lda_model(corpus, id2word, num_topics, save_path='./models'):
    # Construct the model filename based on the number of topics
    model_filename = f'lda_model_{num_topics}_topics.lda'
    model_filepath = os.path.join(save_path, model_filename)
    
    # Check if the model file exists
    if os.path.isfile(model_filepath):
        print(f"Loading model from {model_filepath}")
        lda_model = gensim.models.ldamodel.LdaModel.load(model_filepath)
    else:
        print(f"Training new LDA model with {num_topics} topics.")
        # Train the LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=2000,
                                                    alpha=50/num_topics,  # Updated alpha
                                                    eta=0.01,  # Updated eta (beta)
                                                    per_word_topics=True)
        
        # Save the model to disk
        print(f"Saving model to {model_filepath}")
        lda_model.save(model_filepath)
    
    return lda_model


In [14]:
lda_model =  train_or_load_lda_model(corpus, id2word, num_topics=16)

Loading model from ./models\lda_model_16_topics.lda


In [15]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.116*"error" + 0.062*"const" + 0.055*"await" + 0.055*"response" + '
  '0.040*"message" + 0.030*"app" + 0.026*"status" + 0.025*"console_log" + '
  '0.025*"async" + 0.023*"try"'),
 (1,
  '0.116*"return" + 0.062*"self" + 0.062*"name" + 0.055*"string" + '
  '0.047*"value" + 0.044*"type" + 0.038*"key" + 0.033*"none" + 0.029*"object" '
  '+ 0.028*"action"'),
 (2,
  '0.119*"const" + 0.053*"prop" + 0.049*"return" + 0.045*"createsignal" + '
  '0.038*"default" + 0.037*"item" + 0.037*"createeffect" + 0.032*"frontend" + '
  '0.030*"requirement" + 0.028*"import"'),
 (3,
  '0.259*"create" + 0.149*"text" + 0.111*"output" + 0.105*"model" + '
  '0.078*"new" + 0.059*"implement" + 0.035*"change" + 0.025*"default" + '
  '0.021*"single" + 0.020*"use"'),
 (4,
  '0.141*"file" + 0.062*"solve" + 0.050*"set" + 0.049*"edit" + 0.040*"need" + '
  '0.030*"example" + 0.028*"working" + 0.027*"promise" + 0.027*"favor" + '
  '0.027*"small_avoid"'),
 (5,
  '0.149*"function" + 0.100*"result" + 0.079*"let" + 0.04

In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [17]:
lda_model_35 =  train_or_load_lda_model(corpus, id2word, num_topics=35)

Loading model from ./models\lda_model_35_topics.lda


In [18]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.116*"error" + 0.062*"const" + 0.055*"await" + 0.055*"response" + '
  '0.040*"message" + 0.030*"app" + 0.026*"status" + 0.025*"console_log" + '
  '0.025*"async" + 0.023*"try"'),
 (1,
  '0.116*"return" + 0.062*"self" + 0.062*"name" + 0.055*"string" + '
  '0.047*"value" + 0.044*"type" + 0.038*"key" + 0.033*"none" + 0.029*"object" '
  '+ 0.028*"action"'),
 (2,
  '0.119*"const" + 0.053*"prop" + 0.049*"return" + 0.045*"createsignal" + '
  '0.038*"default" + 0.037*"item" + 0.037*"createeffect" + 0.032*"frontend" + '
  '0.030*"requirement" + 0.028*"import"'),
 (3,
  '0.259*"create" + 0.149*"text" + 0.111*"output" + 0.105*"model" + '
  '0.078*"new" + 0.059*"implement" + 0.035*"change" + 0.025*"default" + '
  '0.021*"single" + 0.020*"use"'),
 (4,
  '0.141*"file" + 0.062*"solve" + 0.050*"set" + 0.049*"edit" + 0.040*"need" + '
  '0.030*"example" + 0.028*"working" + 0.027*"promise" + 0.027*"favor" + '
  '0.027*"small_avoid"'),
 (5,
  '0.149*"function" + 0.100*"result" + 0.079*"let" + 0.04

In [19]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [20]:
for i, row_list in enumerate(lda_model[corpus]):
    print(row_list)
    if isinstance(row, tuple) and len(row) == 2 and isinstance(row[1], (float, int)):

        print(sorted(row_list, key=lambda x: (x[1]), reverse=True))
    if i > 5:
        break

([(0, 0.038112376), (1, 0.09908983), (2, 0.038112376), (3, 0.038112376), (4, 0.038112376), (5, 0.12115441), (6, 0.064051546), (7, 0.038340226), (8, 0.05030593), (9, 0.0874085), (10, 0.1600552), (11, 0.06670499), (12, 0.038112376), (13, 0.038112376), (14, 0.04610273), (15, 0.038112376)], [(0, []), (1, [5]), (2, [9]), (3, [6]), (4, [5]), (5, []), (6, [5, 6]), (7, [11, 14]), (8, []), (9, [10]), (10, []), (11, [10]), (12, []), (13, [9]), (14, [9]), (15, []), (16, [5, 9, 7]), (17, []), (18, []), (19, [1]), (20, [9]), (21, []), (22, [5]), (23, []), (24, [8]), (25, [10]), (26, []), (27, [10]), (28, [5]), (29, [10]), (30, [])], [(0, []), (1, [(5, 0.999963)]), (2, [(9, 0.9988096)]), (3, [(6, 1.9992911)]), (4, [(5, 0.99978113)]), (5, []), (6, [(5, 0.8723726), (6, 0.127574)]), (7, [(11, 2.3454845), (14, 0.65411234)]), (8, []), (9, [(10, 0.9997276)]), (10, []), (11, [(10, 3.9996717)]), (12, []), (13, [(9, 0.9997209)]), (14, [(9, 0.9998502)]), (15, []), (16, [(5, 0.9373782), (7, 0.018682122), (9, 0

In [21]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Store the topics, contributions, and keywords for each document in lists
    topics_data = []
    
    # To keep track of document index that has a valid topic distribution
    indexes_with_topics = []
    
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        # row is a tuple of 3 items in the format you've provided.
        # We only care about the first item for the dominant topic.
        topic_proportions, word_ids, word_contributions = row
        
        # Now we sort only the topic_proportions part to find the dominant topic.
        if topic_proportions:  # Check if topic_proportions is not empty
            topic_proportions = sorted(topic_proportions, key=lambda x: x[1], reverse=True)
            dominant_topic_num, dominant_prop_topic = topic_proportions[0]
            
            # Extract the dominant topic keywords
            topic_keywords = ", ".join([word for word, prob in ldamodel.show_topic(dominant_topic_num)])
            
            # Append to the topics_data list
            topics_data.append((id2word[dominant_topic_num], round(dominant_prop_topic, 4), topic_keywords))
            indexes_with_topics.append(i)  # Add index to the list

    # Create the DataFrame from the topics_data list
    if topics_data:
        sent_topics_df = pd.DataFrame(topics_data, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
        
    # Add the texts only if sent_topics_df is not empty and only for those indexes with valid topic distribution
    if not sent_topics_df.empty:
        # Filter texts to include only those that correspond to indexes with topics
        filtered_texts = [text for i, text in enumerate(texts) if i in indexes_with_topics]
        sent_topics_df['Text'] = filtered_texts

    return sent_topics_df


In [22]:
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, data)

In [23]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# drop document_no column
df_dominant_topic = df_dominant_topic.drop(columns=['Document_No'])
# Show
df_dominant_topic.head(10)

Unnamed: 0,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,font_weight,0.1601,"use, set, echo, part, write, call, work, inste...",[ button Button ::-webkit-scrollbar { ...
1,absolute,0.3962,"error, const, await, response, message, app, s...",[server.js // Required libraries import cors f...
2,block,0.1875,"return, self, name, string, value, type, key, ...",[write a readme file for this cli: import { pr...
3,important,0.1086,"write, start, code, example, plan, work, need,...","[i have a pr for merging `develop` to `main`, ..."
4,large,0.2167,"user, get, new, test, class, say, web, public,...",[i got ``` ============================= test...
5,font_size,0.1009,"junior, ai, project, prompt, file, development...",[Today when i check the github desktop of my w...
6,family,0.0884,"datum, issue, make, add, go, store, first, tim...","[img = np.fromfile(dph_files[0], dtype=np.uint..."
7,block,0.11,"return, self, name, string, value, type, key, ...","[Give me an list of User in python, User is a..."
8,clearbutton,0.0916,"function, result, let, line, color, package, n...",[Write a function that can return the long des...
9,clearbutton,0.2115,"function, result, let, line, color, package, n...",[ let urlParams = new URL(location.href); ...


In [24]:
# save to csv
df_dominant_topic.to_csv('./output/dominant_topics_16_first_prompts_gensim_only.csv')

In [25]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(10)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ["Topic", "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic,Topic_Perc_Contrib,Keywords,Text
0,absolute,0.4583,"error, const, await, response, message, app, s...","[You are Junior, an AI system aiding developer..."
1,absolute,0.4546,"error, const, await, response, message, app, s...",[How to check type hints in a whole Python rep...
2,absolute,0.3983,"error, const, await, response, message, app, s...",[# Working set src/backend/setupRoutes.js: ```...
3,absolute,0.3962,"error, const, await, response, message, app, s...",[server.js // Required libraries import cors f...
4,absolute,0.3887,"error, const, await, response, message, app, s...",[# Working set src/backend/setupRoutes.js: ```...
...,...,...,...,...
155,letter,0.2836,"import, src, file, export, component, frontend...",[# Working set src/frontend/App.jsx: ``` impor...
156,letter,0.2822,"import, src, file, export, component, frontend...",[# Working set src/frontend/App.jsx: ``` impor...
157,letter,0.2812,"import, src, file, export, component, frontend...",[# Working set src/frontend/App.jsx: ``` impor...
158,letter,0.2782,"import, src, file, export, component, frontend...",[# Working set src/frontend/App.jsx: ``` impor...


In [26]:
# save to csv
sent_topics_sorteddf_mallet.to_csv('./output/top10_sentences_16_first_prompts_gensim_only.csv')


In [28]:
df_topic_sents_keywords = format_topics_sentences(lda_model_35, corpus, data)

In [29]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# drop document_no column
df_dominant_topic = df_dominant_topic.drop(columns=['Document_No'])
# Show
df_dominant_topic.head(10)

Unnamed: 0,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,outline,0.1633,"key, output, try, possible, display, simple, v...",[ button Button ::-webkit-scrollbar { ...
1,absolute,0.2012,"error, message, status, err, console_log, re, ...",[server.js // Required libraries import cors f...
2,pink,0.1059,"return, value, none, true, else, option, false...",[write a readme file for this cli: import { pr...
3,line,0.0612,"main, command, web, package, git, game, module...","[i have a pr for merging `develop` to `main`, ..."
4,large,0.1634,"test, say, include, end, default, configuratio...",[i got ``` ============================= test...
5,padding,0.0621,"self, project, api, development, contributor, ...",[Today when i check the github desktop of my w...
6,writebox,0.0601,"model, datum, next, data, body, size, applicat...","[img = np.fromfile(dph_files[0], dtype=np.uint..."
7,pink,0.0601,"return, value, none, true, else, option, false...","[Give me an list of User in python, User is a..."
8,line,0.0574,"main, command, web, package, git, game, module...",[Write a function that can return the long des...
9,clearbutton,0.1076,"set, script, full, description, output, heredo...",[ let urlParams = new URL(location.href); ...


In [30]:
# save in csv
df_dominant_topic.to_csv('./output/dominant_topics_35_first_prompts_gensim_only.csv')

In [60]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(10)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ["Topic", "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic,Topic_Perc_Contrib,Keywords,Text
0,absolute,0.3663,"error, message, status, err, console_log, re, ...","[You are Junior, an AI system aiding developer..."
1,absolute,0.3342,"error, message, status, err, console_log, re, ...",[# Working set src/backend/setupRoutes.js: ```...
2,absolute,0.3334,"error, message, status, err, console_log, re, ...",[# Working set src/backend/setupRoutes.js: ```...
3,absolute,0.2512,"error, message, status, err, console_log, re, ...",[How to check type hints in a whole Python rep...
4,absolute,0.2332,"error, message, status, err, console_log, re, ...",[# Working set src/frontend/components/GitStat...
...,...,...,...,...
345,writebox,0.0887,"model, datum, next, data, body, size, applicat...",[# Working set src/execute/executeCode.js: ```...
346,writebox,0.0880,"model, datum, next, data, body, size, applicat...",[# Working set src/execute/executeAndForwardOu...
347,writebox,0.0842,"model, datum, next, data, body, size, applicat...",[# Working set src/execute/executeAndForwardOu...
348,writebox,0.0818,"model, datum, next, data, body, size, applicat...",[The `websocat` program has a number of option...


In [None]:
# save in csv
sent_topics_sorteddf_mallet.to_csv('./output/top10_sentences_35_first_prompts_gensim_only.csv')