In [2]:
#Import Lib
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
from collections import defaultdict, Counter
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import swifter
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud,STOPWORDS


from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth


# Seaborn's current colour palette.
color_pal = sns.color_palette()
# spaCy English pipeline; lemma_texts_parallel below runs its texts through this object.
nlp = spacy.load('en_core_web_sm')
# NLTK SentimentIntensityAnalyzer instance.
sia = SentimentIntensityAnalyzer()

  from .autonotebook import tqdm as notebook_tqdm


## Data Collection:
- We use the Stack Exchange Data Explorer to run the query below and export the results:
```sql
SELECT TOP 25000 
  q.Id AS QuestionId, 
  q.Title,
  q.Body,
  q.CreationDate,
  q.Score,
  q.ViewCount,
  q.AnswerCount,
  a.Id AS AcceptedAnswerId,
  a.Body AS AcceptedAnswerBody,
  a.Score AS AcceptedAnswerScore
FROM Posts q
LEFT JOIN Posts a ON q.AcceptedAnswerId = a.Id
JOIN PostTags pt ON q.Id = pt.PostId
JOIN Tags t ON pt.TagId = t.Id
WHERE t.TagName = 'nlp'
ORDER BY q.CreationDate DESC
```

In [3]:
# Read the exported query results from the local CSV file, then display the frame.
df_post = pd.read_csv("QueryResults_Post.csv")
df_post

Unnamed: 0,QuestionId,Title,Body,CreationDate,Score,ViewCount,AnswerCount,AcceptedAnswerId,AcceptedAnswerBody,AcceptedAnswerScore
0,79557354,Sentencepiece not generating models after prep...,<p>So this is the log that I see on the termin...,2025-04-05 18:21:09,0,20,0,,,
1,79557315,How should I approach the word synonyms for re...,<p>I have created an aspect based list for ana...,2025-04-05 17:32:46,-1,92,1,,,
2,79557313,No attention output in jinaai/jina-embeddings-...,<p>When I use this model like so -</p>\n<pre><...,2025-04-05 17:29:15,0,15,0,,,
3,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,2025-04-02 05:56:11,0,68,1,79552218.0,<p>The configuration file is missing the 'labe...,1.0
4,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,2025-04-01 09:21:17,0,46,1,79551169.0,<p>The author of the tutorial you mentioned se...,1.0
...,...,...,...,...,...,...,...,...,...,...
20437,42489,"How to implement a ""related"" degree measure al...",<p>I was going to Ask a Question earlier today...,2008-09-03 20:21:04,8,456,2,42532.0,<p>One such way to implement such an algorithm...,5.0
20438,41424,"How do you implement a ""Did you mean""?",<blockquote>\n <p><strong>Possible Duplicate:...,2008-09-03 10:36:13,118,33200,11,41448.0,<p>Actually what Google does is very much non-...,87.0
20439,36533,Vista speech recognition in multiple languages,"<p>my primary language is spanish, but I use a...",2008-08-31 01:08:48,3,5661,6,36684.0,"<p>Citation from Vista <a href=""http://blogs.m...",8.0
20440,25332,What's a good natural language library to use ...,<p>I'm looking for an existing library to summ...,2008-08-24 20:57:33,14,6491,4,,,


### Generate graphs using popular python libraries to visualise the data.

## Preprocess Data

### Function Preparation

In [4]:
# This function is to clean text, can remove html tag, some punctuation, non-ASCII, and intensifier

# Intensifier adverbs/adjectives that clean_text drops (matched case-insensitively).
intensifiers = {
    "very", "really", "extremely", "absolutely", "totally",
    "highly", "deeply", "strongly", "incredibly", "exceptionally",
    "remarkably", "unbelievably", "insanely", "awfully", "horribly",
    "hugely", "immensely", "overly", "particularly", "significantly",
    "seriously", "tremendously", "wildly", "super", "ultra",
    "crazy", "majorly",
}


def clean_text(text, remove_code=True):
    """Turn an HTML fragment into plain, lightly normalised text.

    Parameters
    ----------
    text : any
        Raw value, normally an HTML string. ``None``, ``""`` and float NaN
        yield ``""``; any other non-string value is converted with ``str()``.
    remove_code : bool, default True
        When true, every ``<code>`` and ``<pre>`` element is removed before
        the text is extracted.

    Returns
    -------
    str
        The text with HTML tags, ``http...`` URLs, a fixed set of punctuation
        marks, ``@word`` mentions, non-ASCII characters, square brackets and
        intensifier words removed; remaining words are joined by single spaces.
    """
    # Missing values map to an empty string.
    if text is None or text == "":
        return ""
    if isinstance(text, float) and np.isnan(text):
        return ""
    # Anything else that is not a string (numbers, ...) is stringified.
    if not isinstance(text, str):
        text = str(text)

    parsed = BeautifulSoup(text, "html.parser")
    if remove_code:
        for node in parsed.find_all(['code', 'pre']):
            node.decompose()

    plain = parsed.get_text(separator=" ", strip=True)
    # Collapse whitespace runs before the pattern-based removals.
    plain = re.sub(r"\s+", " ", plain).strip()
    # In order: URLs, selected punctuation, '@' followed by word characters.
    for pattern in (r"http\S+", r'[\"\'!?\.;,:\-\(\)]', r'@\w+'):
        plain = re.sub(pattern, '', plain)

    # Keep ASCII characters only, then strip square brackets.
    plain = ''.join(ch for ch in plain if ord(ch) < 128)
    plain = re.sub(r'\[|\]', '', plain)

    # Drop intensifiers; split()/join also normalises any leftover spacing.
    return " ".join(word for word in plain.split() if word.lower() not in intensifiers)

In [5]:
#This function to help us extract the code part of the text out and then store it in another column
def get_code(html_content):
    """Collect the text of every ``<code>`` / ``<pre>`` element in an HTML string.

    Parameters
    ----------
    html_content : str or None or float
        HTML to scan. ``None``, ``""`` and float NaN yield ``""``.

    Returns
    -------
    str
        The non-empty code snippets joined by ``"\\n---\\n"``, or ``""`` when
        there are none.
    """
    is_nan = isinstance(html_content, float) and np.isnan(html_content)
    if html_content is None or html_content == "" or is_nan:
        return ""

    parsed = BeautifulSoup(html_content, "html.parser")

    snippets = []
    for node in parsed.find_all(['code', 'pre']):
        snippet = node.get_text(strip=True)
        if snippet:
            snippets.append(snippet)
        # Each visited node is removed from the tree right away (the original
        # comment gives "avoid duplication" as the reason), so the order of
        # get_text() and decompose() inside the loop must stay as is.
        node.decompose()

    # Joining an empty list already gives "".
    return "\n---\n".join(snippets)

In [6]:
# Function to remove stop words
# Lazily filled cache so the NLTK stop-word list is loaded once, not once per row.
_ENGLISH_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on first use only."""
    global _ENGLISH_STOPWORDS_CACHE
    if _ENGLISH_STOPWORDS_CACHE is None:
        _ENGLISH_STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS_CACHE


def remove_stopwords(text):
    """Lower-case ``text``, tokenize it and drop English stop words.

    Parameters
    ----------
    text : any
        Text to filter. Any non-string value (None, NaN, numbers) yields ``""``.

    Returns
    -------
    str
        The remaining lower-cased tokens joined by single spaces.
    """
    # Handle non-string inputs
    if not isinstance(text, str):
        return ""

    stop_words = _english_stopwords()
    # Tokens are already lower-case here, so no second .lower() is needed.
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

In [7]:
# Lemmatization function (code written with AI help - Reference (4))
def lemma_texts_parallel(texts, batch_size=50, n_process=4):
    """Lemmatize a collection of texts with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : iterable
        Items to lemmatize; each one is converted with ``str()`` first.
    batch_size : int, default 50
        Forwarded to ``nlp.pipe``.
    n_process : int, default 4
        Forwarded to ``nlp.pipe`` (number of worker processes).

    Returns
    -------
    pandas.Series
        One string per input text, in input order, made of the ``lemma_`` of
        every token joined by single spaces. The Series has a fresh default
        index, so assign it only to a frame that also has a default index.
    """
    as_strings = [str(text) for text in texts]
    docs = nlp.pipe(as_strings, batch_size=batch_size, n_process=n_process)
    return pd.Series([" ".join(token.lemma_ for token in doc) for doc in docs])

In [8]:
def wc_generating(data, title):
    """Build a word cloud from a text column and save it as ``<title>.png``.

    Parameters
    ----------
    data : pandas.Series
        A single DataFrame column whose rows are concatenated into one text;
        missing values are replaced by ``''`` before joining.
    title : str
        Output file name (without extension); make it meaningful.
    """
    # Merge every row into one big space-separated text.
    corpus = ' '.join(data.fillna(''))

    cloud = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,  # wordcloud's built-in stop-word set
        height=600,
        width=400,
    )
    cloud.generate(corpus)
    cloud.to_file(f'{title}.png')

In [9]:
def remove_noise_word(text, noise_words):
    """Strip punctuation from ``text`` and drop every word found in ``noise_words``.

    Parameters
    ----------
    text : any
        Text to filter. Any non-string value (None, NaN, ...) yields ``""``
        instead of raising, matching ``remove_stopwords``.
    noise_words : container of str
        Lower-case words to remove; the comparison lower-cases each word of
        ``text`` but leaves ``noise_words`` untouched.

    Returns
    -------
    str
        The remaining words (original casing) joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove noise words
    return " ".join(word for word in text.split() if word.lower() not in noise_words)

In [10]:
# Keep only the posts whose accepted-answer body is present. reset_index() is
# called without drop=True, so the previous row labels survive in an 'index' column.
df_post_answer = df_post[df_post['AcceptedAnswerBody'].notna()].reset_index()

In [11]:
# Copy the text of the <code>/<pre> elements of the question and of the accepted
# answer into their own columns (get_code joins the snippets with "\n---\n").
df_post_answer['Question_Code'] = df_post_answer['Body'].apply(get_code)
df_post_answer['Answer_Code'] = df_post_answer['AcceptedAnswerBody'].apply(get_code)

In [12]:
# Clean title, question body and accepted-answer body. clean_text is called with
# its default remove_code=True, so <code>/<pre> content is excluded from these columns.
df_post_answer['Title_Clean'] = df_post_answer['Title'].apply(clean_text) 
df_post_answer['Body_Clean'] = df_post_answer['Body'].apply(clean_text)
df_post_answer['AcceptedAnswerBody_Clean'] = df_post_answer['AcceptedAnswerBody'].apply(clean_text)

In [None]:
# Concatenate cleaned title, question body and accepted-answer body, space-separated.
df_post_answer['combination_text_all'] = df_post_answer['Title_Clean'] + " " + df_post_answer['Body_Clean'] + " " +  df_post_answer['AcceptedAnswerBody_Clean']

In [None]:
# Remove stop words from the combined title + question + answer text.
# The source column is 'combination_text_all' (built just above); the former
# name 'combination_text' is not created anywhere in this notebook and raised
# KeyError on a fresh kernel.
df_post_answer['combination_text_all_no_stopw'] = df_post_answer['combination_text_all'].apply(remove_stopwords)

In [None]:
# Lemmatize the stop-word-free text. lemma_texts_parallel returns a Series with a
# default 0..n-1 index, which lines up with df_post_answer because that frame was
# built with reset_index().
text = df_post_answer['combination_text_all_no_stopw'].tolist()
lemma_text = lemma_texts_parallel(text)
df_post_answer['combination_text_all_lemma'] = lemma_text

In [14]:
# Display the frame after preprocessing.
# NOTE(review): the saved output lists 'combination_text', 'combination_text_no_stopw'
# and 'combination_text_clean', while the cells above write 'combination_text_all*'
# columns — the output is presumably from an earlier version; re-run to refresh.
df_post_answer

Unnamed: 0,index,QuestionId,Title,Body,CreationDate,Score,ViewCount,AnswerCount,AcceptedAnswerId,AcceptedAnswerBody,AcceptedAnswerScore,Question_Code,Answer_Code,Title_Clean,Body_Clean,AcceptedAnswerBody_Clean,combination_text,combination_text_no_stopw,combination_text_clean
0,3,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,2025-04-02 05:56:11,0,68,1,79552218.0,<p>The configuration file is missing the 'labe...,1.0,"import spacy\n\nnlp = spacy.load(""pl_core_news...",labels_to_ignore:\n - O\n---\nnlp_engine_na...,Why does Presidio with spacy nlp engine not re...,Im using spaCy with the pl_core_news_lg model ...,The configuration file is missing the labels_t...,Why does Presidio with spacy nlp engine not re...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...
1,4,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,2025-04-01 09:21:17,0,46,1,79551169.0,<p>The author of the tutorial you mentioned se...,1.0,,-100\n---\nignore_index\n---\nignore_index\n--...,GPT2 and other models from huggingface 100 lab...,I understand the 100 label id is used so that ...,The author of the tutorial you mentioned sets ...,GPT2 and other models from huggingface 100 lab...,gpt2 models huggingface 100 label index traini...,gpt2 model huggingface 100 label index trainin...
2,12,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,2025-03-20 14:36:02,0,125,1,79523777.0,<p>You have to restart the session for the und...,1.0,!pip install gensim\n---\n/usr/local/lib/pytho...,numpy\n---\nnumpy\n---\nscipy,Trouble getting importing gensim to work in colab,I am trying to import gensim into colab I get ...,You have to restart the session for the underl...,Trouble getting importing gensim to work in co...,trouble getting importing gensim work colab tr...,trouble getting import gensim work colab try i...
3,21,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,2025-03-11 14:50:31,0,36,1,79501337.0,<p>I can't test it but ...</p>\n<p>I checked <...,1.0,server\n---\nSSH\n---\nskip_tokens = [1] # sk...,"matplotlib\n---\nshow=True\n---\nfig, ax\n---\...",Store images instead of showing in a server,I am running the code found on this site in my...,I cant test it but I checked source code and i...,Store images instead of showing in a server I ...,store images instead showing server running co...,store image instead show server run code find ...
4,29,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,2025-03-03 22:27:07,4,230,2,79495969.0,<p>After some test I was able to find the solu...,-2.0,from presidio_anonymizer import PresidioAnonym...,"config = {\n ""nlp_engine_name"": ""spacy"",\n ...",Presidio with Langchain Experimental does not ...,I am using presidio/langchain_experimental to ...,After some test I was able to find the solutio...,Presidio with Langchain Experimental does not ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8505,20436,62328,Is there an algorithm that tells the semantic ...,"<p>input: phrase 1, phrase 2</p>\n\n<p>output:...",2008-09-15 12:26:42,65,49889,11,63076.0,<hr>\n\n<p>You might want to check out this pa...,44.0,,,Is there an algorithm that tells the semantic ...,input phrase 1 phrase 2 output semantic simila...,You might want to check out this paper Sentenc...,Is there an algorithm that tells the semantic ...,algorithm tells semantic similarity two phrase...,algorithm tell semantic similarity two phrase ...
8506,20437,42489,"How to implement a ""related"" degree measure al...",<p>I was going to Ask a Question earlier today...,2008-09-03 20:21:04,8,456,2,42532.0,<p>One such way to implement such an algorithm...,5.0,,,How to implement a related degree measure algo...,I was going to Ask a Question earlier today wh...,One such way to implement such an algorithm wo...,How to implement a related degree measure algo...,implement related degree measure algorithm goi...,implement relate degree measure algorithm go a...
8507,20438,41424,"How do you implement a ""Did you mean""?",<blockquote>\n <p><strong>Possible Duplicate:...,2008-09-03 10:36:13,118,33200,11,41448.0,<p>Actually what Google does is very much non-...,87.0,<spell_checked_word>,,How do you implement a Did you mean,Possible Duplicate How does the Google Did you...,Actually what Google does is much nontrivial a...,How do you implement a Did you mean Possible D...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...
8508,20439,36533,Vista speech recognition in multiple languages,"<p>my primary language is spanish, but I use a...",2008-08-31 01:08:48,3,5661,6,36684.0,"<p>Citation from Vista <a href=""http://blogs.m...",8.0,,,Vista speech recognition in multiple languages,my primary language is spanish but I use all m...,Citation from Vista speech recognition blog In...,Vista speech recognition in multiple languages...,vista speech recognition multiple languages pr...,vista speech recognition multiple language pri...


In [None]:
# Word cloud of the lemmatized combined text; written to "First Attempt Combination.png".
wc_generating(df_post_answer['combination_text_all_lemma'],"First Attempt Combination")

In [13]:
# First-pass list of generic words to drop from the texts.
noise_words1 = {'word','example','well','output','code','text','string','sentence','model','work','result','see'}
# Second pass: the first list plus further generic words (built as a union so the
# shared entries are written once; the duplicated 'two' entry is gone).
noise_words2 = noise_words1 | {'one','give','need','list','find','know','look','follow','file','m','etc','two','try','way','use','want','different','seem'}


# NOTE: remove_noise_word is also defined in an earlier cell; this copy rebinds the name.
def remove_noise_word(text, noise_words):
    """Strip punctuation from ``text`` and drop every word found in ``noise_words``.

    Parameters
    ----------
    text : any
        Text to filter. Any non-string value (None, NaN, ...) yields ``""``
        instead of raising.
    noise_words : container of str
        Lower-case words to remove; each word of ``text`` is lower-cased for
        the comparison.

    Returns
    -------
    str
        The remaining words (original casing) joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove noise words
    return " ".join(word for word in text.split() if word.lower() not in noise_words)

In [None]:
# Rebinds noise_words1 to the same twelve words the earlier cell already assigned,
# then strips those words from the lemmatized combined text.
noise_words1 = {'word','example','well','output','code','text','string','sentence','model','work','result','see'}
df_post_answer['combination_text_all_lemma_no_noise1'] = df_post_answer['combination_text_all_lemma'].apply(lambda x: remove_noise_word(x, noise_words1))

In [None]:
# Word cloud after the first noise-word pass; written to "No Noise Word 1.png".
wc_generating(df_post_answer['combination_text_all_lemma_no_noise1'],"No Noise Word 1")

In [24]:
# NOTE: this rebinds noise_words2 to a SHORTER set than the one assigned in the
# earlier definition cell (no 'm', 'etc', 'two', 'try', 'way', 'use', 'want',
# 'different', 'seem'); every later use of noise_words2 sees this version.
# NOTE(review): the saved KeyError below presumably comes from running this cell
# before the cell that creates 'combination_text_all_lemma_no_noise1' — confirm.
noise_words2 = {'word','example','well','output','code','text','string','sentence','model','work','result','see','one','give','need','list','find','know','look','follow','file',}
df_post_answer['combination_text_all_lemma_no_noise2'] = df_post_answer['combination_text_all_lemma_no_noise1'].apply(lambda x: remove_noise_word(x, noise_words2))

KeyError: 'combination_text_all_lemma_no_noise1'

In [None]:
# Word cloud after the second noise-word pass; written to "No Noise Word 2.png".
wc_generating(df_post_answer['combination_text_all_lemma_no_noise2'],"No Noise Word 2")

#### Can do more text cleaning later

## Strategy:

- Remove intensifier words (from Assignment 1) and noise words (use the word clouds to spot them, then define lists)
- Clustering them by embedding first then DBSCAN
- Then could try regex pattern

In [14]:
from sentence_transformers import SentenceTransformer
# 1. Load a pretrained Sentence Transformer model; cluster_embedding below calls
# its .encode() on a list of texts.
model_embedding = SentenceTransformer("all-MiniLM-L6-v2")

  np.bool8: (False, True),
  np.bool8: (False, True),


In [15]:
from sklearn.cluster import KMeans,DBSCAN
def cluster_embedding(dframe, column, model_embedding, cluster_type="kmean",
                      n_clusters=10, eps=0.5, min_samples=5, random_state=42):
    """Embed a text column and write cluster labels back into ``dframe`` in place.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame holding the texts; it is modified in place.
    column : str
        Name of the text column to embed.
    model_embedding : object with ``encode(list_of_str)``
        Embedding model (e.g. a SentenceTransformer).
    cluster_type : str, default "kmean"
        ``"kmean"`` runs K-Means only, ``"dbscan"`` runs DBSCAN only, any other
        value runs both.
    n_clusters : int, default 10
        Number of K-Means clusters.
    eps, min_samples : default 0.5 and 5
        DBSCAN neighbourhood parameters (euclidean metric).
    random_state : int, default 42
        Seed passed to K-Means.

    Returns
    -------
    None
        K-Means labels go to ``dframe['cluster_kmean']`` and DBSCAN labels to
        ``dframe['cluster_dbscan']``. Previously the DBSCAN branch referenced
        an undefined variable and the labels were never stored.
    """
    embeddings = model_embedding.encode(dframe[column].tolist())

    # Only build and fit the estimators that were actually requested.
    if cluster_type != "dbscan":
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        dframe['cluster_kmean'] = kmeans.fit_predict(embeddings)
    if cluster_type != "kmean":
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
        dframe['cluster_dbscan'] = dbscan.fit_predict(embeddings)
    
        

In [None]:
# Cluster the lemmatized combined text with the default cluster_type ("kmean");
# the labels are written into df_post_answer['cluster_kmean'].
cluster_embedding(df_post_answer,'combination_text_all_lemma',model_embedding)

In [None]:
# Show each text next to its K-Means label.
# NOTE(review): the saved output header reads 'combination_text_clean', not
# 'combination_text_all_lemma' — presumably stale; re-run to refresh.
df_post_answer[['combination_text_all_lemma','cluster_kmean']]

Unnamed: 0,combination_text_clean,cluster_kmean
0,presidio spacy nlp engine recognize organizati...,5
1,gpt2 model huggingface 100 label index trainin...,6
2,trouble getting import gensim work colab try i...,5
3,store image instead show server run code find ...,5
4,presidio langchain experimental detect polish ...,9
...,...,...
8505,algorithm tell semantic similarity two phrase ...,3
8506,implement relate degree measure algorithm go a...,3
8507,implement mean possible duplicate google mean ...,9
8508,vista speech recognition multiple language pri...,9


### Define Tags:

Plan to get tags?

- Keep filtering the Word Cloud and then manually create 10 topics?

Two-step approach from the literature: first identify the words that best represent a given document,
then encode them into vectors.
- Keywords are extracted with a BERT-based keyword extraction algorithm,
KeyBERT [20], and matched against a pre-defined dictionary of
topic keywords developed by the team.
- The semantic text similarity is computed after encoding the
keywords with Sentence-BERT.

The first
stream of methods include solutions such as a set of logical rules that map words to topics
or comparison with a user defined taxonomy or ontology

The long-standing Latent Semantic Analysis (LSA) [5] and Latent Dirichlet Allocation (LDA) [3] models are well
suited to perform information reduction and exploratory analysis tasks
    However, this unsupervised approach has a few drawbacks: topic models might be unstable when not optimized [1] and their outputs might often be difficult to understand [4], as each topic corresponds with a combination of words which need to be interpreted by the user
    Another downside is the fact that the researcher needs to make assumptions on the number of topics to be retained from a certain collection

    Some scholars tried to mitigate all these issues by developing semi-supervised models that include ”anchor words” [7], or using partial labeling strategies 

KeyBERT

Create a keyword dictionary; I can start from the word clouds and build it up one by one.

In [None]:
from keybert import KeyBERT
# KeyBERT instance with its default settings; extract_keyword below uses this global.
kw_model = KeyBERT()

In [25]:
def extract_keyword(text):
    """Return the KeyBERT keywords of ``text`` as one comma-separated string.

    Calls the module-level ``kw_model`` with 1- to 2-word keyphrases, max-sum
    selection over 10 candidates and the top 5 results, then keeps only the
    keyword part of each ``(keyword, score)`` pair.
    """
    scored_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        use_maxsum=True,
        nr_candidates=10,
        top_n=5,
    )
    # Scores are discarded; only the keyword strings are joined.
    return ",".join(keyword for keyword, _score in scored_keywords)

In [None]:
# Extract KeyBERT keywords per post from the text left after the second noise-word pass.
df_post_answer['keywords_fromBert'] = df_post_answer['combination_text_all_lemma_no_noise2'].apply(extract_keyword)

In [36]:
# Save the keywords and K-Means labels to keywords_fromBert.csv (index not written).
df_post_answer[['keywords_fromBert','cluster_kmean']].to_csv("keywords_fromBert.csv", index=False)

In [None]:
# Keyword/cluster pairs for pattern mining; rows with an empty keyword string
# or a missing value in either column are discarded.
for_category = (
    df_post_answer[['keywords_fromBert', 'cluster_kmean']]
    .replace('', np.nan)    # Replace empty strings with NaN
    .dropna(axis=0)        # Now drop rows that have NaN
)

In [None]:
# Split each post's comma-separated keyword string into a list (one transaction per post).
for_category['keywords_list'] = for_category['keywords_fromBert'].str.split(',')
cluster_group = for_category['cluster_kmean'].unique()
cluster_group.sort()

pattern_dfs = []

for cluster in cluster_group:
    cluster_data = for_category.loc[for_category['cluster_kmean']==cluster,'keywords_list']
    # Frequent keyword sets are mined inline here: the get_frequency helper is only
    # defined in a later cell, so calling it from this cell raised NameError on a
    # fresh top-to-bottom run. min_support matches that helper's default (0.005).
    encoder = TransactionEncoder()
    onehot = pd.DataFrame(encoder.fit(cluster_data).transform(cluster_data), columns=encoder.columns_)
    pattern = apriori(onehot, min_support=0.005, use_colnames=True)
    # Keep itemsets with more than one keyword and tag them with their cluster;
    # .assign() returns a new frame instead of writing into a .loc slice.
    pattern = pattern.loc[pattern['itemsets'].apply(lambda x: len(x) > 1)].assign(cluster=cluster)
    pattern_dfs.append(pattern)

# Concatenate all clusters into one dataframe and rename columns as required:
# score, pattern, and cluster
frequency_df = (
    pd.concat(pattern_dfs, ignore_index=True)
    .rename(columns={'support': 'score', 'itemsets': 'pattern'})
)
    

In [31]:
def get_frequency(data, min_support=0.005):
    """Mine frequent itemsets from ``data`` with Apriori.

    Parameters
    ----------
    data : iterable of lists
        Transactions; each one is a list of items.
    min_support : float, default 0.005
        Minimum support passed to ``apriori``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``apriori`` (called with ``use_colnames=True``).
    """
    encoder = TransactionEncoder()
    encoder.fit(data)
    # One-hot table: one row per transaction, one column per distinct item.
    onehot = pd.DataFrame(encoder.transform(data), columns=encoder.columns_)
    return apriori(onehot, min_support=min_support, use_colnames=True)
def category_frequency(data, keyword_col, cluster_col, min_support=0.005):
    """Mine multi-keyword frequent patterns separately for every cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the keyword and cluster columns.
    keyword_col : str
        Column of comma-separated keyword strings.
    cluster_col : str
        Column of cluster labels. It is now used for the per-cluster filter as
        well; the filter used to be hard-coded to ``'cluster_kmean'``.
    min_support : float, default 0.005
        Minimum support forwarded to ``get_frequency``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score`` (support), ``pattern`` (itemset) and ``cluster``.
        Empty, with those columns, when no row survives the filtering.
    """
    for_category = (
        data[[keyword_col, cluster_col]]
        .replace('', np.nan)    # Replace empty strings with NaN
        .dropna(axis=0)         # Now drop rows that have NaN
    )
    for_category = for_category.assign(
        keywords_list=for_category[keyword_col].str.split(',')
    )

    pattern_dfs = []
    for cluster in sorted(for_category[cluster_col].unique()):
        in_cluster = for_category[cluster_col] == cluster
        pattern = get_frequency(for_category.loc[in_cluster, 'keywords_list'], min_support)
        # Keep itemsets with more than one keyword and tag them with the cluster;
        # .assign() avoids writing into a filtered slice.
        multi_item = pattern['itemsets'].apply(lambda x: len(x) > 1)
        pattern_dfs.append(pattern.loc[multi_item].assign(cluster=cluster))

    # pd.concat raises on an empty list, so return an empty result explicitly.
    if not pattern_dfs:
        return pd.DataFrame(columns=['score', 'pattern', 'cluster'])

    # Concatenate all clusters and rename columns as required: score, pattern, cluster
    return (
        pd.concat(pattern_dfs, ignore_index=True)
        .rename(columns={'support': 'score', 'itemsets': 'pattern'})
    )



In [49]:
# Display the per-cluster frequent patterns.
# NOTE(review): the saved rows look like single-keyword itemsets, which the
# len(x) > 1 filter in the building cell would drop — the output is presumably
# stale; confirm by re-running.
frequency_df

Unnamed: 0,score,pattern,cluster
0,0.013143,(pattern match),0
1,0.020811,(regular expression),0
2,0.013143,(spacy),0
3,0.010953,(split token),0
4,0.012048,(tokenize),0
...,...,...,...
94,0.033133,(natural language),9
95,0.022590,(nlp),9
96,0.014307,(sentiment analysis),9
97,0.015060,(use nltk),9


### Try BERTopic and compare

In [102]:
from bertopic import BERTopic

In [None]:
# Initialize and fit the BERTopic model (default settings) on one document per post.
# NOTE(review): no visible cell creates 'Title_Clean_No_Noise2'; presumably it came
# from a cell that was later removed or renamed — confirm before a fresh
# Restart & Run All, otherwise this line raises KeyError.
docs = df_post_answer['Title_Clean_No_Noise2'].tolist()
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [112]:
# Get a summary of topics from the fitted model and display it.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4872,-1_find_nltk_language_file,"[find, nltk, language, file, sentences, error,...","[identify strings two different lists, compare..."
1,0,411,0_pandas_dataframe_column_frame,"[pandas, dataframe, column, frame, columns, ro...","[implement function pandas dataframe column, p..."
2,1,344,1_bert_bertmodel_embeddings_fine,"[bert, bertmodel, embeddings, fine, finetuning...","[language training bert, bert classification, ..."
3,2,285,2_sentiment_analysis_reviews_aspect,"[sentiment, analysis, reviews, aspect, negativ...",[sentiment analysis object attribute sentiment...
4,3,262,3_word2vec_text2vec_vectors_pretrained,"[word2vec, text2vec, vectors, pretrained, vect...","[using word2vec embedding sentences, using wor..."
...,...,...,...,...,...
270,269,10,269_ner_rulers_identity_types,"[ner, rulers, identity, types, entities, overa...",[large difference overall f score custom spacy...
271,270,10,270_layoutlm_javamodel_falcon7b40b_simpletrans...,"[layoutlm, javamodel, falcon7b40b, simpletrans...","[prepare custom training data layoutlm, input ..."
272,271,10,271_java_classifiers_svmhmm_program,"[java, classifiers, svmhmm, program, virtual, ...","[classification java, using multiple classifie..."
273,272,10,272_cuda_colab_usecuda_torchoutofmemoryerror,"[cuda, colab, usecuda, torchoutofmemoryerror, ...","[running process error saying cuda memory, err..."


Source:
- https://medium.com/@davidlfliang/intro-getting-started-with-text-embeddings-using-bert-9f8c3b98dee6
- https://www.sciencedirect.com/science/article/pii/S1877050922008766

Differences between the two algorithms:

DBSCAN is a density-based clustering algorithm, whereas K-Means is a centroid-based clustering algorithm.
DBSCAN can discover clusters of arbitrary shapes, whereas K-Means assumes that the clusters are spherical.
DBSCAN does not require the number of clusters to be specified in advance, whereas K-Means requires the number of clusters to be specified.
DBSCAN is less sensitive to initialization than K-Means.
When to use DBSCAN vs. K-Means?

Use DBSCAN when the data has irregular shapes or when there is no prior knowledge about the number of clusters.
Use K-Means when the data has spherical shapes and when the number of clusters is known beforehand.
If you are unsure which algorithm to use, it is always a good idea to try both algorithms and compare their results.

## Classify using only the question Title and Body

In [None]:
# Build a question-only text column: cleaned title followed by cleaned body
# (the accepted answer is left out).
df_post_answer['combination_text_only_question'] = df_post_answer['Title_Clean'] + " " + df_post_answer['Body_Clean']

# Remove stop words (remove_stopwords also lower-cases the text)
df_post_answer['combination_text_only_question_no_stopw'] = df_post_answer['combination_text_only_question'].apply(remove_stopwords)

# Lemmatize; this rebinds the global lemma_text used earlier for the combined text.
text_for_lemma = df_post_answer['combination_text_only_question_no_stopw'].tolist()
lemma_text = lemma_texts_parallel(text_for_lemma)
df_post_answer['combination_text_only_question_lemma'] = lemma_text

In [17]:
# Compare the lemmatized question text with the text it was derived from.
df_post_answer[['combination_text_only_question_lemma','combination_text_only_question']]

Unnamed: 0,combination_text_only_question_lemma,combination_text_only_question
0,presidio spacy nlp engine recognize organizati...,Why does Presidio with spacy nlp engine not re...
1,gpt2 model huggingface 100 label index trainin...,GPT2 and other models from huggingface 100 lab...
2,trouble getting import gensim work colab try i...,Trouble getting importing gensim to work in co...
3,store image instead show server run code find ...,Store images instead of showing in a server I ...
4,presidio langchain experimental detect polish ...,Presidio with Langchain Experimental does not ...
...,...,...
8505,algorithm tell semantic similarity two phrase ...,Is there an algorithm that tells the semantic ...
8506,implement relate degree measure algorithm go a...,How to implement a related degree measure algo...
8507,implement mean possible duplicate google mean ...,How do you implement a Did you mean Possible D...
8508,vista speech recognition multiple language pri...,Vista speech recognition in multiple languages...


In [30]:
# Word cloud of the lemmatized question text (title + body) before noise-word filtering.
wc_generating(df_post_answer['combination_text_only_question_lemma'],"First Attempt Combination Question Only")

# Run every lemmatized question through remove_noise_word with the noise_words2 list
# (presumably dropping the listed words), keep the result in its own column,
# then draw a second word cloud from the filtered text for comparison.
df_post_answer['combination_text_only_question_lemma_no_noise'] = df_post_answer['combination_text_only_question_lemma'].apply(lambda x: remove_noise_word(x, noise_words2))
# The label string used to start with "Tile", a typo for "Title".
wc_generating(df_post_answer['combination_text_only_question_lemma_no_noise'],"Title and Body Title No Noise Word")


In [31]:
# Print the column names, non-null counts and dtypes of the working frame.
df_post_answer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8510 entries, 0 to 8509
Data columns (total 23 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   index                                          8510 non-null   int64  
 1   QuestionId                                     8510 non-null   int64  
 2   Title                                          8510 non-null   object 
 3   Body                                           8510 non-null   object 
 4   CreationDate                                   8510 non-null   object 
 5   Score                                          8510 non-null   int64  
 6   ViewCount                                      8510 non-null   int64  
 7   AnswerCount                                    8510 non-null   int64  
 8   AcceptedAnswerId                               8510 non-null   float64
 9   AcceptedAnswerBody                             8510 

In [33]:
# Question-only working frame: df_post_answer minus the old index column and the
# two intermediate combined-text columns. drop() returns a new frame, so
# df_post_answer itself is not modified here.
df_post_questions = df_post_answer.drop(
    columns=['index', 'combination_text_no_stopw', 'combination_text_clean']
)

In [37]:
# Display the question-only frame after the intermediate columns were dropped.
df_post_questions

Unnamed: 0,QuestionId,Title,Body,CreationDate,Score,ViewCount,AnswerCount,AcceptedAnswerId,AcceptedAnswerBody,AcceptedAnswerScore,Question_Code,Answer_Code,Title_Clean,Body_Clean,AcceptedAnswerBody_Clean,combination_text,combination_text_only_question,combination_text_only_question_no_stopw,combination_text_only_question_lemma,combination_text_only_question_lemma_no_noise
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,2025-04-02 05:56:11,0,68,1,79552218.0,<p>The configuration file is missing the 'labe...,1.0,"import spacy\n\nnlp = spacy.load(""pl_core_news...",labels_to_ignore:\n - O\n---\nnlp_engine_na...,Why does Presidio with spacy nlp engine not re...,Im using spaCy with the pl_core_news_lg model ...,The configuration file is missing the labels_t...,Why does Presidio with spacy nlp engine not re...,Why does Presidio with spacy nlp engine not re...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,2025-04-01 09:21:17,0,46,1,79551169.0,<p>The author of the tutorial you mentioned se...,1.0,,-100\n---\nignore_index\n---\nignore_index\n--...,GPT2 and other models from huggingface 100 lab...,I understand the 100 label id is used so that ...,The author of the tutorial you mentioned sets ...,GPT2 and other models from huggingface 100 lab...,GPT2 and other models from huggingface 100 lab...,gpt2 models huggingface 100 label index traini...,gpt2 model huggingface 100 label index trainin...,gpt2 huggingface 100 label index training inst...
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,2025-03-20 14:36:02,0,125,1,79523777.0,<p>You have to restart the session for the und...,1.0,!pip install gensim\n---\n/usr/local/lib/pytho...,numpy\n---\nnumpy\n---\nscipy,Trouble getting importing gensim to work in colab,I am trying to import gensim into colab I get ...,You have to restart the session for the underl...,Trouble getting importing gensim to work in co...,Trouble getting importing gensim to work in co...,trouble getting importing gensim work colab tr...,trouble getting import gensim work colab try i...,trouble getting import gensim colab import gen...
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,2025-03-11 14:50:31,0,36,1,79501337.0,<p>I can't test it but ...</p>\n<p>I checked <...,1.0,server\n---\nSSH\n---\nskip_tokens = [1] # sk...,"matplotlib\n---\nshow=True\n---\nfig, ax\n---\...",Store images instead of showing in a server,I am running the code found on this site in my...,I cant test it but I checked source code and i...,Store images instead of showing in a server I ...,Store images instead of showing in a server I ...,store images instead showing server running co...,store image instead show server run code find ...,store image instead show server run site serve...
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,2025-03-03 22:27:07,4,230,2,79495969.0,<p>After some test I was able to find the solu...,-2.0,from presidio_anonymizer import PresidioAnonym...,"config = {\n ""nlp_engine_name"": ""spacy"",\n ...",Presidio with Langchain Experimental does not ...,I am using presidio/langchain_experimental to ...,After some test I was able to find the solutio...,Presidio with Langchain Experimental does not ...,Presidio with Langchain Experimental does not ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8505,62328,Is there an algorithm that tells the semantic ...,"<p>input: phrase 1, phrase 2</p>\n\n<p>output:...",2008-09-15 12:26:42,65,49889,11,63076.0,<hr>\n\n<p>You might want to check out this pa...,44.0,,,Is there an algorithm that tells the semantic ...,input phrase 1 phrase 2 output semantic simila...,You might want to check out this paper Sentenc...,Is there an algorithm that tells the semantic ...,Is there an algorithm that tells the semantic ...,algorithm tells semantic similarity two phrase...,algorithm tell semantic similarity two phrase ...,algorithm tell semantic similarity phrase inpu...
8506,42489,"How to implement a ""related"" degree measure al...",<p>I was going to Ask a Question earlier today...,2008-09-03 20:21:04,8,456,2,42532.0,<p>One such way to implement such an algorithm...,5.0,,,How to implement a related degree measure algo...,I was going to Ask a Question earlier today wh...,One such way to implement such an algorithm wo...,How to implement a related degree measure algo...,How to implement a related degree measure algo...,implement related degree measure algorithm goi...,implement relate degree measure algorithm go a...,implement relate degree measure algorithm go a...
8507,41424,"How do you implement a ""Did you mean""?",<blockquote>\n <p><strong>Possible Duplicate:...,2008-09-03 10:36:13,118,33200,11,41448.0,<p>Actually what Google does is very much non-...,87.0,<spell_checked_word>,,How do you implement a Did you mean,Possible Duplicate How does the Google Did you...,Actually what Google does is much nontrivial a...,How do you implement a Did you mean Possible D...,How do you implement a Did you mean Possible D...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...
8508,36533,Vista speech recognition in multiple languages,"<p>my primary language is spanish, but I use a...",2008-08-31 01:08:48,3,5661,6,36684.0,"<p>Citation from Vista <a href=""http://blogs.m...",8.0,,,Vista speech recognition in multiple languages,my primary language is spanish but I use all m...,Citation from Vista speech recognition blog In...,Vista speech recognition in multiple languages...,Vista speech recognition in multiple languages...,vista speech recognition multiple languages pr...,vista speech recognition multiple language pri...,vista speech recognition multiple language pri...


In [38]:
# Cluster the questions using the noise-free lemmatized title + body text.
# The return value is not captured; judging by the frame displayed afterwards,
# the helper adds a `cluster_kmean` column to the frame it is given.
cluster_embedding(
    df_post_questions,
    'combination_text_only_question_lemma_no_noise',
    model_embedding,
)

In [39]:
# Display the frame again to inspect the result of the clustering call.
df_post_questions

Unnamed: 0,QuestionId,Title,Body,CreationDate,Score,ViewCount,AnswerCount,AcceptedAnswerId,AcceptedAnswerBody,AcceptedAnswerScore,...,Answer_Code,Title_Clean,Body_Clean,AcceptedAnswerBody_Clean,combination_text,combination_text_only_question,combination_text_only_question_no_stopw,combination_text_only_question_lemma,combination_text_only_question_lemma_no_noise,cluster_kmean
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,2025-04-02 05:56:11,0,68,1,79552218.0,<p>The configuration file is missing the 'labe...,1.0,...,labels_to_ignore:\n - O\n---\nnlp_engine_na...,Why does Presidio with spacy nlp engine not re...,Im using spaCy with the pl_core_news_lg model ...,The configuration file is missing the labels_t...,Why does Presidio with spacy nlp engine not re...,Why does Presidio with spacy nlp engine not re...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...,9
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,2025-04-01 09:21:17,0,46,1,79551169.0,<p>The author of the tutorial you mentioned se...,1.0,...,-100\n---\nignore_index\n---\nignore_index\n--...,GPT2 and other models from huggingface 100 lab...,I understand the 100 label id is used so that ...,The author of the tutorial you mentioned sets ...,GPT2 and other models from huggingface 100 lab...,GPT2 and other models from huggingface 100 lab...,gpt2 models huggingface 100 label index traini...,gpt2 model huggingface 100 label index trainin...,gpt2 huggingface 100 label index training inst...,2
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,2025-03-20 14:36:02,0,125,1,79523777.0,<p>You have to restart the session for the und...,1.0,...,numpy\n---\nnumpy\n---\nscipy,Trouble getting importing gensim to work in colab,I am trying to import gensim into colab I get ...,You have to restart the session for the underl...,Trouble getting importing gensim to work in co...,Trouble getting importing gensim to work in co...,trouble getting importing gensim work colab tr...,trouble getting import gensim work colab try i...,trouble getting import gensim colab import gen...,4
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,2025-03-11 14:50:31,0,36,1,79501337.0,<p>I can't test it but ...</p>\n<p>I checked <...,1.0,...,"matplotlib\n---\nshow=True\n---\nfig, ax\n---\...",Store images instead of showing in a server,I am running the code found on this site in my...,I cant test it but I checked source code and i...,Store images instead of showing in a server I ...,Store images instead of showing in a server I ...,store images instead showing server running co...,store image instead show server run code find ...,store image instead show server run site serve...,4
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,2025-03-03 22:27:07,4,230,2,79495969.0,<p>After some test I was able to find the solu...,-2.0,...,"config = {\n ""nlp_engine_name"": ""spacy"",\n ...",Presidio with Langchain Experimental does not ...,I am using presidio/langchain_experimental to ...,After some test I was able to find the solutio...,Presidio with Langchain Experimental does not ...,Presidio with Langchain Experimental does not ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8505,62328,Is there an algorithm that tells the semantic ...,"<p>input: phrase 1, phrase 2</p>\n\n<p>output:...",2008-09-15 12:26:42,65,49889,11,63076.0,<hr>\n\n<p>You might want to check out this pa...,44.0,...,,Is there an algorithm that tells the semantic ...,input phrase 1 phrase 2 output semantic simila...,You might want to check out this paper Sentenc...,Is there an algorithm that tells the semantic ...,Is there an algorithm that tells the semantic ...,algorithm tells semantic similarity two phrase...,algorithm tell semantic similarity two phrase ...,algorithm tell semantic similarity phrase inpu...,0
8506,42489,"How to implement a ""related"" degree measure al...",<p>I was going to Ask a Question earlier today...,2008-09-03 20:21:04,8,456,2,42532.0,<p>One such way to implement such an algorithm...,5.0,...,,How to implement a related degree measure algo...,I was going to Ask a Question earlier today wh...,One such way to implement such an algorithm wo...,How to implement a related degree measure algo...,How to implement a related degree measure algo...,implement related degree measure algorithm goi...,implement relate degree measure algorithm go a...,implement relate degree measure algorithm go a...,1
8507,41424,"How do you implement a ""Did you mean""?",<blockquote>\n <p><strong>Possible Duplicate:...,2008-09-03 10:36:13,118,33200,11,41448.0,<p>Actually what Google does is very much non-...,87.0,...,,How do you implement a Did you mean,Possible Duplicate How does the Google Did you...,Actually what Google does is much nontrivial a...,How do you implement a Did you mean Possible D...,How do you implement a Did you mean Possible D...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...,8
8508,36533,Vista speech recognition in multiple languages,"<p>my primary language is spanish, but I use a...",2008-08-31 01:08:48,3,5661,6,36684.0,"<p>Citation from Vista <a href=""http://blogs.m...",8.0,...,,Vista speech recognition in multiple languages,my primary language is spanish but I use all m...,Citation from Vista speech recognition blog In...,Vista speech recognition in multiple languages...,Vista speech recognition in multiple languages...,vista speech recognition multiple languages pr...,vista speech recognition multiple language pri...,vista speech recognition multiple language pri...,1


In [42]:
# Extract keywords for every question from its noise-free lemmatized text,
# one extract_keyword call per row, and store them in `keywords_fromBert`.
df_post_questions['keywords_fromBert'] = (
    df_post_questions['combination_text_only_question_lemma_no_noise']
    .apply(extract_keyword)
)


In [43]:
# Display the frame with the newly added `keywords_fromBert` column.
df_post_questions

Unnamed: 0,QuestionId,Title,Body,CreationDate,Score,ViewCount,AnswerCount,AcceptedAnswerId,AcceptedAnswerBody,AcceptedAnswerScore,...,Title_Clean,Body_Clean,AcceptedAnswerBody_Clean,combination_text,combination_text_only_question,combination_text_only_question_no_stopw,combination_text_only_question_lemma,combination_text_only_question_lemma_no_noise,cluster_kmean,keywords_fromBert
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,2025-04-02 05:56:11,0,68,1,79552218.0,<p>The configuration file is missing the 'labe...,1.0,...,Why does Presidio with spacy nlp engine not re...,Im using spaCy with the pl_core_news_lg model ...,The configuration file is missing the labels_t...,Why does Presidio with spacy nlp engine not re...,Why does Presidio with spacy nlp engine not re...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...,presidio spacy nlp engine recognize organizati...,9,"organization pesel,recognizer presidio,spacy p..."
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,2025-04-01 09:21:17,0,46,1,79551169.0,<p>The author of the tutorial you mentioned se...,1.0,...,GPT2 and other models from huggingface 100 lab...,I understand the 100 label id is used so that ...,The author of the tutorial you mentioned sets ...,GPT2 and other models from huggingface 100 lab...,GPT2 and other models from huggingface 100 lab...,gpt2 models huggingface 100 label index traini...,gpt2 model huggingface 100 label index trainin...,gpt2 huggingface 100 label index training inst...,2,"ignoreindex loss,label prediction,pad token,de..."
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,2025-03-20 14:36:02,0,125,1,79523777.0,<p>You have to restart the session for the und...,1.0,...,Trouble getting importing gensim to work in colab,I am trying to import gensim into colab I get ...,You have to restart the session for the underl...,Trouble getting importing gensim to work in co...,Trouble getting importing gensim to work in co...,trouble getting importing gensim work colab tr...,trouble getting import gensim work colab try i...,trouble getting import gensim colab import gen...,4,"getting import,numpy relate,gensim colab,colab..."
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,2025-03-11 14:50:31,0,36,1,79501337.0,<p>I can't test it but ...</p>\n<p>I checked <...,1.0,...,Store images instead of showing in a server,I am running the code found on this site in my...,I cant test it but I checked source code and i...,Store images instead of showing in a server I ...,Store images instead of showing in a server I ...,store images instead showing server running co...,store image instead show server run code find ...,store image instead show server run site serve...,4,"connection instance,image instead,store locall..."
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,2025-03-03 22:27:07,4,230,2,79495969.0,<p>After some test I was able to find the solu...,-2.0,...,Presidio with Langchain Experimental does not ...,I am using presidio/langchain_experimental to ...,After some test I was able to find the solutio...,Presidio with Langchain Experimental does not ...,Presidio with Langchain Experimental does not ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...,presidio langchain experimental detect polish ...,9,"presidio langchain,anonymize,presidio spacy,po..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8505,62328,Is there an algorithm that tells the semantic ...,"<p>input: phrase 1, phrase 2</p>\n\n<p>output:...",2008-09-15 12:26:42,65,49889,11,63076.0,<hr>\n\n<p>You might want to check out this pa...,44.0,...,Is there an algorithm that tells the semantic ...,input phrase 1 phrase 2 output semantic simila...,You might want to check out this paper Sentenc...,Is there an algorithm that tells the semantic ...,Is there an algorithm that tells the semantic ...,algorithm tells semantic similarity two phrase...,algorithm tell semantic similarity two phrase ...,algorithm tell semantic similarity phrase inpu...,0,"tell semantic,input phrase,similarity value,si..."
8506,42489,"How to implement a ""related"" degree measure al...",<p>I was going to Ask a Question earlier today...,2008-09-03 20:21:04,8,456,2,42532.0,<p>One such way to implement such an algorithm...,5.0,...,How to implement a related degree measure algo...,I was going to Ask a Question earlier today wh...,One such way to implement such an algorithm wo...,How to implement a related degree measure algo...,How to implement a related degree measure algo...,implement related degree measure algorithm goi...,implement relate degree measure algorithm go a...,implement relate degree measure algorithm go a...,1,"functionality,relate degree,implement stackove..."
8507,41424,"How do you implement a ""Did you mean""?",<blockquote>\n <p><strong>Possible Duplicate:...,2008-09-03 10:36:13,118,33200,11,41448.0,<p>Actually what Google does is very much non-...,87.0,...,How do you implement a Did you mean,Possible Duplicate How does the Google Did you...,Actually what Google does is much nontrivial a...,How do you implement a Did you mean Possible D...,How do you implement a Did you mean Possible D...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...,implement mean possible duplicate google mean ...,8,"mean possible,website implement,search website..."
8508,36533,Vista speech recognition in multiple languages,"<p>my primary language is spanish, but I use a...",2008-08-31 01:08:48,3,5661,6,36684.0,"<p>Citation from Vista <a href=""http://blogs.m...",8.0,...,Vista speech recognition in multiple languages,my primary language is spanish but I use all m...,Citation from Vista speech recognition blog In...,Vista speech recognition in multiple languages...,Vista speech recognition in multiple languages...,vista speech recognition multiple languages pr...,vista speech recognition multiple language pri...,vista speech recognition multiple language pri...,1,"software english,multiple language,speech reco..."


In [32]:
# Frequency analysis over the extracted keywords, grouped by k-means cluster.
# This cell needs `df_post_questions` (with `keywords_fromBert` and
# `cluster_kmean`) to exist in the kernel; the recorded NameError comes from
# running it in a session where that frame had not been created yet.
frequency_questions = category_frequency(
    df_post_questions,
    keyword_col="keywords_fromBert",
    cluster_col="cluster_kmean",
)

NameError: name 'df_post_questions' is not defined

In [47]:
# Display the result of category_frequency for the question text (title + body).
frequency_questions

Unnamed: 0,score,pattern,cluster
0,0.012658,"(panda, panda dataframe)",6


## Only using Title to classify

In [None]:
#Remove stop word
df_post_answer['Title_Clean'] = df_post_answer['Title_Clean'].apply(remove_stopwords)
#Lemma Text
title_text_for_lemma = df_post_answer['Title_Clean'].tolist()
title_lemma = lemma_texts_parallel(title_text_for_lemma)
df_post_answer['Title_Lemma'] = title_lemma

In [19]:
# Cluster the posts using the title text only.
# NOTE(review): this uses the stopword-free `Title_Clean` column rather than
# the `Title_Lemma` column built in the previous cell - confirm this is intended.
cluster_embedding(
    df_post_answer,
    'Title_Clean',
    model_embedding,
)

In [26]:
# Extract keywords from each title, one extract_keyword call per row.
# NOTE(review): reads `Title_Clean`, not `Title_Lemma` - confirm this is intended.
df_post_answer['keywords_fromBert'] = (
    df_post_answer['Title_Clean'].apply(extract_keyword)
)

In [33]:
# Frequency analysis over the title-only keywords, grouped by k-means cluster.
# Expects `keywords_fromBert` and `cluster_kmean` to be present on df_post_answer.
frequency_title = category_frequency(
    df_post_answer,
    keyword_col="keywords_fromBert",
    cluster_col="cluster_kmean",
)

In [34]:
# Display the result of category_frequency for the title-only text.
frequency_title

Unnamed: 0,score,pattern,cluster
0,0.005063,"(strings, based)",0
1,0.005063,"(remove, characters)",0
2,0.005063,"(words, characters)",0
3,0.005063,"(strings, convert)",0
4,0.010127,"(regular, expression)",0
...,...,...,...
410,0.005172,"(language, natural language, processing)",9
411,0.006034,"(language processing, natural language, natural)",9
412,0.010345,"(language processing, natural, processing)",9
413,0.010345,"(natural, natural language, processing)",9
