In [None]:
!pip install matplotlib
from bs4 import BeautifulSoup
import re
import string
from pathlib import Path
import nltk
!pip install optuna
import optuna
from nltk.corpus import stopwords
nltk.download('wordnet')

from gensim.models import CoherenceModel
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import PorterStemmer

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2018)


!pip install pandas==1.5.3
import pandas as pd
import csv
import logging
import spacy
import os

!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize documents with gensim's ``simple_preprocess``.

    Each item of ``sentences`` is converted with ``str()`` and tokenized with
    ``deacc=True``; one token list is yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )
        
        


def lemmatize_stemming(text):
    """Lemmatize one token as a verb (WordNet), then Porter-stem the lemma."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    stemmer = PorterStemmer()
    return stemmer.stem(verb_lemma)




# Project-specific filler words removed in addition to NLTK's English list.
_EXTRA_TEXT_STOP_WORDS = frozenset(
    ['question', 'answer', 'etc', 'error', 'tri', 'use', 'using', 'trying', 'try']
)  # Add more words as needed

# Filled on first use so the NLTK corpus is read once instead of once per document.
_TEXT_STOP_WORDS_CACHE = {}


def _text_stop_words():
    """Return the combined (NLTK English + extra) stop-word set, built on first use."""
    if 'all' not in _TEXT_STOP_WORDS_CACHE:
        _TEXT_STOP_WORDS_CACHE['all'] = (
            frozenset(stopwords.words('english')) | _EXTRA_TEXT_STOP_WORDS
        )
    return _TEXT_STOP_WORDS_CACHE['all']


def preprocess_text(text):
    """Strip markup and noise from one raw text field and drop stop words.

    Steps, in order: remove ``<code>...</code>`` spans, strip the remaining
    HTML with BeautifulSoup, remove ``http...`` URLs, digits and ASCII
    punctuation, then drop every word whose lower-cased form is a stop word.

    Args:
        text: Raw title/body text. ``None`` and float NaN (a missing cell when
            this is applied to a DataFrame column) yield ``''``; any other
            non-string value is converted with ``str()``.

    Returns:
        The remaining words joined by single spaces, original casing kept.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove code snippets enclosed in <code> tags
    text = re.sub(r'<code>(.*?)</code>', '', text, flags=re.DOTALL)

    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers and punctuation marks
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # A single pass over the union gives the same result as filtering by the
    # NLTK list first and by NLTK + extra words afterwards.
    stop_words = _text_stop_words()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)
    

def preprocess_lemmatize_stemming(text):
    """Filter a token sequence and normalize the surviving tokens.

    Tokens that are gensim stop words or have three characters or fewer are
    dropped; every remaining token is passed through ``lemmatize_stemming``.
    Returns the normalized tokens as a list, in input order.
    """
    return [
        lemmatize_stemming(word)
        for word in text
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3
    ]

In [None]:
from ast import literal_eval

# Load the posts CSV; the `title` and `body` columns are read below.
df= pd.read_csv("HF_SO_2019_2024.csv")

# Clean title and body separately with preprocess_text.
df['cleaned_title'] = df['title'].apply(preprocess_text)
df['cleaned_body'] = df['body'].apply(preprocess_text)


# One document per post: cleaned title, a space, then cleaned body.
df['corpus'] = df['cleaned_title'] + ' ' + df['cleaned_body']

################
# Tokenize every document into a list of tokens.
data_words = list(sent_to_words(df['corpus']))
# Fit a bigram phrase model on the tokenized documents (no trigram model is built).
bigram = gensim.models.Phrases(data_words, min_count=4, threshold=30) # higher threshold fewer phrases.
# Wrap the fitted model in a Phraser; make_bigrams applies this object.
bigram_mod = gensim.models.phrases.Phraser(bigram)
    # Define functions for bigrams
def make_bigrams(texts):
    """Apply the fitted ``bigram_mod`` phraser to every tokenized document in ``texts``."""
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

# Run the phraser over every tokenized document.
data_words_bigrams = make_bigrams(data_words)
# From here on `corpus` holds token lists instead of the joined text.
df['corpus']=data_words_bigrams
#data_words_bigrams_df = pd.DataFrame({'Tokenized_Document': data_words_bigrams})
#df_corpus.to_csv('corpus.csv', index=False)

# Drop gensim stop words / short tokens and lemmatize + stem the rest.
df['corpus'] = df['corpus'].apply(preprocess_lemmatize_stemming)

# Spelling variants that are collapsed into `target_word`.
words_to_transform = ["huggingFace", "huggingfac", "hugging_face","hugging_fac",'hugging face']
target_word = "huggingface"

def transform_tokens(tokens, words_to_transform, target_word):
    """Replace every token that matches a known variant with ``target_word``.

    Matching is case-insensitive on both sides: tokens and variants are
    lower-cased before comparison, so a mixed-case variant such as
    ``"huggingFace"`` still matches.

    Args:
        tokens: Iterable of string tokens.
        words_to_transform: Iterable of variant spellings to replace.
        target_word: Replacement written in place of each matching token.

    Returns:
        A new list of the same length with matches replaced; other tokens
        are returned unchanged.
    """
    # Lower-case the variants once; a set also makes each lookup O(1).
    variants = {variant.lower() for variant in words_to_transform}
    return [target_word if token.lower() in variants else token for token in tokens]

# Extra tokens filtered out of the token lists on top of NLTK's English stop words.
extra_stop_words = {
    'follow', 'exampl', 'question', 'answer', 'etc', 'error', 'tri', 'use',
    "using", "trying", "try", 'huggingface',
}  # Add more words as needed
stop_words = set(stopwords.words('english')) | extra_stop_words


def removeStopWords(tokens, words_to_remove):
    """Return, in order, the tokens that do not appear in ``words_to_remove``."""
    kept = []
    for candidate in tokens:
        if candidate in words_to_remove:
            continue
        kept.append(candidate)
    return kept

# Collapse spelling variants into the target word for every token list.
df['corpus'] = df['corpus'].apply(transform_tokens, args=(words_to_transform, target_word))

# Then drop the stop words from every token list.
df['corpus'] = df['corpus'].apply(removeStopWords, args=(stop_words,))

df['corpus'].head(50)

# The result is written back to the same path the data was loaded from.
df.to_csv('HF_SO_2019_2024.csv', index=False)

In [None]:
from ast import literal_eval
# Reload the saved posts; string cells in `corpus` are parsed back into Python objects.
df = pd.read_csv('HF_SO_2019_2024.csv')
df['corpus'] = df['corpus'].apply(
    lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
)

from wordcloud import WordCloud

# One long string: tokens of a post separated by spaces, posts separated by commas.
long_string = ','.join(' '.join(tokens) for tokens in df['corpus'])

wordcloud = WordCloud(
    background_color="white",
    max_words=10000000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)

# Last expression of the cell, so the image is rendered as the cell output.
wordcloud.to_image()

In [None]:
# Token lists, one per post.
texts = df['corpus'].tolist()
# gensim Dictionary built from all token lists.
id2word = gensim.corpora.Dictionary(texts)
# doc2bow representation of every post.
corpus = list(map(id2word.doc2bow, texts))
# Show the first document with ids resolved back to tokens.
print([[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]])

In [None]:
def compute_coherence(model, corpus, corpora_dict):
    """Return the ``c_v`` coherence score of ``model``.

    Note that ``corpus`` is forwarded as the ``texts`` argument of
    ``CoherenceModel``; its own ``corpus`` argument is explicitly ``None``.
    ``corpora_dict`` is passed as the dictionary.
    """
    settings = dict(
        model=model,
        texts=corpus,
        corpus=None,
        dictionary=corpora_dict,
        coherence="c_v",
    )
    return CoherenceModel(**settings).get_coherence()

In [None]:
from scipy.spatial import distance

def compute_intertopic_diversity_JSD(lda_model):
    """Average pairwise Jensen-Shannon divergence between a model's topics.

    Args:
        lda_model: Object whose ``get_topics()`` returns one distribution
            per topic (one row per topic).

    Returns:
        The mean, over all unordered topic pairs, of the squared
        Jensen-Shannon distance. ``0.0`` when the model has fewer than two
        topics, since there is no pair to compare.
    """
    topic_distributions = lda_model.get_topics()
    num_topics = len(topic_distributions)

    # No pairs exist below two topics; the average would divide by zero.
    if num_topics < 2:
        return 0.0

    total_divergence = 0.0
    for i in range(num_topics):
        for j in range(i + 1, num_topics):
            # jensenshannon returns the JS distance; squaring it gives the divergence.
            total_divergence += distance.jensenshannon(
                topic_distributions[i], topic_distributions[j]
            ) ** 2

    num_pairs = (num_topics * (num_topics - 1)) / 2
    return total_divergence / num_pairs

In [None]:
# Create the results file (and its directory) and write the header row only
# when the file is new or empty, so re-running this cell does not append a
# second header in the middle of previously written rows.
_results_csv = Path("final_result/model_results.csv")
_results_csv.parent.mkdir(parents=True, exist_ok=True)
if not _results_csv.exists() or _results_csv.stat().st_size == 0:
    # newline="" is what the csv module asks for on files it writes.
    with open(_results_csv, "a", newline="") as f:
        csvwriter = csv.DictWriter(
            f, fieldnames=["trial","coherence","diversity", "ntopics", "alpha", "eta","iterations","chunksize","passes"]
        )
        csvwriter.writeheader()
def write_model_results(model, trial, coherence_score, diversity_score):
    """Record one trial: append its scores to the results CSV and save the model.

    Args:
        model: Trained model; must provide ``save(path)`` and whatever
            ``get_and_save_top_words`` reads from it.
        trial: Optuna trial; ``trial.number`` and ``trial.params`` are read.
        coherence_score: Coherence value stored in the ``coherence`` column.
        diversity_score: Diversity value stored in the ``diversity`` column.

    Side effects:
        Appends one row to ``final_result/model_results.csv``, saves the model
        to ``models/trial_<n>/<n>_lda`` and writes the top words next to it as
        ``trial<n>_top_words.csv``.
    """
    params = trial.params
    trialnum = trial.number

    # Column order matches the header row of the results file ("trial", not "trail").
    fieldnames = [
        "trial", "coherence", "diversity", "ntopics", "alpha", "eta",
        "iterations", "chunksize", "passes",
    ]
    results_file = Path("final_result/model_results.csv")
    # The directory may not exist yet if this is called on a fresh checkout.
    results_file.parent.mkdir(parents=True, exist_ok=True)
    # newline="" is what the csv module asks for on files it writes.
    with open(results_file, "a", newline="") as f:
        csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
        csvwriter.writerow(
            {
                "trial": trialnum,
                "coherence": coherence_score,
                "diversity": diversity_score,
                "ntopics": params["num_topics"],
                "alpha": params["alpha"],
                "eta": params["eta"],
                "iterations": params["iterations"],
                "chunksize": params["chunksize"],
                "passes": params["passes"],
            }
        )

    model_path = Path(f"models/trial_{trialnum}")
    model_path.mkdir(parents=True, exist_ok=True)
    model.save(str(model_path / f"{trialnum}_lda"))
    top_words_filename = model_path / f"trial{trialnum}_top_words.csv"
    get_and_save_top_words(model, top_words_filename)

In [None]:
def get_and_save_top_words(model, out_file):
    """Write the top 50 words of every topic of ``model`` to ``out_file`` as CSV.

    The file has the columns ``topic``, ``word`` and ``p``, one row per
    (topic, word) pair as returned by ``model.show_topic(topic, topn=50)``.
    """
    rows = [
        (topic_id, word, prob)
        for topic_id in range(model.num_topics)
        for word, prob in model.show_topic(topic_id, topn=50)
    ]
    frame = pd.DataFrame(rows, columns=["topic", "word", "p"])
    frame.to_csv(out_file, index=False)

In [None]:
def objective(trial):
    """Optuna objective: train one LDA model and return its two scores.

    Samples the hyperparameters from ``trial``, trains an ``LdaMulticore``
    model on the module-level ``corpus`` / ``id2word``, scores it, records the
    trial via ``write_model_results`` and returns
    ``(coherence_score, diversity_score)``.
    """
    # Sampled in a fixed order: alpha, eta, num_topics, iterations, chunksize, passes.
    sampled = {
        "alpha": trial.suggest_float('alpha', 0.01, 1, step=0.05),
        "eta": trial.suggest_float('eta', 0.01, 1, step=0.05),
        "num_topics": trial.suggest_int("num_topics", 15, 20),
        "iterations": trial.suggest_int("iterations", 4000, 5000, step=10),
        "chunksize": trial.suggest_int("chunksize", 20, 100, step=10),
        "passes": trial.suggest_int("passes", 150, 300, step=10),
    }

    lda = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=300,
        per_word_topics=True,
        **sampled,
    )

    diversity = compute_intertopic_diversity_JSD(lda)
    coherence = compute_coherence(lda, texts, id2word)
    print(f"Trial {trial.number} coherence score: {round(coherence,3)} diversity score: {round(diversity,3)}")

    write_model_results(lda, trial, coherence, diversity)

    return coherence, diversity



In [None]:
import math
def knee_point(p):
    """Pick the trial closest to the ideal point of a Pareto front.

    The ideal point is (max coherence, max diversity) over the given trials;
    the trial with the smallest Euclidean distance to it is returned. The
    objective values are used as-is (no normalization), and the first trial
    wins a tie.

    Args:
        p: Iterable of trials; ``values[0]`` is read as the coherence score
            and ``values[1]`` as the diversity score.

    Returns:
        The selected trial, or ``None`` when ``p`` has no trial with both
        values present.
    """
    # Trials without both objective values cannot be placed on the front.
    scored = [
        trial for trial in p
        if trial.values is not None
        and all(value is not None for value in trial.values[:2])
    ]
    if not scored:
        return None

    best_coherence = max(trial.values[0] for trial in scored)
    best_diversity = max(trial.values[1] for trial in scored)

    closest_trial = None
    closest_gap = float('inf')
    for trial in scored:
        gap = math.hypot(
            best_coherence - trial.values[0],
            best_diversity - trial.values[1],
        )
        # Strict comparison keeps the earliest trial on ties.
        if gap < closest_gap:
            closest_gap = gap
            closest_trial = trial

    return closest_trial

In [None]:
# Two objectives (coherence, diversity), both maximized, searched with NSGA-II.
# This uses Optuna's unified multi-objective API: the separate
# `optuna.multi_objective` module was deprecated in Optuna 2.4 and removed in
# 4.0, so it is not available in current releases.
study = optuna.create_study(
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.NSGAIISampler(),
)

# optimize() returns None; `result` is kept only so the name still exists.
result = study.optimize(objective, n_trials=500)
# Trials on the Pareto front of the two objectives.
best_trials = study.best_trials

In [None]:
# Choose one trial from the Pareto front; despite the name, this is a trial
# object, and its number identifies the saved model under models/trial_<number>.
best_hyperparams = knee_point(best_trials)

print(best_hyperparams.number)

In [None]:
# Path of the model saved for the selected trial.
model_path = f"models/trial_{best_hyperparams.number}/{best_hyperparams.number}_lda"
loaded_model = gensim.models.LdaMulticore.load(model_path)

# (topic id, [(word, probability), ...]) pairs with ten words per topic.
lda_topics = loaded_model.show_topics(num_topics=-1, num_words=10, formatted=False)
topics = loaded_model.print_topics(num_words=10)

# Keep only the words of each topic.
lda_topics_processed = [
    [pair[0] for pair in weighted_words]
    for _topic_id, weighted_words in lda_topics
]

# Displaying the processed topics
for idx, topic_words in enumerate(lda_topics_processed):
    print(f"Topic {idx} - Words: {', '.join(topic_words)}")

In [None]:
# Recompute both scores for the model that was loaded from disk.
diversity_score = compute_intertopic_diversity_JSD(loaded_model)
coherence_score = compute_coherence(loaded_model, texts, id2word) 
print(diversity_score)
print(coherence_score)

In [None]:
# Read the hyperparameters back from the loaded model and print them.
num_topics = loaded_model.num_topics
alpha = loaded_model.alpha
eta = loaded_model.eta
passes = loaded_model.passes
iterations = loaded_model.iterations
chunksize = loaded_model.chunksize
# Misspelled name from the original cell, kept so any later use of it still works.
chunksiza = chunksize
print(f'Number of topics: {num_topics}')
print(f'Alpha: {alpha}')
print(f'Eta: {eta}')
print(f'Passes: {passes}')
print(f'Iterations: {iterations}')
print(f'Chunksize: {chunksize}')

In [None]:
# Alias read as a module-level global by assign_topics_to_posts.
dictionary = id2word
# Manually assigned label per topic id. Several topic ids share a label;
# ids missing here are reported as "Unknown" by assign_topics_to_posts.
topic_labels = {
    0: "Model customization",
    1: "LLMs usage and understanding",
    2: "Other",
    3: "Model training",
    4: "Other",
    5: "Other",
    6: "LLMs usage and understanding",
    7: "Model Deployment",
    8: "Other",
    9: "Environment",
    10: "Datasets",
    11: "Model Loading, Saving, pushing",
    12: "Distributed Computing and Resource Management",
    13: "LLMs usage and understanding",
    14: "LLMs usage and understanding",
}

from ast import literal_eval
# Reload the saved posts; string cells in `corpus` are parsed back into
# Python objects with literal_eval, anything else is left as-is.
df = pd.read_csv('HF_SO_2019_2024.csv')
df['corpus'] = df['corpus'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

def assign_topics_to_posts(lda_model, post, topic_labels):
    """Find the most probable topic of one post.

    ``post`` is converted with the module-level ``dictionary`` and scored with
    ``lda_model.get_document_topics``; the entry with the highest probability
    is selected.

    Returns:
        ``(topic_id, topic_prob, terms, topic_label)`` where ``terms`` are the
        words of ``lda_model.get_topic_terms(topic_id)`` looked up in
        ``dictionary`` and ``topic_label`` is ``topic_labels[topic_id]``, or
        ``"Unknown"`` when the id has no label.
    """
    bow = dictionary.doc2bow(post)
    topic_id, topic_prob = max(
        lda_model.get_document_topics(bow),
        key=lambda id_and_prob: id_and_prob[1],
    )

    # Resolve the topic's term ids back to words.
    terms = [
        dictionary.get(term_id)
        for term_id, _weight in lda_model.get_topic_terms(topic_id)
    ]

    return topic_id, topic_prob, terms, topic_labels.get(topic_id, "Unknown")

# Apply assign_topics_to_posts to every post; the four returned values are
# expanded into four new columns through pd.Series.
df[['assigned_topic', 'topic_probability', 'topic_word_distribution', 'topic_label']] = \
    df['corpus'].apply(lambda post: pd.Series(assign_topics_to_posts(loaded_model, post, topic_labels)))

# Save the frame, including the new topic columns, back to the same CSV path.
df.to_csv('HF_SO_2019_2024.csv', index=False)

In [None]:
def select_posts_for_each_topic(df, num_posts=50):
    """Select the highest-probability posts of every topic label.

    Args:
        df: DataFrame with the columns ``cleaned_title``, ``cleaned_body``,
            ``topic_label`` and ``topic_probability``.
        num_posts: Maximum number of posts kept per topic label.

    Returns:
        A new DataFrame with those four columns and a fresh integer index.
        Labels appear in order of first occurrence in ``df``; within a label,
        posts are sorted by descending ``topic_probability``. An empty
        DataFrame with the same columns is returned when ``df`` has no rows.
    """
    output_columns = ['cleaned_title', 'cleaned_body', 'topic_label', 'topic_probability']

    per_topic_frames = []
    for topic in df['topic_label'].unique():
        topic_posts = df[df['topic_label'] == topic]
        top_posts = topic_posts.sort_values(by='topic_probability', ascending=False).head(num_posts)
        per_topic_frames.append(top_posts[output_columns])

    # pd.concat raises on an empty list, so handle "no topics" explicitly.
    if not per_topic_frames:
        return pd.DataFrame(columns=output_columns)

    # Concatenate once after the loop instead of rebuilding the frame per topic.
    return pd.concat(per_topic_frames, ignore_index=True)

# Reload the saved posts; on_bad_lines='skip' is passed to read_csv.
df = pd.read_csv('HF_SO_2019_2024.csv',on_bad_lines='skip')

# Up to 50 highest-probability posts per topic label.
selected_posts = select_posts_for_each_topic(df, num_posts=50)

# Write the selection to its own CSV file.
selected_posts.to_csv('50_post_per_topic.csv', index=False)

print(selected_posts)