In [None]:
import pandas as pd
import re
import gensim
from gensim import corpora
from gensim.models import Phrases, TfidfModel, CoherenceModel, LdaMulticore, HdpModel
import pyLDAvis
import pyLDAvis.gensim_models
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
pyLDAvis.enable_notebook()

In [None]:
# Fetch the NLTK resources the helpers below rely on: the Punkt models behind
# word_tokenize, the perceptron tagger behind nltk.pos_tag, and the WordNet data
# behind WordNetLemmatizer. Without 'punkt', word_tokenize raises LookupError on a
# fresh environment.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Shared lemmatizer instance used by lemmatize_sentence
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'N' -> noun, 'V' -> verb, 'R' -> adverb. Any other leading character
    falls back to noun.
    """
    leading = treebank_tag[0]
    if leading == 'J':
        return wordnet.ADJ
    if leading == 'V':
        return wordnet.VERB
    if leading == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag both resolve to noun
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Clean a sentence and return its tokens lemmatized by part of speech.

    Every character other than ASCII letters, digits, underscores, apostrophes
    and spaces is deleted; the remainder is tokenized, POS-tagged, and each
    token is lemmatized with the WordNet POS derived from its tag.
    """
    # Apostrophes are kept so possessives stay intact; underscores are kept too
    cleaned = re.sub(r"[^a-zA-Z0-9_' ]", "", sentence)
    tagged = nltk.pos_tag(word_tokenize(cleaned))
    result = []
    for word, treebank_tag in tagged:
        result.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return result

def preprocess_keywords(data):
    """Turn the 'terms' column of ``data`` into lemmatized token lists.

    Each row is whitespace-split, stripped of gensim stopwords, passed through
    two rounds of ``Phrases`` (bigrams, then trigrams) and finally lemmatized
    with ``lemmatize_sentence``.

    Rows whose 'terms' value is missing (or is not a string) become empty token
    lists, so the result always holds exactly one list per input row.

    Args:
        data: DataFrame with a 'terms' column.

    Returns:
        list[list[str]]: lemmatized tokens, one list per row of ``data``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    split_rows = data['terms'].str.split().tolist()
    # .str.split() yields NaN (a float) for missing / non-string cells; iterating
    # over that would raise TypeError, so such rows are treated as empty.
    keywords = [
        [word for word in row if word not in stopwords] if isinstance(row, list) else []
        for row in split_rows
    ]
    bigram_transformer = Phrases(keywords, min_count=5, threshold=100)
    keywords_bigram = [bigram_transformer[keyword] for keyword in keywords]
    # Second pass over the bigram output lets pairs grow into trigrams
    trigram_transformer = Phrases(keywords_bigram, min_count=5, threshold=100)
    keywords_trigram = [trigram_transformer[keyword] for keyword in keywords_bigram]
    return [lemmatize_sentence(' '.join(keyword)) for keyword in keywords_trigram]

# Load the data (preprocess_keywords reads its 'terms' column)
data = pd.read_csv("corrected_data_symspell.csv")

# Preprocess the data to get lemmatized keywords: a list of token lists built from data['terms']
keywords_lemmatized = preprocess_keywords(data)

In [None]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(keywords_lemmatized)
# Prune the vocabulary: drop tokens seen in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of every document over the pruned vocabulary
corpus = [dictionary.doc2bow(term) for term in keywords_lemmatized]

# TF-IDF Weighting
tfidf_model = TfidfModel(corpus)
# Indexing the fitted model with the corpus gives its TF-IDF-weighted counterpart
corpus_tfidf = tfidf_model[corpus]

In [None]:
def train_and_save_lda(corpus, dictionary, num_topics, filename, keywords_lemmatized):
    """Train an LDA model, write its artifacts to disk, and return the pyLDAvis data.

    Files written (all prefixed with ``filename``): ``_vis.html`` (interactive
    visualization), ``_model`` (the gensim model) and ``_topics.csv`` (top 10
    words per topic). The model's c_v coherence is printed.

    Args:
        corpus: Corpus to train on (bag-of-words or TF-IDF weighted).
        dictionary: gensim Dictionary mapping token ids to tokens.
        num_topics: Number of topics to fit.
        filename: Path prefix for every output file.
        keywords_lemmatized: Tokenized documents used for the coherence score.

    Returns:
        The prepared pyLDAvis object, ready to be passed to ``pyLDAvis.display``.
    """
    # Use the requested topic count instead of a fixed value
    lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=100, workers=12)
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, filename + "_vis.html")
    lda_model.save(filename + "_model")
    # num_topics=-1 exports every topic; the default would stop at 20
    topics_lda = lda_model.print_topics(num_topics=-1, num_words=10)
    df_topics_lda = pd.DataFrame(topics_lda, columns=['Topic_ID', 'Keywords'])
    df_topics_lda.to_csv(filename + "_topics.csv", index=False)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence for {filename}: {coherence_lda}")
    # Callers hand the result to pyLDAvis.display, so the visualization is what is returned
    return vis



In [None]:
def train_and_save_lda(corpus, dictionary, num_topics, filename, keywords_lemmatized):
    """Train an LDA model, write its artifacts to disk, and return the pyLDAvis data.

    Files written (all prefixed with ``filename``): ``_vis.html`` (interactive
    visualization), ``_model`` (the gensim model) and ``_topics.csv`` (top 20
    words per topic). The model's c_v coherence is printed.

    Args:
        corpus: Corpus to train on (bag-of-words or TF-IDF weighted).
        dictionary: gensim Dictionary mapping token ids to tokens.
        num_topics: Number of topics to fit.
        filename: Path prefix for every output file.
        keywords_lemmatized: Tokenized documents used for the coherence score.

    Returns:
        The prepared pyLDAvis object, ready to be passed to ``pyLDAvis.display``.
    """
    # Use the requested topic count instead of a fixed value
    lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=100, workers=12)
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, filename + "_vis.html")
    lda_model.save(filename + "_model")
    # num_topics=-1 exports every topic; the default would stop at 20
    topics_lda = lda_model.print_topics(num_topics=-1, num_words=20)
    df_topics_lda = pd.DataFrame(topics_lda, columns=['Topic_ID', 'Keywords'])
    df_topics_lda.to_csv(filename + "_topics.csv", index=False)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence for {filename}: {coherence_lda}")
    return vis



In [None]:
# Fit LDA on the bag-of-words corpus (7 passed as num_topics); outputs use the 'lda_regular' prefix
vis_regular = train_and_save_lda(corpus, dictionary, 7, 'lda_regular', keywords_lemmatized)
# Show the returned object as this cell's output
pyLDAvis.display(vis_regular)

In [None]:

# Same run on the TF-IDF-weighted corpus; outputs use the 'lda_tfidf' prefix
vis_tfidf = train_and_save_lda(corpus_tfidf, dictionary, 7, 'lda_tfidf', keywords_lemmatized)
# Show the returned object as this cell's output
pyLDAvis.display(vis_tfidf)


## Using HDP

In [None]:
def train_and_save_hdp(corpus, dictionary, filename, keywords_lemmatized):
    """Fit an HDP model and export its LDA approximation, topics and coherence.

    Files written (all prefixed with ``filename``): ``_hdp_vis.html``,
    ``_hdp_model`` and ``_hdp_topics.csv`` (10 words per exported topic).
    The c_v coherence of the LDA approximation is printed.

    Returns:
        The prepared pyLDAvis object for the LDA approximation.
    """
    hdp = HdpModel(corpus, dictionary)
    # HDP chooses its own topic count; the LDA-format equivalent is what gets visualized and saved
    approx_lda = hdp.suggested_lda_model()

    vis = pyLDAvis.gensim_models.prepare(approx_lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename + "_hdp_vis.html")
    pyLDAvis.display(vis)
    approx_lda.save(filename + "_hdp_model")

    topic_rows = approx_lda.print_topics(num_words=10)
    pd.DataFrame(topic_rows, columns=['Topic_ID', 'Keywords']).to_csv(filename + "_hdp_topics.csv", index=False)

    scorer = CoherenceModel(model=approx_lda, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_hdp = scorer.get_coherence()
    print(f"Coherence for {filename} HDP: {coherence_hdp}")
    return vis


In [None]:
# Fit HDP on the bag-of-words corpus; outputs use the 'hdp_regular' prefix
vis_hdp_regular = train_and_save_hdp(corpus, dictionary, 'hdp_regular', keywords_lemmatized)
# Show the returned visualization as this cell's output
pyLDAvis.display(vis_hdp_regular)

In [None]:
# Fit HDP on the TF-IDF-weighted corpus; outputs use the 'hdp_tfidf' prefix
vis_hdp_tfidf = train_and_save_hdp(corpus_tfidf, dictionary, 'hdp_tfidf', keywords_lemmatized)
# Show the returned visualization as this cell's output
pyLDAvis.display(vis_hdp_tfidf)

# Adjusting n-grams

In [None]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import Phrases, TfidfModel, CoherenceModel, LdaMulticore, HdpModel
import pyLDAvis
import pyLDAvis.gensim_models
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
pyLDAvis.enable_notebook()

In [None]:
# Initialize Lemmatizer and define helper functions. Make sure variables and functions up to here are reset.
# NOTE(review): lemmatize_sentence below uses `re`, which this section's import cell does not import;
# after a reset it is only available if the first import cell of the notebook is run again — confirm.

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'N' -> noun, 'V' -> verb, 'R' -> adverb. Any other leading character
    falls back to noun.
    """
    leading = treebank_tag[0]
    if leading == 'J':
        return wordnet.ADJ
    if leading == 'V':
        return wordnet.VERB
    if leading == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag both resolve to noun
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Clean a sentence and return its tokens lemmatized by part of speech.

    Every character other than ASCII letters, digits, underscores, apostrophes
    and spaces is deleted; the remainder is tokenized, POS-tagged, and each
    token is lemmatized with the WordNet POS derived from its tag.
    """
    # Apostrophes are kept so possessives stay intact; underscores are kept too
    cleaned = re.sub(r"[^a-zA-Z0-9_' ]", "", sentence)
    tagged = nltk.pos_tag(word_tokenize(cleaned))
    result = []
    for word, treebank_tag in tagged:
        result.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return result

def preprocess_keywords(data):
    """Turn the 'terms' column of ``data`` into lemmatized token lists.

    Each row is whitespace-split, stripped of gensim stopwords, passed through
    two rounds of ``Phrases`` (bigrams, then trigrams) and finally lemmatized
    with ``lemmatize_sentence``.

    Rows whose 'terms' value is missing (or is not a string) become empty token
    lists, so the result always holds exactly one list per input row.

    Args:
        data: DataFrame with a 'terms' column.

    Returns:
        list[list[str]]: lemmatized tokens, one list per row of ``data``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    split_rows = data['terms'].str.split().tolist()
    # .str.split() yields NaN (a float) for missing / non-string cells; iterating
    # over that would raise TypeError, so such rows are treated as empty.
    keywords = [
        [word for word in row if word not in stopwords] if isinstance(row, list) else []
        for row in split_rows
    ]
    bigram_transformer = Phrases(keywords, min_count=5, threshold=100)
    keywords_bigram = [bigram_transformer[keyword] for keyword in keywords]
    # Second pass over the bigram output lets pairs grow into trigrams
    trigram_transformer = Phrases(keywords_bigram, min_count=5, threshold=100)
    keywords_trigram = [trigram_transformer[keyword] for keyword in keywords_bigram]
    return [lemmatize_sentence(' '.join(keyword)) for keyword in keywords_trigram]

# Load the data (preprocess_keywords reads its 'terms' column)
data = pd.read_csv("corrected_data_symspell.csv")

# Preprocess the data to get lemmatized keywords: a list of token lists built from data['terms']
keywords_lemmatized = preprocess_keywords(data)

In [None]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(keywords_lemmatized)
# Prune the vocabulary: drop tokens seen in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of every document over the pruned vocabulary
corpus = [dictionary.doc2bow(term) for term in keywords_lemmatized]

# TF-IDF Weighting
tfidf_model = TfidfModel(corpus)
# Indexing the fitted model with the corpus gives its TF-IDF-weighted counterpart
corpus_tfidf = tfidf_model[corpus]


In [None]:
def train_and_save_lda(corpus, dictionary, num_topics, filename, keywords_lemmatized):
    """Train an LDA model, write its artifacts to disk, and return the pyLDAvis data.

    Files written (all prefixed with ``filename``): ``_vis.html`` (interactive
    visualization), ``_model`` (the gensim model) and ``_topics.csv`` (top 10
    words per topic). The model's c_v coherence is printed.

    Args:
        corpus: Corpus to train on (bag-of-words or TF-IDF weighted).
        dictionary: gensim Dictionary mapping token ids to tokens.
        num_topics: Number of topics to fit.
        filename: Path prefix for every output file.
        keywords_lemmatized: Tokenized documents used for the coherence score.

    Returns:
        The prepared pyLDAvis object, ready to be passed to ``pyLDAvis.display``.
    """
    # Use the requested topic count instead of a fixed value
    lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=100, workers=12)
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(vis, filename + "_vis.html")
    lda_model.save(filename + "_model")
    # num_topics=-1 exports every topic; the default would stop at 20
    topics_lda = lda_model.print_topics(num_topics=-1, num_words=10)
    df_topics_lda = pd.DataFrame(topics_lda, columns=['Topic_ID', 'Keywords'])
    df_topics_lda.to_csv(filename + "_topics.csv", index=False)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Coherence for {filename}: {coherence_lda}")
    # Callers hand the result to pyLDAvis.display, so the visualization is what is returned
    return vis


In [None]:
# Fit LDA on the bag-of-words corpus (7 passed as num_topics); outputs use the 'lda_regular' prefix
vis_regular = train_and_save_lda(corpus, dictionary, 7, 'lda_regular', keywords_lemmatized)
# Show the returned object as this cell's output
pyLDAvis.display(vis_regular)

In [None]:

# Same run on the TF-IDF-weighted corpus; outputs use the 'lda_tfidf' prefix
vis_tfidf = train_and_save_lda(corpus_tfidf, dictionary, 7, 'lda_tfidf', keywords_lemmatized)
# Show the returned object as this cell's output
pyLDAvis.display(vis_tfidf)



### With HDP

In [None]:
def train_and_save_hdp(corpus, dictionary, filename, keywords_lemmatized):
    """Fit an HDP model and export its LDA approximation, topics and coherence.

    Files written (all prefixed with ``filename``): ``_hdp_vis.html``,
    ``_hdp_model`` and ``_hdp_topics.csv`` (10 words per exported topic).
    The c_v coherence of the LDA approximation is printed.

    Returns:
        The prepared pyLDAvis object for the LDA approximation.
    """
    hdp = HdpModel(corpus, dictionary)
    # HDP chooses its own topic count; the LDA-format equivalent is what gets visualized and saved
    approx_lda = hdp.suggested_lda_model()

    vis = pyLDAvis.gensim_models.prepare(approx_lda, corpus, dictionary)
    pyLDAvis.save_html(vis, filename + "_hdp_vis.html")
    pyLDAvis.display(vis)
    approx_lda.save(filename + "_hdp_model")

    topic_rows = approx_lda.print_topics(num_words=10)
    pd.DataFrame(topic_rows, columns=['Topic_ID', 'Keywords']).to_csv(filename + "_hdp_topics.csv", index=False)

    scorer = CoherenceModel(model=approx_lda, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    coherence_hdp = scorer.get_coherence()
    print(f"Coherence for {filename} HDP: {coherence_hdp}")
    return vis


In [None]:
# Fit HDP on the bag-of-words corpus; outputs use the 'hdp_regular' prefix
vis_hdp_regular = train_and_save_hdp(corpus, dictionary, 'hdp_regular', keywords_lemmatized)
# Show the returned visualization as this cell's output
pyLDAvis.display(vis_hdp_regular)

In [None]:
# Fit HDP on the TF-IDF-weighted corpus; outputs use the 'hdp_tfidf' prefix
vis_hdp_tfidf = train_and_save_hdp(corpus_tfidf, dictionary, 'hdp_tfidf', keywords_lemmatized)
# Show the returned visualization as this cell's output
pyLDAvis.display(vis_hdp_tfidf)

# Iterative Training for LDA

## Quick training to fine-tune

In [7]:
import pandas as pd
import gensim
from gensim import corpora, models
from gensim.models import TfidfModel, CoherenceModel, LdaMulticore, HdpModel, Phrases
from gensim.models.phrases import Phraser
import pyLDAvis.gensim_models
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
pyLDAvis.enable_notebook()
import re
import logging

In [8]:
# Adjusting term detection to prioritize bi-grams and tri-grams
# (preprocess_keywords in this cell lowers the Phrases min_count/threshold to 3/50)
# Cell after was the initial attempt

from gensim.models.phrases import Phraser  # NOTE(review): Phraser is not used in the code visible in this notebook

# Shared lemmatizer instance used by lemmatize_sentence
lemmatizer = WordNetLemmatizer()

# parts of speech tagging
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'N' -> noun, 'V' -> verb, 'R' -> adverb. Any other leading character
    falls back to noun.
    """
    leading = treebank_tag[0]
    if leading == 'J':
        return wordnet.ADJ
    if leading == 'V':
        return wordnet.VERB
    if leading == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag both resolve to noun
    return wordnet.NOUN

# lemmatizing tokens and cleaning data
def lemmatize_sentence(sentence):
    """Clean a sentence and return its tokens lemmatized by part of speech.

    Every character other than ASCII letters, digits, underscores, apostrophes
    and spaces is deleted; the remainder is tokenized, POS-tagged, and each
    token is lemmatized with the WordNet POS derived from its tag.
    """
    # Apostrophes are kept so possessives stay intact; underscores are kept too
    cleaned = re.sub(r"[^a-zA-Z0-9_' ]", "", sentence)
    tagged = nltk.pos_tag(word_tokenize(cleaned))
    result = []
    for word, treebank_tag in tagged:
        result.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return result

# In case lemmatization and stopwords missed things plus more weight on bi-grams and tri-grams
def preprocess_keywords(data):
    """Turn the 'terms' column of ``data`` into lemmatized token lists.

    Each row is lower-cased and whitespace-split; tokens are kept only if they
    are not gensim stopwords, are longer than 2 characters and are purely
    alphanumeric. Two rounds of ``Phrases`` (min_count=3, threshold=50) then
    merge collocations into bigrams and trigrams before lemmatization.

    Rows whose 'terms' value is missing (or is not a string) become empty token
    lists, so the result always holds exactly one list per input row.

    Args:
        data: DataFrame with a 'terms' column.

    Returns:
        list[list[str]]: lemmatized tokens, one list per row of ``data``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    def keep(word):
        return word not in stopwords and len(word) > 2 and word.isalnum()

    split_rows = data['terms'].str.lower().str.split().tolist()  # Convert to lowercase
    # The .str accessors yield NaN (a float) for missing / non-string cells; iterating
    # over that would raise TypeError, so such rows are treated as empty.
    keywords = [
        [word for word in row if keep(word)] if isinstance(row, list) else []
        for row in split_rows
    ]

    # Better bi-grams
    bigram_transformer = Phrases(keywords, min_count=3, threshold=50)
    keywords_bigram = [bigram_transformer[keyword] for keyword in keywords]

    # Better tri-gram: a second pass over the bigram output lets pairs grow into trigrams
    trigram_transformer = Phrases(keywords_bigram, min_count=3, threshold=50)
    keywords_trigram = [trigram_transformer[keyword] for keyword in keywords_bigram]

    return [lemmatize_sentence(' '.join(keyword)) for keyword in keywords_trigram]


# Load the data (preprocess_keywords reads its 'terms' column)
data = pd.read_csv("corrected_data_symspell.csv")


# Preprocess the data to get lemmatized keywords: a list of token lists built from data['terms']
keywords_lemmatized = preprocess_keywords(data)

In [9]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(keywords_lemmatized)
# Prune the vocabulary: drop tokens seen in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of every document over the pruned vocabulary
corpus = [dictionary.doc2bow(term) for term in keywords_lemmatized]

# TF-IDF Weighting
tfidf_model = TfidfModel(corpus)
# Indexing the fitted model with the corpus gives its TF-IDF-weighted counterpart
corpus_tfidf = tfidf_model[corpus]


In [10]:
# Quicker initial run to test performance

# Setup logging: records are written to lda_training.log
logging.basicConfig(filename="lda_training.log",
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.WARNING) # Using WARNING level initially
# Root logger; train_and_save_lda below raises its level to INFO while it runs
logger = logging.getLogger()

def train_and_save_lda(corpus_tfidf, dictionary, num_topics, alpha, eta, passes, filename, keywords_lemmatized):
    """Train one LDA configuration, save it, log its coherence, and return the score.

    The module-level ``logger`` is switched to INFO for the duration of the call
    and put back to its previous level afterwards, even if training or scoring
    raises.

    Args:
        corpus_tfidf: Corpus to train on.
        dictionary: gensim Dictionary mapping token ids to tokens.
        num_topics: Number of topics to fit.
        alpha: Document-topic prior passed to LdaMulticore.
        eta: Topic-word prior passed to LdaMulticore.
        passes: Number of passes over the corpus.
        filename: Path prefix; the model is saved as ``filename + ".lda"``.
        keywords_lemmatized: Tokenized documents used for the c_v coherence.

    Returns:
        The c_v coherence of the trained model.
    """
    previous_level = logger.level
    # Set the logging level to INFO inside this function
    logger.setLevel(logging.INFO)
    try:
        lda_model = LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                                 passes=passes, iterations=50, alpha=alpha, eta=eta, workers=2)

        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Save the model
        lda_model.save(filename + ".lda")

        # Log the coherence value
        logger.info(f"Saved {filename} with coherence {coherence_lda}")

        return coherence_lda
    finally:
        # Always restore the level so a failed run does not leave INFO logging switched on
        logger.setLevel(previous_level)

# Define parameters range (3 * 3 * 4 * 2 = 72 combinations)
num_topics_list = [5, 10, 20]
alpha_list = ['symmetric', 0.1, 0.9] # Reduced to 3 values for quicker sweep
eta_list = ['auto', 'symmetric', 0.1, 0.9] # Reduced to 4 values for quicker sweep
passes_list = [10, 15] # Reduced for early stopping

# Iterate and train: one model per combination, saved under a name that encodes the combination.
# The coherence returned by train_and_save_lda is not kept here.
for num_topics in num_topics_list:
    for alpha in alpha_list:
        for eta in eta_list:
            for passes in passes_list:
                filename = f"lda_nt{num_topics}_alpha{alpha}_eta{eta}_passes{passes}"
                %time train_and_save_lda(corpus_tfidf, dictionary, num_topics, alpha, eta, passes, filename, keywords_lemmatized) # Profiling


CPU times: user 1.99 s, sys: 681 ms, total: 2.67 s
Wall time: 8.23 s
CPU times: user 2.18 s, sys: 590 ms, total: 2.77 s
Wall time: 8.63 s
CPU times: user 1.5 s, sys: 395 ms, total: 1.9 s
Wall time: 7.31 s
CPU times: user 2.29 s, sys: 727 ms, total: 3.02 s
Wall time: 7.74 s
CPU times: user 1.97 s, sys: 609 ms, total: 2.58 s
Wall time: 7.1 s
CPU times: user 2.52 s, sys: 926 ms, total: 3.44 s
Wall time: 9.18 s
CPU times: user 1.98 s, sys: 636 ms, total: 2.62 s
Wall time: 9.3 s
CPU times: user 2.46 s, sys: 749 ms, total: 3.21 s
Wall time: 9.37 s
CPU times: user 1.78 s, sys: 632 ms, total: 2.42 s
Wall time: 6.57 s
CPU times: user 2.39 s, sys: 732 ms, total: 3.12 s
Wall time: 8.85 s
CPU times: user 1.85 s, sys: 578 ms, total: 2.43 s
Wall time: 7.08 s
CPU times: user 2.36 s, sys: 674 ms, total: 3.04 s
Wall time: 9.51 s
CPU times: user 1.85 s, sys: 588 ms, total: 2.44 s
Wall time: 7.89 s
CPU times: user 2.27 s, sys: 731 ms, total: 3 s
Wall time: 8.03 s
CPU times: user 1.66 s, sys: 579 ms, tota

# Fine-tuned run. Run this using the results from the previous cells

In [None]:
# Adjusting term detection to prioritize bi-grams and tri-grams
# (preprocess_keywords in this cell lowers the Phrases min_count/threshold to 3/50)
# Cell after was the initial attempt

from gensim.models.phrases import Phraser  # NOTE(review): Phraser is not used in the code visible in this notebook

# Shared lemmatizer instance used by lemmatize_sentence
lemmatizer = WordNetLemmatizer()

# parts of speech tagging
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> adjective,
    'N' -> noun, 'V' -> verb, 'R' -> adverb. Any other leading character
    falls back to noun.
    """
    leading = treebank_tag[0]
    if leading == 'J':
        return wordnet.ADJ
    if leading == 'V':
        return wordnet.VERB
    if leading == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag both resolve to noun
    return wordnet.NOUN

# lemmatizing tokens and cleaning data
def lemmatize_sentence(sentence):
    """Clean a sentence and return its tokens lemmatized by part of speech.

    Every character other than ASCII letters, digits, underscores, apostrophes
    and spaces is deleted; the remainder is tokenized, POS-tagged, and each
    token is lemmatized with the WordNet POS derived from its tag.
    """
    # Apostrophes are kept so possessives stay intact; underscores are kept too
    cleaned = re.sub(r"[^a-zA-Z0-9_' ]", "", sentence)
    tagged = nltk.pos_tag(word_tokenize(cleaned))
    result = []
    for word, treebank_tag in tagged:
        result.append(lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag)))
    return result

# In case lemmatization and stopwords missed things plus more weight on bi-grams and tri-grams
def preprocess_keywords(data):
    """Turn the 'terms' column of ``data`` into lemmatized token lists.

    Each row is lower-cased and whitespace-split; tokens are kept only if they
    are not gensim stopwords, are longer than 2 characters and are purely
    alphanumeric. Two rounds of ``Phrases`` (min_count=3, threshold=50) then
    merge collocations into bigrams and trigrams before lemmatization.

    Rows whose 'terms' value is missing (or is not a string) become empty token
    lists, so the result always holds exactly one list per input row.

    Args:
        data: DataFrame with a 'terms' column.

    Returns:
        list[list[str]]: lemmatized tokens, one list per row of ``data``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    def keep(word):
        return word not in stopwords and len(word) > 2 and word.isalnum()

    split_rows = data['terms'].str.lower().str.split().tolist()  # Convert to lowercase
    # The .str accessors yield NaN (a float) for missing / non-string cells; iterating
    # over that would raise TypeError, so such rows are treated as empty.
    keywords = [
        [word for word in row if keep(word)] if isinstance(row, list) else []
        for row in split_rows
    ]

    # Better bi-grams
    bigram_transformer = Phrases(keywords, min_count=3, threshold=50)
    keywords_bigram = [bigram_transformer[keyword] for keyword in keywords]

    # Better tri-gram: a second pass over the bigram output lets pairs grow into trigrams
    trigram_transformer = Phrases(keywords_bigram, min_count=3, threshold=50)
    keywords_trigram = [trigram_transformer[keyword] for keyword in keywords_bigram]

    return [lemmatize_sentence(' '.join(keyword)) for keyword in keywords_trigram]


# Load the data (preprocess_keywords reads its 'terms' column)
data = pd.read_csv("corrected_data_symspell.csv")


# Preprocess the data to get lemmatized keywords: a list of token lists built from data['terms']
keywords_lemmatized = preprocess_keywords(data)


In [None]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(keywords_lemmatized)
# Prune the vocabulary: drop tokens seen in fewer than 5 documents or in more than 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of every document over the pruned vocabulary
corpus = [dictionary.doc2bow(term) for term in keywords_lemmatized]

# TF-IDF Weighting
tfidf_model = TfidfModel(corpus)
# Indexing the fitted model with the corpus gives its TF-IDF-weighted counterpart
corpus_tfidf = tfidf_model[corpus]


In [None]:
# Iterate over 5-20 topics, with varying alpha and eta (beta), and 10-20 passes. Including logging
import logging
# Setup logging. basicConfig does nothing when the root logger already has handlers
# (for example when the quick-sweep cell ran earlier in this kernel), so the level is
# also set explicitly below; otherwise the INFO coherence records would be dropped.
logging.basicConfig(filename="lda_training.log",
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def train_and_save_lda(corpus_tfidf, dictionary, num_topics, alpha, eta, passes, filename, keywords_lemmatized):
    """Train one LDA configuration, save it, log its coherence, and return the score.

    The model is fitted with 100 iterations on 12 workers, saved as
    ``filename + ".lda"``, and scored with c_v coherence against
    ``keywords_lemmatized``; the score is logged at INFO level and returned.
    """
    training_options = dict(num_topics=num_topics, id2word=dictionary, passes=passes,
                            iterations=100, alpha=alpha, eta=eta, workers=12)
    model = LdaMulticore(corpus_tfidf, **training_options)

    # Score before saving, matching the order in which results are produced and persisted
    score = CoherenceModel(model=model, texts=keywords_lemmatized,
                           dictionary=dictionary, coherence='c_v').get_coherence()

    model.save(filename + ".lda")
    logger.info(f"Saved {filename} with coherence {score}")
    return score

# Define parameters range
num_topics_list = [5, 10, 15, 20]  # 15 included: the model loaded after this sweep is an nt15 run
alpha_list = ['symmetric', 0.1, 0.3, 0.9]
# The only string priors gensim accepts for eta are 'symmetric' and 'auto';
# 'asymmetric' is valid for alpha only and raises ValueError when used for eta.
eta_list = ['auto', 'symmetric', 0.1, 0.3, 0.9]
passes_list = [10, 15, 20]

# Iterate and train, keeping every coherence score so the best run can be identified
sweep_coherence = {}
for num_topics in num_topics_list:
    for alpha in alpha_list:
        for eta in eta_list:
            for passes in passes_list:
                filename = f"lda_nt{num_topics}_alpha{alpha}_eta{eta}_passes{passes}"
                sweep_coherence[filename] = train_and_save_lda(
                    corpus_tfidf, dictionary, num_topics, alpha, eta, passes, filename, keywords_lemmatized)

best_filename = max(sweep_coherence, key=sweep_coherence.get)
print(f"Best model: {best_filename}.lda (coherence {sweep_coherence[best_filename]})")

In [None]:
# Load the model with highest coherence

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Load the best LDA model from disk
# NOTE(review): the file name is hard-coded (it encodes num_topics=15, alpha=0.1, eta='symmetric',
# passes=20); confirm that a sweep run actually wrote this file.
best_lda_model = LdaMulticore.load("lda_nt15_alpha0.1_etasymmetric_passes20.lda")

# Prepare the visualization data
# NOTE(review): the sweep trains on corpus_tfidf while the bag-of-words corpus is passed here — confirm this is intended.
vis_data = gensimvis.prepare(best_lda_model, corpus, dictionary)

# Visualize
pyLDAvis.display(vis_data)


## Iterative Training for HDP

In [None]:
def train_and_save_hdp(corpus, dictionary, gamma, alpha, kappa, tau, filename, keywords_lemmatized):
    """Fit one HDP configuration, save it, and return its c_v coherence.

    The model is saved as ``filename + ".hdp"`` after the coherence has been
    computed against ``keywords_lemmatized``.
    """
    hyperparameters = {'gamma': gamma, 'alpha': alpha, 'kappa': kappa, 'tau': tau}
    model = HdpModel(corpus, id2word=dictionary, **hyperparameters)

    scorer = CoherenceModel(model=model, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
    score = scorer.get_coherence()

    model.save(filename + ".hdp")
    # The caller prints/logs the score
    return score

# Define your parameters range (3 * 3 * 3 * 2 = 54 combinations)
gamma_list = [0.5, 1.0, 1.5]
# NOTE(review): this rebinds alpha_list, the name the LDA sweep also uses for its own values
alpha_list = [0.5, 1.0, 1.5]
kappa_list = [0.5, 1.0, 1.5]
tau_list = [32.0, 64.0]

# Iterate and train: fit, score and save one HDP model per combination (trained on the bag-of-words corpus)
for gamma in gamma_list:
    for alpha in alpha_list:
        for kappa in kappa_list:
            for tau in tau_list:
                filename = f"hdp_gamma{gamma}_alpha{alpha}_kappa{kappa}_tau{tau}"
                coherence = train_and_save_hdp(corpus, dictionary, gamma, alpha, kappa, tau, filename, keywords_lemmatized)
                print(f"Saved {filename} with coherence {coherence}")


In [None]:
from gensim.models import HdpModel

# Reload one saved HDP run.
# NOTE(review): the file name is hard-coded — presumably the best-scoring run of the HDP sweep; confirm.
hdp_model = HdpModel.load("hdp_gamma1.5_alpha1.0_kappa1.0_tau32.0.hdp")

# LDA-format equivalent of the HDP model; the next cell visualizes it
lda_model = hdp_model.suggested_lda_model()

In [None]:
import pyLDAvis.gensim_models

# Prepare the visualization data for the LDA-format model derived from the loaded HDP model
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Display the visualization (note: vis_data is the same name the LDA visualization cell assigns)
pyLDAvis.display(vis_data)

## Exporting LDA Topics and Coherence

In [None]:
import pandas as pd

# Number of topics and terms
num_topics = best_lda_model.num_topics  # taken from the model so the export always matches what it contains
num_words = 20

# Extract the topics from the model as (topic_id, [(word, weight), ...]) pairs
topics = best_lda_model.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)

# Keep each topic's own id next to its terms instead of assuming a fixed 1..N range
topic_ids = [topic_id for topic_id, _ in topics]
topic_terms = [[word for word, _ in terms] for _, terms in topics]

# Score every topic with a single CoherenceModel; building one model per topic
# would recompute the corpus statistics once per topic.
cm = CoherenceModel(topics=topic_terms, texts=keywords_lemmatized, dictionary=dictionary, coherence='c_v')
topic_coherences = cm.get_coherence_per_topic()

# Create a DataFrame to store the topics, terms, and coherence
df = pd.DataFrame({
    'Topic_Num': [topic_id + 1 for topic_id in topic_ids],  # 1-based numbering
    'Topic_Terms': [' '.join(terms) for terms in topic_terms],
    'Coherence': topic_coherences
})

# Save the DataFrame to a CSV file
df.to_csv("topics_and_coherence.csv", index=False)
