Application of LDA and LSA:

In [None]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.models import LdaModel
import matplotlib.pyplot as plt

# Function to perform LDA
def perform_lda(documents, num_topics=5):
    """
    Fit a gensim LDA topic model on whitespace-tokenized documents.

    Parameters:
    - documents: Iterable of lemmatized documents, each a single string
      whose tokens are separated by whitespace.
    - num_topics: Number of topics to extract.

    Returns:
    - lda_model: Trained LDA model.
    - corpus: Bag-of-words corpus the model was trained on (one entry per
      document, produced by dictionary.doc2bow).
    - dictionary: gensim Dictionary built from the tokenized documents.
    """
    # Tokenize on whitespace; the documents are expected to be pre-lemmatized
    tokenized_docs = []
    for document in documents:
        tokenized_docs.append(document.split())

    # Build the token <-> id mapping, then encode every document as bag-of-words
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = list(map(dictionary.doc2bow, tokenized_docs))

    # Train the LDA model with 15 passes over the corpus
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=15,
    )

    return lda_model, corpus, dictionary

# Function to perform LSA
def perform_lsa(documents, num_topics=5, random_state=None):
    """
    Perform Latent Semantic Analysis (LSA) on the provided documents.

    The documents are turned into a term-count matrix with CountVectorizer
    and then reduced with TruncatedSVD.

    Parameters:
    - documents: List of lemmatized documents (one string per document).
    - num_topics: Number of latent topics (SVD components) to extract.
    - random_state: Optional seed forwarded to TruncatedSVD. The default
      (None) keeps the previous, unseeded behaviour; pass an int to get
      repeatable results from run to run.

    Returns:
    - lsa_model: Document-topic matrix returned by svd.fit_transform
      (this is the transformed data, not a model object).
    - svd: Fitted TruncatedSVD instance; svd.components_ holds the
      per-topic term loadings.
    - vectorizer: Fitted CountVectorizer, needed to map column indices of
      svd.components_ back to terms.
    """
    # Convert documents to a document-term count matrix
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)

    # Project the count matrix onto num_topics latent dimensions
    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    lsa_model = svd.fit_transform(X)

    return lsa_model, svd, vectorizer

# Example usage
def analyze_topics(global_attributes_dict, num_topics=5, num_terms=3):
    """
    Analyze topics using LDA and LSA and print the results.

    Parameters:
    - global_attributes_dict: Mapping whose values are sequences of string
      tokens; each non-empty value is joined with spaces into one document.
      The keys are not used.
    - num_topics: Number of topics to extract with each method (default 5).
    - num_terms: Number of highest-loading terms to print per LSA topic
      (default 3).

    Returns:
    - None. Results are written to standard output.

    Raises:
    - ValueError: If the mapping yields no non-empty documents.
    """
    # Prepare one space-joined document per non-empty entry
    documents = [' '.join(attrs) for attrs in global_attributes_dict.values() if attrs]
    if not documents:
        # Fail early with a clear message instead of deep inside the model code
        raise ValueError(
            "No non-empty documents found in global_attributes_dict; nothing to analyze."
        )

    # Perform LDA (the corpus and dictionary are not needed for printing)
    lda_model, _corpus, _dictionary = perform_lda(documents, num_topics=num_topics)

    # Display the topics found by LDA
    print("LDA Topics:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic {idx + 1}: {topic}")

    # Perform LSA (the document-topic matrix is not needed for printing)
    _doc_topic_matrix, svd, vectorizer = perform_lsa(documents, num_topics=num_topics)

    # Display the topics found by LSA
    print("\nLSA Topics:")
    terms = vectorizer.get_feature_names_out()
    for topic_number, component in enumerate(svd.components_, start=1):
        # argsort is ascending, so reverse it to list the strongest term first
        top_indices = component.argsort()[::-1][:num_terms]
        summary = " + ".join(f"{terms[j]} * {component[j]:.4f}" for j in top_indices)
        print(f"Topic {topic_number}: {summary}")

# Run the LDA/LSA topic analysis and print the topics when this cell executes.
# NOTE(review): global_attributes_dict is not defined in this cell; presumably
# it is created by an earlier notebook cell — confirm before running in isolation.
analyze_topics(global_attributes_dict)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer

# Compare LDA and LSA on a small sample corpus using the same c_v coherence
# measure for both, then plot and print the two scores.

# Sample text data
documents = [
    "Machine learning is great for data analysis.",
    "Natural language processing helps in understanding human languages.",
    "Deep learning is a subset of machine learning.",
    "Data science combines statistics and machine learning.",
    "Artificial intelligence is the future of technology."
]

# Number of top terms per topic used when scoring coherence (same for both models)
TOP_N_TERMS = 10
# Fixed seed so repeated runs of the cell give the same topics and scores
RANDOM_STATE = 42

# Preprocessing the text: document-term count matrix used by LSA
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Tokenize with the vectorizer's own analyzer so LDA and LSA share a vocabulary.
# A plain str.split() would keep punctuation and case (e.g. "analysis."), and
# those tokens would not match the vectorizer's terms.
analyzer = vectorizer.build_analyzer()
corpus = [analyzer(doc) for doc in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(corpus)

# Bag-of-words corpus for LDA. It must be a list, not a generator: with
# passes=10 the corpus is iterated repeatedly, and a generator would be
# exhausted after the first pass.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus]

# LDA Model
lda_model = LdaModel(
    corpus=bow_corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=RANDOM_STATE,
)
lda_coherence_model = CoherenceModel(
    model=lda_model,
    texts=corpus,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_N_TERMS,
)
lda_coherence_score = lda_coherence_model.get_coherence()

# LSA Model
svd = TruncatedSVD(n_components=2, random_state=RANDOM_STATE)
lsa_topic_matrix = svd.fit_transform(X)

# Score LSA with the same c_v measure by handing CoherenceModel the top terms
# of each SVD component. The earlier approach (mean pairwise correlation of
# 2-component document vectors) can only yield +1, -1 or NaN per pair, and it
# is not on the same scale as the LDA coherence.
terms = vectorizer.get_feature_names_out()
lsa_topics = [
    [str(terms[j]) for j in component.argsort()[::-1][:TOP_N_TERMS]]
    for component in svd.components_
]
lsa_coherence_model = CoherenceModel(
    topics=lsa_topics,
    texts=corpus,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_N_TERMS,
)
lsa_coherence_score = lsa_coherence_model.get_coherence()

# Display results
results = pd.DataFrame({
    'Model': ['LDA', 'LSA'],
    'Coherence Score': [lda_coherence_score, lsa_coherence_score]
})

# Visualize the comparison
plt.figure(figsize=(10, 5))
sns.barplot(x='Model', y='Coherence Score', data=results)
plt.title('Coherence Scores Comparison: LDA vs. LSA')
plt.ylim(0, max(results['Coherence Score']) + 0.1)
plt.show()

print(f'LDA Coherence Score: {lda_coherence_score:.4f}')
print(f'LSA Coherence Score: {lsa_coherence_score:.4f}')
