In [6]:
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
from wordcloud import WordCloud

import nltk
from nltk import word_tokenize
nltk.download('punkt')

from gensim import corpora
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.matutils import corpus2dense, corpus2csc

import pyLDAvis
import pyLDAvis.lda_model

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\garim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Latent Semantic Analysis (LSA)

In [17]:
# Read in the pre-processed data
data = pd.read_csv("data/preprocessed.csv")
data_spotify = pd.read_csv("data/preprocessed_spotify.csv")
documents = data['clean_lyrics']

# Vectorize documents using TF-IDF
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=0.01, max_df=0.6)
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Range of topic counts to evaluate
topic_range = range(2, 21)

# Fixed seed so the sweep gives the same scores on every Restart & Run All
# (same value the t-SNE cell uses).
RANDOM_STATE = 0

# Per-topic-count fit metrics: LSA explained-variance ratio (summed over
# components), NMF reconstruction error, and c_v coherence for both models.
lsa_explained_variance = []
nmf_reconstruction_error = []
lsa_coherence = []
nmf_coherence = []

# Fitted LSA and NMF models, one per entry of topic_range
lsa_models = []
nmf_models = []

# Document-topic matrices keyed by number of topics (used for t-SNE later)
lsa_topics = {}
nmf_topics = {}

# Reference corpus and dictionary for the coherence calculation. Neither
# depends on the number of topics, so they are built once, outside the loop.
# NOTE(review): the models are fitted on data['clean_lyrics'] but coherence is
# scored against data_spotify['clean_lyrics'], tokenised with str.split rather
# than word_tokenize -- confirm this mismatch is intentional.
corpus = [text.split() for text in data_spotify['clean_lyrics']]
dictionary = Dictionary(corpus)

for n_topics in tqdm(topic_range):
    # LSA model
    lsa_model = TruncatedSVD(n_components=n_topics, random_state=RANDOM_STATE)
    lsa_topic_matrix = lsa_model.fit_transform(X)
    lsa_explained_variance.append(lsa_model.explained_variance_ratio_.sum())
    # LSA coherence (approximation using gensim): every topic is handed over as
    # the full vocabulary ranked by descending component weight.
    lsa_topic_words = [feature_names[np.argsort(topic)[::-1]].tolist()
                       for topic in lsa_model.components_]
    coherence_model_lda = CoherenceModel(topics=lsa_topic_words, texts=corpus,
                                         dictionary=dictionary, coherence='c_v')
    lsa_coherence.append(coherence_model_lda.get_coherence())
    lsa_models.append(lsa_model)
    del lsa_model

    # NMF model
    nmf_model = NMF(n_components=n_topics, random_state=RANDOM_STATE)
    nmf_topic_matrix = nmf_model.fit_transform(X)
    nmf_reconstruction_error.append(nmf_model.reconstruction_err_)
    # NMF coherence (approximation using gensim), same ranking as for LSA
    nmf_topic_words = [feature_names[np.argsort(topic)[::-1]].tolist()
                       for topic in nmf_model.components_]
    coherence_model_lda = CoherenceModel(topics=nmf_topic_words, texts=corpus,
                                         dictionary=dictionary, coherence='c_v')
    nmf_coherence.append(coherence_model_lda.get_coherence())
    nmf_models.append(nmf_model)
    del nmf_model

    # Store topic matrices for t-SNE visualization
    lsa_topics[n_topics] = lsa_topic_matrix
    nmf_topics[n_topics] = nmf_topic_matrix

# Results DataFrame.
# The two "Perplexity" columns do not hold perplexity: they carry the LSA
# explained-variance ratio and the NMF reconstruction error. The labels are
# kept unchanged so any code that indexes these columns keeps working.
results_df = pd.DataFrame({
    'Topics': topic_range,
    'LSA Perplexity': lsa_explained_variance,
    'NMF Perplexity': nmf_reconstruction_error,
    'LSA Coherence': lsa_coherence,
    'NMF Coherence': nmf_coherence
})

results_df




   Topics  LSA Perplexity  NMF Perplexity  LSA Coherence  NMF Coherence
0       2       -0.022179      -71.766468       0.403511       0.388935


In [None]:
# Visualization using t-SNE
def plot_tsne(topic_matrices, model_name):
    """Embed each topic matrix in 2-D with t-SNE and overlay all embeddings in one scatter plot.

    Parameters
    ----------
    topic_matrices : mapping
        Maps a number of topics to the matrix that is passed to
        ``TSNE.fit_transform``.
    model_name
        Label interpolated into the figure title and every legend entry.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Run every t-SNE fit before any drawing starts; each fit is seeded with 0.
    embeddings = {
        k: TSNE(n_components=2, random_state=0).fit_transform(mat)
        for k, mat in topic_matrices.items()
    }

    _, ax = plt.subplots(figsize=(14, 8))
    # One scatter series per topic count, all drawn on the same axes.
    for k, points in embeddings.items():
        ax.scatter(points[:, 0], points[:, 1], label=f'{model_name} {k} topics')

    ax.set_title(f't-SNE Visualization of {model_name} Topics')
    ax.set_xlabel('Component 1')
    ax.set_ylabel('Component 2')
    ax.legend()
    plt.show()

# Draw one t-SNE figure per model family, LSA first and then NMF
plot_tsne(topic_matrices=lsa_topics, model_name='LSA')
plot_tsne(topic_matrices=nmf_topics, model_name='NMF')