# Python Text Analysis: Topic Modeling Solutions

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Import fetcher function
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the training split of the 20 Newsgroups corpus as (texts, labels).
# Headers, footers and quoted replies are removed from every post.
full_data, labels = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers", "quotes"),
    shuffle=True,
    random_state=1,
    return_X_y=True,
)

In [None]:
# Number of documents to keep for the exercises
n_subsamples = 2000
# Work with the first n_subsamples documents only
data = full_data[:n_subsamples]

In [None]:
def plot_top_words(model, feature_names, n_top_words=10, n_row=2, n_col=5, normalize=False):
    """Plot the highest-weighted words of each topic as horizontal bar charts.

    One subplot is drawn per topic (row of ``model.components_``), filling an
    ``n_row`` x ``n_col`` grid in row-major order. Grid cells without a topic
    are hidden.

    Parameters
    ----------
    model : topic model object (e.g., LatentDirichletAllocation, NMF)
        The trained topic model. It should have a components_ attribute.
    feature_names : array-like of strings
        The names of each token, as a list or array, indexed like the columns
        of ``model.components_``.
    n_top_words : int
        The number of words to plot for each topic.
    n_row : int
        The number of rows in the plot.
    n_col : int
        The number of columns in the plot. ``n_row * n_col`` must be at least
        the number of topics; otherwise indexing the axes raises IndexError.
    normalize : boolean
        If True, rescales each topic so that its weights sum to 1 across
        tokens. Topics whose weights are all zero are left as zeros.

    Returns
    -------
    fig : matplotlib Figure
        The created figure.
    axes : 1-D array of matplotlib Axes
        The flattened grid of subplots.
    """
    # squeeze=False guarantees a 2-D array of Axes, so flatten() also works
    # for a single-row, single-column or 1x1 grid.
    fig, axes = plt.subplots(
        n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True, squeeze=False)
    axes = axes.flatten()
    components = model.components_
    # Normalize components, if necessary
    if normalize:
        totals = components.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 -> NaN for topics that were regularized to all zeros
        totals[totals == 0] = 1
        components = components / totals
    # Iterate over each topic
    for topic_idx, topic in enumerate(components):
        # Indices of the n_top_words largest weights, in descending order
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        # Get the token names
        top_features = [feature_names[i] for i in top_features_ind]
        # Get their values
        weights = topic[top_features_ind]

        # Plot the token weights as a bar plot
        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        # Put the highest-weighted word at the top of the chart
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)

        # Customize plot
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Hide any grid cells that did not receive a topic
    for unused_ax in axes[len(components):]:
        unused_ax.set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    return fig, axes

In [None]:
# Maximum vocabulary size kept by the vectorizer
n_tokens = 1000

# TF-IDF features: English stop words are removed, tokens appearing in more
# than 95% of documents or in fewer than 2 documents are dropped, and at most
# n_tokens terms are kept.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=n_tokens,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term matrix
tfidf = vectorizer.fit_transform(data)
# Token strings, in the same order as the columns of tfidf
tokens = vectorizer.get_feature_names_out()

---

### Challenge 1: Exploring Hyperparameters in NMF

The choice of 10 components was somewhat arbitrary. It was something we had to do before we could proceed with fitting the model to the data. This is what's known as a *hyperparameter*. There are other hyperparameters in `NMF`. For example, the regularization strength (`alpha`, split into `alpha_W` and `alpha_H` in recent versions of scikit-learn) specifies to what degree we should force values to be set equal to zero.

Try fitting the NMF with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. What do you notice?

---

In [None]:
# Make the regularization strength very large
n_components = 10
random_state = 1

# NOTE: the single `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2. `alpha_W` sets the penalty on W; H gets the same penalty
# because `alpha_H` defaults to "same".
nmf = NMF(
    n_components=n_components,
    random_state=random_state,
    alpha_W=100,
    l1_ratio=0.5,
    init='nndsvda',
    max_iter=500).fit(tfidf)

In [None]:
# Notice how every topic weight is now zero
fig, axes = plot_top_words(model=nmf, feature_names=tokens)
plt.show()

In [None]:
# Increase number of topics
n_components = 20
random_state = 1

# NOTE: the single `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`. The new penalties are
# multiplied by n_features (for W) and n_samples (for H), so the old value of
# 0.1 is divided accordingly to keep a comparable regularization strength.
nmf = NMF(
    n_components=n_components,
    random_state=random_state,
    alpha_W=0.1 / tfidf.shape[1],
    alpha_H=0.1 / tfidf.shape[0],
    l1_ratio=0.5,
    init='nndsvda',
    max_iter=500).fit(tfidf)

In [None]:
# More topics need more subplots: use 4 rows (n_col keeps its default)
fig, axes = plot_top_words(model=nmf, feature_names=tokens, n_row=4)
plt.show()

In [None]:
# Decrease number of topics
n_components = 5
random_state = 1

# NOTE: the single `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`. The new penalties are
# multiplied by n_features (for W) and n_samples (for H), so the old value of
# 0.1 is divided accordingly to keep a comparable regularization strength.
nmf = NMF(
    n_components=n_components,
    random_state=random_state,
    alpha_W=0.1 / tfidf.shape[1],
    alpha_H=0.1 / tfidf.shape[0],
    l1_ratio=0.5,
    init='nndsvda',
    max_iter=500).fit(tfidf)

In [None]:
# Fewer topics need fewer subplots: a single row is enough
fig, axes = plot_top_words(model=nmf, feature_names=tokens, n_row=1)
plt.show()

---

### Challenge 2: Exploring Hyperparameters in LDA

As in the case of NMF, try performing LDA with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. Use the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) as a guide to choose different hyperparameters.

---

In [None]:
# LDA is fit on raw term counts, so build them with a CountVectorizer
n_tokens = 1000
count_vectorizer = CountVectorizer(
    stop_words="english",
    max_features=n_tokens,
    min_df=2,
    max_df=0.95,
)
# Document-term count matrix
counts = count_vectorizer.fit_transform(data)
# Token strings, in the same order as the columns of counts
tokens = count_vectorizer.get_feature_names_out()

In [None]:
# Fit LDA with more topics
n_components = 20
random_state = 0

lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method="online",
    learning_offset=50.0,
    max_iter=5,
    random_state=random_state,
).fit(counts)

In [None]:
# Use 4 rows of subplots and normalize each topic's weights
fig, axes = plot_top_words(model=lda, feature_names=tokens, n_row=4, normalize=True)
plt.show()

In [None]:
# Fit LDA with fewer topics
n_components = 5
random_state = 0

lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method="online",
    learning_offset=50.0,
    max_iter=5,
    random_state=random_state,
).fit(counts)

In [None]:
# A single row of subplots is enough; normalize each topic's weights
fig, axes = plot_top_words(model=lda, feature_names=tokens, n_row=1, normalize=True)
plt.show()

---

### Challenge 3: Finding Similar Documents

Calculate the cosine similarity between all pairs of documents, and find the two documents whose cosine similarity is the highest. What are these documents? Do they seem similar?

---

In [None]:
# Fit the LDA model whose topic mixtures are used to compare documents
n_components = 10
random_state = 0

lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method="online",  # Use when dataset is large
    learning_offset=50.0,
    max_iter=5,
    random_state=random_state,
).fit(counts)

In [None]:
# Project every document onto the fitted topics (document-topic matrix)
topic_representation = lda.transform(counts)

In [None]:
# Pairwise cosine similarity between the topic mixtures of all documents
similarities = cosine_similarity(X=topic_representation)

In [None]:
# Double check the shape: there should be one row and one column per document
similarities.shape

In [None]:
# The diagonal of this matrix is all ones (each document compared with itself).
# We want to zero this out in order to find the highest similarities.
np.fill_diagonal(similarities, 0)

# Documents with no in-vocabulary tokens (e.g. posts left empty after
# removing headers, footers and quotes) carry no content, yet they all end up
# with the same topic mixture and would look perfectly similar to each other.
# Zero out their rows and columns so they cannot win the search either.
has_no_tokens = np.asarray(counts.sum(axis=1)).ravel() == 0
similarities[has_no_tokens, :] = 0
similarities[:, has_no_tokens] = 0

In [None]:
# Locate the largest entry: argmax gives its position in the flattened
# matrix, and np.unravel_index converts that back into (row, column).
idx1, idx2 = np.unravel_index(similarities.argmax(), similarities.shape)

In [None]:
# What is the similarity of the most similar pair of documents?
similarities[idx1, idx2]

In [None]:
# Show the text of the two most similar documents, one after the other
print(data[idx1], data[idx2], sep="\n")