In [2]:
%pip install wordcloud


Collecting wordcloud
  Using cached wordcloud-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached wordcloud-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (168 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import re

In [None]:
# Preprocess the sonnet dataset
def preprocess_text(text):
    """Tokenize a string into lowercase, letters-only words.

    Every character that is neither an ASCII letter nor whitespace is
    deleted outright (not replaced by a space), so "summer's" becomes
    "summers". What remains is lower-cased and split on runs of whitespace.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of word strings; empty when no letters remain.
    """
    # Strip first, lowercase second: the filter must see the original characters.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower().split()

# Load the sonnet dataset (one sonnet per line).
# The encoding is pinned so the read does not depend on the platform default.
with open("sonnets.txt", "r", encoding="utf-8") as f:
    sonnets = f.readlines()

# Preprocess the dataset: documents[d] is the list of word tokens of line d.
# A line with no letters yields an empty list and is kept, so documents stays
# aligned with sonnets.
documents = [preprocess_text(sonnet) for sonnet in sonnets]

# Build the vocabulary. It is sorted because set iteration order for strings
# changes between interpreter runs (hash randomisation), which would otherwise
# give every run a different word <-> index mapping.
vocab = sorted(set(word for doc in documents for word in doc))
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Convert documents to word indices (DOCD, shape: documents x longest document).
# Documents have different lengths, so np.array() on the nested lists would be
# ragged: recent NumPy raises ValueError, older NumPy builds a 1-D object array
# that cannot be indexed as DOCD[d, i]. Build a rectangular integer matrix and
# fill the unused tail of each row with PAD_INDEX instead.
PAD_INDEX = -1  # never a valid word index
max_doc_len = max((len(doc) for doc in documents), default=0)
DOCD = np.full((len(documents), max_doc_len), PAD_INDEX, dtype=int)
for d, doc in enumerate(documents):
    # Only the first len(doc) columns of row d hold real word indices.
    DOCD[d, :len(doc)] = [word_to_index[word] for word in doc]

# Set parameters
K = 6  # Number of topics
N = len(documents)  # Number of documents (154 for sonnets)
W = len(vocab)  # Number of words in the vocabulary
# Max document length; default=0 keeps an empty corpus from raising ValueError.
DLMAX = max((len(doc) for doc in documents), default=0)

alpha = 5 * np.ones(K)  # Dirichlet prior for document-topic distribution
beta = 2 * np.ones(W)  # Dirichlet prior for topic-word distribution

# Seeded generator so the sampled topics are reproducible across kernel restarts.
SEED = 0
rng = np.random.default_rng(SEED)

# Initialize matrices
# Topics are numbered 0..K-1, so 0 cannot also mean "no topic yet". A word
# sampled into topic 0 would then never have its counts removed on later sweeps
# and topic 0 would grow without bound. Use a sentinel outside the topic range.
UNASSIGNED = -1
Z = np.full((N, DLMAX), UNASSIGNED, dtype=int)  # Topic assignment of each word in each document
A = np.tile(alpha, (N, 1))  # Prior + document-topic counts (N x K)
B = np.tile(beta, (K, 1))  # Prior + topic-word counts (K x W)
BSUM = np.sum(B, axis=1)  # Row sums of B, one per topic (K,)

# Perform Gibbs sampling
iterations = 1000
for T in range(iterations):
    for d in range(N):
        # Row d of the index matrix; only its first len(documents[d]) entries
        # are read, so anything stored past the document's length is ignored.
        doc_words = DOCD[d]
        for i in range(len(documents[d])):
            w = doc_words[i]  # Vocabulary index of the i-th word of document d
            zi = Z[d, i]  # Current topic assignment of that word

            # Take the word out of the counts before resampling it. This is
            # skipped only on the first sweep, when it has no topic yet.
            if zi != UNASSIGNED:
                A[d, zi] -= 1  # Decrease count of the old topic for document d
                B[zi, w] -= 1  # Decrease count of the word for the old topic
                BSUM[zi] -= 1  # Decrease the sum of word counts for the old topic

            # Unnormalized conditional over topics:
            # (doc-topic weight) * (topic-word weight / topic total).
            dst = A[d, :] * (B[:, w] / BSUM)

            # Sample a new topic from the normalized distribution.
            new_zi = rng.choice(K, p=dst / dst.sum())

            # Record the assignment and put the word back into the counts.
            Z[d, i] = new_zi
            A[d, new_zi] += 1  # Increase count for the new topic in document d
            B[new_zi, w] += 1  # Increase count for the word in the new topic
            BSUM[new_zi] += 1  # Increase the sum of word counts for the new topic

# Render one word cloud per topic, sized by that topic's row of B
# (prior plus sampled topic-word counts).
for topic in range(K):
    # Select the figure numbered after the topic and wipe anything drawn on it.
    plt.figure(topic)
    plt.clf()

    # Pair every vocabulary word with its weight under this topic.
    topic_weights = dict(zip(vocab, B[topic, :]))
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
    ).generate_from_frequencies(topic_weights)

    plt.imshow(cloud, interpolation='bilinear')
    plt.title(f"Topic {topic+1}")  # Titles are 1-based for readers
    plt.axis('off')
    plt.show()