# TAHLR Week 10b: Unsupervised Methods: Topic Modeling and Clustering

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 8: Unsupervised Methods: Topic Modeling and Clustering; streamlined version

In [None]:
# Imports

from glob import glob
from nltk.tokenize import sent_tokenize
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline
from natsort import natsorted

In [None]:
# Helper function for preprocessing

import string

def preprocess(text):
    """Normalize a passage: lowercase it, turn newlines into spaces, strip ASCII punctuation."""
    # A single translation table handles both substitutions in one pass:
    # newline -> space, every character of string.punctuation -> deleted.
    cleanup_table = str.maketrans("\n", " ", string.punctuation)
    return text.lower().translate(cleanup_table)


In [None]:
# Load data

PATH = '../data/texts/lat/'
files = natsorted(glob(PATH + '*.txt'))
books = ["livy_1", "livy_2", "aen_1", "aen_2", "aen_3", "aen_4", "aen_5", "aen_6", "aen_7", "aen_8", "aen_9", "aen_10", "aen_11", "aen_12"]

# NOTE(review): labels are paired with files purely by position, so the natural-sort
# order of the filenames must match the order of `books` — confirm against the data directory.
if len(files) != len(books):
    print(f"Warning: found {len(files)} files for {len(books)} book labels; zip() will silently drop the extras")

# One (book, sentence) record per sentence; the DataFrame is built once at the end.
data = []

for book, file in zip(books, files):
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the text files are UTF-8 — TODO confirm).
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    # The file handle is no longer needed while tokenizing.
    sents = sent_tokenize(text)
    data.extend((book, sent) for sent in sents)

df = pd.DataFrame(data, columns=['book', 'text'])
df['text'] = df['text'].apply(preprocess)

df.sample(2)

In [None]:
# Sanity check: (number of sentence rows, number of columns)
df.shape

In [None]:
# Build a corpus-specific stopword list: the 50 most frequent word forms.
# `words` is the flat list of every whitespace-separated token in the corpus.
words = [token for sentence in df['text'] for token in sentence.split()]
wordcounts = Counter(words)
stopwords = [entry[0] for entry in wordcounts.most_common(50)]

In [None]:
# Make tf-idf matrix

from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore the corpus-derived stopword list and cap the vocabulary at `max_features` terms.
TV = TfidfVectorizer(stop_words=stopwords, max_features=10000)
# One row per entry of df['text'] (i.e. per sentence), one column per vocabulary term.
tfidf_vectors = TV.fit_transform(df['text'])
# Term names, in the same order as the columns of `tfidf_vectors`.
vocab = TV.get_feature_names_out()

In [None]:
# (number of sentences, vocabulary size — at most the max_features cap set above)
tfidf_vectors.shape

## Blueprint: Creating a Topic Model for Paragraphs with LDA

In [None]:
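# Consider reducing the number of sentences (LDA is computationally expensive).
# NB: `tfidf_vectors` was already built from the full `df` above, so subset `df`
# *before* the tf-idf cell (or re-run that cell afterwards); otherwise the LDA input
# and the later `df["cluster"] = clusters` assignment will no longer match `df`.

# e.g.
# df = df[:50000]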

In [None]:
# Helper function for displaying topics

def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted decomposition model.

    Parameters
    ----------
    model : object with a `components_` attribute
        `components_` is iterated row by row; each row is one topic's weight vector
        over the vocabulary.
    features : sequence of str
        Term names, indexed the same way as the columns of `components_`.
    no_top_words : int, default 5
        Number of terms to print per topic. If it exceeds the vocabulary size, every
        term is printed instead of raising IndexError; values <= 0 print no terms.

    Returns
    -------
    None. For each topic a header line is printed, followed by one line per term with
    the term's share of the topic's total weight as a percentage.
    """
    n_terms = max(no_top_words, 0)
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        # Heaviest terms first; slicing (rather than indexing 0..n-1) can never run
        # past the end of the vocabulary.
        top_indices = word_vector.argsort()[::-1][:n_terms]
        print("\nTopic %02d" % topic)
        for idx in top_indices:
            print("  %s (%2.2f)" % (features[idx],
                  word_vector[idx]*100.0/total))

In [None]:
# Fit the LDA topic model on the tf-idf matrix; nb: could take a long time
# NOTE(review): scikit-learn's own LDA examples feed raw term counts (CountVectorizer)
# rather than tf-idf weights — confirm that tf-idf input is intended here.

from sklearn.decomposition import LatentDirichletAllocation

# 20 topics; a fixed random_state keeps the fitted topics the same from run to run.
lda_model = LatentDirichletAllocation(n_components = 20, random_state=42)
# W: document-topic matrix (one row per sentence, one column per topic).
W_lda_matrix = lda_model.fit_transform(tfidf_vectors)
# H: topic-term matrix (one row per topic, one column per vocabulary term).
H_lda_matrix = lda_model.components_

In [None]:
# Print the five highest-weighted terms of each LDA topic, with each term's
# percentage share of the topic's total weight.
display_topics(lda_model, TV.get_feature_names_out(), no_top_words=5)

In [None]:
# # Use pyLDAvis to visualize topics; if you can get it to work!

# # !pip install pyLDAvis
# import pyLDAvis.lda_model

# lda_display = pyLDAvis.lda_model.prepare(lda_model, tfidf_vectors,
#                             TV, sort_topics=False)
# pyLDAvis.display(lda_display)

In [None]:
# # Visualize topic "weights" with word cloud

# import matplotlib.pyplot as plt
# from wordcloud import WordCloud

# def wordcloud_topics(model, features, no_top_words=40):
#     for topic, words in enumerate(model.components_):
#         size = {}
#         largest = words.argsort()[::-1] # invert sort order
#         for i in range(0, no_top_words):
#             size[features[largest[i]]] = abs(words[largest[i]])
#         wc = WordCloud(background_color="white", max_words=100,
#                        width=960, height=540)
#         wc.generate_from_frequencies(size)
#         plt.figure(figsize=(12,12))
#         plt.imshow(wc, interpolation='bilinear')
#         plt.axis("off")
#         # if you don't want to save the word cloud image, comment out the next line
#         plt.savefig(f'topic{topic}.png')

In [None]:
# wordcloud_topics(lda_model, vocab)

## Bonus Blueprint: K-means clustering with visualization

In [None]:
# from https://medium.com/mlearning-ai/text-clustering-with-tf-idf-in-python-c94cd26a31e7

# Set up kmeans

# Number of clusters to fit.
N = 3

# Cluster the same sentence-level tf-idf matrix that was built earlier.
X = tfidf_vectors

from sklearn.cluster import KMeans

# Fixed random_state so the cluster assignment is repeatable across runs.
kmeans = KMeans(n_clusters=N, random_state=42, n_init='auto')
kmeans.fit(X)
# One integer cluster label per row of X.
clusters = kmeans.labels_

In [None]:
# Reduce dimensions with PCA

from sklearn.decomposition import PCA
pca = PCA(n_components=2, random_state=42)
# `toarray()` materializes the whole sparse tf-idf matrix as a dense array before
# PCA is fitted — this can need a lot of memory on a large corpus.
pca_vecs = pca.fit_transform(X.toarray())
# Coordinates on the first and second principal components (the plot axes below).
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]

In [None]:
# Update dataframe

# Attach each sentence's cluster label and its two PCA coordinates as new columns.
# Note: this modifies `df` in place.
df["cluster"] = clusters
df["x0"] = x0
df["x1"] = x1
df.head()


In [None]:
# Helper function, get top keywords

def get_top_keywords(n_terms, matrix=None, labels=None, terms=None):
    """Print the `n_terms` highest-weighted tf-idf terms of each cluster.

    Parameters
    ----------
    n_terms : int
        Number of terms to print per cluster; values <= 0 print an empty term line.
    matrix : sparse matrix or 2-D array, optional
        Document-term weights, one row per document. Defaults to the notebook-level `X`.
    labels : array-like of cluster labels, optional
        One label per row of `matrix`. Defaults to the notebook-level `clusters`.
    terms : sequence of str, optional
        Term names in column order. Defaults to `TV.get_feature_names_out()`.

    Returns
    -------
    None. For every cluster (in sorted label order) a header line is printed, followed
    by a comma-separated line of terms in ascending weight, so the strongest term
    comes last.
    """
    # Fall back to the notebook-level objects so the one-argument call keeps working.
    matrix = X if matrix is None else matrix
    labels = np.asarray(clusters if labels is None else labels)
    terms = TV.get_feature_names_out() if terms is None else terms

    for label in np.unique(labels):
        # Mean tf-idf vector of this cluster's rows. Computed per cluster on the
        # selected rows, so the full matrix is never converted to a dense DataFrame.
        centroid = np.asarray(matrix[labels == label].mean(axis=0)).ravel()
        # `[-0:]` would select everything, so treat a non-positive request as "no terms".
        top = np.argsort(centroid)[-n_terms:] if n_terms > 0 else []
        print('\nCluster {}'.format(label))
        print(','.join([terms[t] for t in top]))
            
# Show ten keywords for each k-means cluster.
get_top_keywords(10)

In [None]:
# Map clusters to appropriate labels

# Integer cluster id -> display name used for the plot legend.
cluster_map = {i: f'cluster_{i}' for i in range(0, N)}
# Map from the integer labels in `clusters` rather than from df['cluster'] itself:
# mapping the column onto itself turns every value into NaN when this cell is run a
# second time, because the dict keys are ints and the column already holds strings.
df['cluster'] = pd.Series(clusters, index=df.index).map(cluster_map)
print(cluster_map)

In [None]:
# Visualize clusters with Seaborn

def _label_random_books(ax, subset, n_labels, seed=2):
    """Write the `book` name next to up to `n_labels` randomly sampled rows of `subset`.

    ax: matplotlib Axes to annotate.
    subset: DataFrame with `x0`, `x1` and `book` columns.
    n_labels: number of rows to label; capped at len(subset) so a sparsely populated
        region cannot make `DataFrame.sample` raise.
    seed: `random_state` passed to `DataFrame.sample`, so the same rows are chosen on every run.
    """
    if subset.empty:
        return
    picked = subset.sample(min(n_labels, len(subset)), random_state=seed)
    for x, y, book in zip(picked['x0'], picked['x1'], picked['book']):
        ax.text(x, y, book, size=8)


fig, ax = plt.subplots(figsize=(12, 7))

sns.scatterplot(data=df, x='x0', y='x1', hue='cluster',
                hue_order=list(cluster_map.values()), ax=ax)

# Regions of the plot in which a few points get a `book` label:
# (row filter, number of points to label).
label_regions = [
    (df['x0'] > .2, 5),
    (df['x0'] < -.2, 5),
    (df['x1'] < 0, 1),
]
for region_mask, n_labels in label_regions:
    _label_random_books(ax, df[region_mask], n_labels)

# Set title and axis labels last so they are the final word on the Axes.
ax.set_title("TF-IDF + KMeans on Misc. Latin", fontdict={"fontsize": 18})
ax.set_xlabel("X0", fontdict={"fontsize": 16})
ax.set_ylabel("X1", fontdict={"fontsize": 16})

plt.show()