# TAHLR Week 10: Unsupervised Methods: Topic Modeling and Clustering

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 8: Unsupervised Methods: Topic Modeling and Clustering

In [None]:
# Get data from remote location (uncomment the two commands below and run them once)

# !mkdir -p ../data/blueprints
# !curl -LJO https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/un-general-debates/un-general-debates-blueprint.csv.gz --output-dir ../data/blueprints

In [None]:
# Imports

import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load data

file = "../data/blueprints/un-general-debates-blueprint.csv.gz"
# `debates` and `df` are two names for the same DataFrame object, so a column
# added through one name is visible through the other.
debates = df = pd.read_csv(file)
# Fixed seed so the preview shows the same two rows on every run.
df.sample(2, random_state=42)

In [None]:
# Preprocess data: split each text into paragraphs

import re

# A paragraph boundary is sentence-ending punctuation (., ? or !) followed by
# optional whitespace and a newline. The pattern is a raw string so that `\s`
# reaches the regex engine instead of being parsed as an invalid string escape.
df["paragraphs"] = df["text"].map(lambda text: re.split(r'[.?!]\s*\n', text))
df["number_of_paragraphs"] = df["paragraphs"].map(len)

In [None]:
# Visualize paragraph count

# Average paragraph count of the texts in each year, drawn as a bar chart
mean_paragraphs_by_year = debates.groupby('year').agg({'number_of_paragraphs': 'mean'})
mean_paragraphs_by_year.plot.bar()

In [None]:
# Make tf-idf matrix for the full texts

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# Keep only the purely alphabetic stop words, as a list
stopwords = list(filter(str.isalpha, stopwords))

# min_df / max_df drop very rare and very common terms from the vocabulary
tfidf_text = TfidfVectorizer(
    max_df=0.7,
    min_df=5,
    stop_words=stopwords,
)
vectors_text = tfidf_text.fit_transform(debates['text'])
vectors_text.shape

In [None]:
# Flatten the paragraphs keeping the years

# One row per non-empty paragraph, tagged with the year of the text it came from
paragraph_df = pd.DataFrame(
    [
        {"text": para, "year": yr}
        for paras, yr in zip(df["paragraphs"], df["year"])
        for para in paras
        if para
    ]
)

# Paragraph-level tf-idf matrix (same vocabulary filters as the text-level one)
tfidf_para_vectorizer = TfidfVectorizer(
    max_df=0.7,
    min_df=5,
    stop_words=stopwords,
)
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(paragraph_df["text"])
tfidf_para_vectors.shape

## Blueprint: Creating a Topic Model for Paragraphs Using NMF

In [None]:
# Decompose with NMF

from sklearn.decomposition import NMF

# Factorize the paragraph tf-idf matrix into 10 components
nmf_text_model = NMF(
    random_state=42,
    n_components=10,
)
# W is what fit_transform returns for the input rows; H is the fitted
# components_ matrix (one row per component)
W_text_matrix = nmf_text_model.fit_transform(tfidf_para_vectors)
H_text_matrix = nmf_text_model.components_

In [None]:
# Helper function for displaying topics

def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` the terms are ranked by weight and
    the top ones are printed together with their weight expressed as a
    percentage of the row's total weight.

    Args:
        model: Fitted model exposing ``components_`` (iterated row by row,
            one row of term weights per topic).
        features: Sequence of term names, indexed by the column positions of
            ``model.components_``.
        no_top_words: Maximum number of terms printed per topic. When a topic
            has fewer terms than this, all of them are printed.
    """
    # A non-positive count prints no terms
    limit = max(no_top_words, 0)
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        # argsort is ascending: reverse it for descending weight, then keep at
        # most `limit` positions so a small vocabulary cannot raise IndexError
        largest = word_vector.argsort()[::-1][:limit]
        print("\nTopic %02d" % topic)
        for term_index in largest:
            print("  %s (%2.2f)" % (features[term_index],
                  word_vector[term_index]*100.0/total))

In [None]:
# Display NMF topics

# Term names come from the vectorizer that produced the matrix NMF was fit on
nmf_feature_names = tfidf_para_vectorizer.get_feature_names_out()
display_topics(nmf_text_model, nmf_feature_names)

## Blueprint: Creating a Topic Model for Paragraphs with SVD

In [None]:
# Decompose with truncated SVD

from sklearn.decomposition import TruncatedSVD

# Same number of components and seed as the NMF model
svd_para_model = TruncatedSVD(
    random_state=42,
    n_components=10,
)
W_svd_para_matrix = svd_para_model.fit_transform(tfidf_para_vectors)
H_svd_para_matrix = svd_para_model.components_

In [None]:
# Display SVD topics

# Term names come from the vectorizer that produced the matrix SVD was fit on
svd_feature_names = tfidf_para_vectorizer.get_feature_names_out()
display_topics(svd_para_model, svd_feature_names)

## Blueprint: Creating a Topic Model for Paragraphs with LDA

In [None]:
# Check the (rows, columns) size of the paragraph table before it is reduced for LDA
paragraph_df.shape

In [None]:
# Reduce number of paragraphs (because LDA is computationally expensive)

# .copy() makes the subset an independent DataFrame, so columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
paragraph_df = paragraph_df[:50000].copy()

# Refit the tf-idf vectorizer so the matrix rows match the reduced paragraphs
tfidf_para_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5,
                                        max_df=0.7)
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(paragraph_df["text"])
tfidf_para_vectors.shape

In [None]:
# Work with count vectors

from sklearn.feature_extraction.text import CountVectorizer

# Same vocabulary filters as the tf-idf vectorizer, but producing term counts
count_para_vectorizer = CountVectorizer(
    max_df=0.7,
    min_df=5,
    stop_words=stopwords,
)
count_para_vectors = count_para_vectorizer.fit_transform(paragraph_df["text"])

In [None]:
# Decompose with LDA (lda_para_model); nb: could take a long time

from sklearn.decomposition import LatentDirichletAllocation

# LDA is fit on the count vectors, not on the tf-idf vectors
lda_para_model = LatentDirichletAllocation(
    random_state=42,
    n_components=10,
)
W_lda_para_matrix = lda_para_model.fit_transform(count_para_vectors)
H_lda_para_matrix = lda_para_model.components_

In [None]:
# Display LDA topics; the model was fit on the count vectors, so the term
# names are taken from the count vectorizer rather than the tf-idf one
display_topics(lda_para_model, count_para_vectorizer.get_feature_names_out())

In [None]:
# Use pyLDAvis to visualize topics

# !pip install pyLDAvis
import pyLDAvis.lda_model

# Prepare the visualization data from the fitted LDA model, the count matrix
# it was fit on, and the vectorizer that produced that matrix
lda_display = pyLDAvis.lda_model.prepare(
    lda_para_model,
    count_para_vectors,
    count_para_vectorizer,
    sort_topics=False,
)
pyLDAvis.display(lda_display)

In [None]:
# Visualize topic "weights" with word cloud

import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wordcloud_topics(model, features, no_top_words=40, save_images=True):
    """Draw one word cloud per topic of a fitted model.

    For each row of ``model.components_`` the highest-weighted terms are
    rendered as a word cloud in a new matplotlib figure, sized by the
    absolute value of their weight.

    Args:
        model: Fitted model exposing ``components_`` (iterated row by row,
            one row of term weights per topic).
        features: Sequence of term names, indexed by the column positions of
            ``model.components_``.
        no_top_words: Maximum number of terms per cloud. When a topic has
            fewer terms than this, all of them are used.
        save_images: When true (the default), each figure is also written to
            ``topic<N>.png`` in the current working directory.
    """
    limit = max(no_top_words, 0)
    for topic, words in enumerate(model.components_):
        # argsort is ascending: reverse it for descending weight, then keep at
        # most `limit` positions so a small vocabulary cannot raise IndexError
        largest = words.argsort()[::-1][:limit]
        # abs(): presumably because some models (e.g. SVD) can produce
        # negative weights, which cannot serve as word sizes
        size = {features[i]: abs(words[i]) for i in largest}
        wc = WordCloud(background_color="white", max_words=100,
                       width=960, height=540)
        wc.generate_from_frequencies(size)
        plt.figure(figsize=(12,12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        if save_images:
            plt.savefig(f'topic{topic}.png')

In [None]:
# Word clouds for the LDA topics, labelled with the count vectorizer's terms
lda_feature_names = count_para_vectorizer.get_feature_names_out()
wordcloud_topics(lda_para_model, lda_feature_names)

## Bonus Blueprint: K-means clustering with visualization

In [None]:
# from https://medium.com/mlearning-ai/text-clustering-with-tf-idf-in-python-c94cd26a31e7

# Set up kmeans

X = tfidf_para_vectors

from sklearn.cluster import KMeans

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about the change in 1.2-1.3). Fixing it
# keeps the clustering, and therefore the meaning of the cluster ids used
# further down, stable across library versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)
clusters = kmeans.labels_

In [None]:
# Reduce dimensions with PCA

from sklearn.decomposition import PCA

pca = PCA(
    random_state=42,
    n_components=2,
)
# nb: X.toarray() materializes the whole tf-idf matrix as a dense array
pca_vecs = pca.fit_transform(X.toarray())
# The two principal components become the plot coordinates
x0, x1 = pca_vecs[:, 0], pca_vecs[:, 1]

In [None]:
# Update dataframe

# Attach the cluster id and the two PCA coordinates to every paragraph row
new_columns = {"cluster": clusters, "x0": x0, "x1": x1}
for column_name, column_values in new_columns.items():
    paragraph_df[column_name] = column_values
paragraph_df.head()


In [None]:
# Helper function, get top keywords

def get_top_keywords(n_terms):
    """Print the top keywords for each KMeans cluster.

    Reads the notebook-level names ``X`` (tf-idf matrix), ``clusters`` (array
    with one cluster label per row of ``X``) and ``tfidf_para_vectorizer``
    (source of the term names). For every cluster label, in ascending order,
    the mean tf-idf vector of the rows in that cluster is computed and the
    ``n_terms`` terms with the highest mean are printed, comma-separated,
    lowest of them first.

    Args:
        n_terms: Number of terms to print per cluster.
    """
    terms = tfidf_para_vectorizer.get_feature_names_out() # access tf-idf terms
    for label in np.unique(clusters):
        # Average only the rows of this cluster; indexing the matrix per
        # cluster avoids building a dense copy of the whole tf-idf matrix
        centroid = np.asarray(X[clusters == label].mean(axis=0)).ravel()
        # argsort is ascending, so the last n_terms positions are the largest
        top_positions = np.argsort(centroid)[-n_terms:]
        print('\nCluster {}'.format(label))
        print(','.join(terms[t] for t in top_positions))
            
# Print the 10 top keywords of each cluster
get_top_keywords(10)

In [None]:
# Map clusters to appropriate labels

# NOTE(review): these names were presumably read off the top keywords of one
# particular run; re-check them whenever the clustering changes.
cluster_map = {0: "africa", 1: "general", 2: "china"}
# Map from the integer labels in `clusters` rather than from the column
# itself, so re-running this cell does not turn already-mapped names into NaN.
paragraph_df['cluster'] = pd.Series(clusters, index=paragraph_df.index).map(cluster_map)

In [None]:
# Visualize clusters with Seaborn

import seaborn as sns

# Explicit figure/axes pair; the labels are set on the axes before plotting
fig, ax = plt.subplots(figsize=(12, 7))

ax.set_title("TF-IDF + KMeans on UN General Debates", fontdict={"fontsize": 18})
ax.set_xlabel("X0", fontdict={"fontsize": 16})
ax.set_ylabel("X1", fontdict={"fontsize": 16})

# One point per paragraph at its PCA coordinates, coloured by cluster
sns.scatterplot(data=paragraph_df, x='x0', y='x1', hue='cluster',
                palette="viridis", ax=ax)
plt.show()