# Topic Modeling

- Using Gensim (LDA, LSI, HDP, etc)

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,mlxtend,omegaconf,gensim --conda

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.26.0

numpy    : 1.26.0
pandas   : 2.2.2
polars   : 1.4.1
mlxtend  : 0.23.1
omegaconf: not installed
gensim   : 4.3.3

conda environment: n/a



In [2]:
# Built-in library
from pathlib import Path
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")


# auto reload imports# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(500)

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

<br>

# [Gensim](https://radimrehurek.com/gensim/index.html)

- `Gensim` = **“Generate Similar”**
- Gensim is a free open-source Python library for representing documents as semantic vectors, as efficiently (computer-wise) and painlessly (human-wise) as possible.
- It's designed to process raw, unstructured digital texts (“plain text”) using unsupervised machine learning algorithms.

#### Installation

```sh
pip install --upgrade gensim

```

### Use cases

- Train large-scale NLP semantic models.
- Represent text as semantic vectors.
- Find semantically related documents.


### Core Concepts of Gensim

- `Document`: some text.
- `Corpus`: a collection of documents.
- `Vector`: a mathematically convenient representation of a document.
- `Model`: an algorithm for transforming vectors from one representation to another.

In [3]:
import gensim

In [4]:
# Demonstartion purpose only! Gensim handles large corpora by streaming documents,
# avoiding memory overload.
text_corpus: list[str] = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Create a set of frequent words
stopwords: set[str] = set("for a of the and to in".split())
print(f"{stopwords = }")

# Lowercase each document, split it by white space and filter out stopwords
texts: list[list[str]] = [
    [word for word in document.lower().split() if word not in stopwords]
    for document in text_corpus
]
console.print(f"{texts = }")

stopwords = {'to', 'and', 'the', 'of', 'a', 'for', 'in'}


In [5]:
from collections import defaultdict


# Count the word frequencies
frequency: defaultdict[str, int] = defaultdict(int)

for text in texts:
    for token in text:
        frequency[token] += 1


console.print(f"{frequency = }")

# Keep ONLY tokens that occur more than once
processed_corpus: list[list[str]] = [
    [token for token in text if frequency[token] > 1] for text in texts
]
console.print(f"{processed_corpus = }")

In [6]:
from gensim import corpora


# Create a dictionary that contains the token_id and the token_text
dictionary = corpora.Dictionary(processed_corpus)
console.print(f"{dictionary.items() = }")

#### Vector

- A mathematical way of representing documents in a multi-dimensional space.

In [7]:
# Vector: a mathematical way of representing documents in a multi-dimensional space.

# View the token ids
console.print(f"{dictionary.token2id = }")

# Vectorize a document
new_doc: str = "Human computer interaction"
new_vec: list[tuple[int, int]] = dictionary.doc2bow(new_doc.lower().split())
console.print(f"{new_vec = }")

#### Comment

```py
# Explaination of the result is shown below
new_vec = [(0, 1), (1, 1)]
```

In [8]:
new_doc_list: list[str] = new_doc.lower().split()

for idx, token in enumerate(new_doc_list):
    flag: bool = token in dictionary.values()
    count_: int = new_doc.count(token)
    print(f"{token}: ({idx}, {count_}) is in dictionary: {flag}")

human: (0, 0) is in dictionary: True
computer: (1, 1) is in dictionary: True
interaction: (2, 1) is in dictionary: False


In [9]:
# Vectorize the entire using a BoW (Bag of Words) approach. i.e. count the
# number of times a word appears in a document.
bow_corpus: list[list[tuple[int, int]]] = [
    dictionary.doc2bow(doc) for doc in processed_corpus
]
console.print(f"{bow_corpus = }")

<br>

#### Model

- It refers to `transformation` from one document representation to another.
- E.g. `TF-IDF` model transforms word frequency counts based on word rarity in the corpus. 

In [10]:
from gensim import models


# Train the model
tfidf: models.TfidfModel = models.TfidfModel(corpus=bow_corpus)

# Transform an input document
doc: list[str] = "System Minors".lower().split()
doc_bow: list[tuple[int, int]] = dictionary.doc2bow(doc)
doc_vec: list[tuple[int, float]] = tfidf[doc_bow]
console.print(f"{doc_vec = }")

#  system: 0.5898341626740045, minors: 0.8075244024440723
# [(5, 0.5898341626740045), (11, 0.8075244024440723)]
# Because system occurs a lot more than minors in the corpus, it has a lower tf-idf score.

#### Calculating Similariites

In [11]:
from gensim import similarities


# Prepare for similarity search
num_features: int = 12  # number of features (dimensions)
corpus_vector: list[tuple[int, float]] = tfidf[bow_corpus]

index = similarities.SparseMatrixSimilarity(
    corpus=corpus_vector, num_features=num_features
)

query_doc: list[str] = "System engineering".lower().split()
query_doc_bow: list[tuple[int, int]] = dictionary.doc2bow(query_doc)
query_doc_vector: list[tuple[int, float]] = tfidf[query_doc_bow]
sims: list[tuple[int, float]] = index[query_doc_vector]
console.print(f"{sims = }")

In [12]:
# Compare `text_corpus` with `sims`
pprint(text_corpus)

['Human machine interface for lab abc computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System and human system engineering testing of EPS',
 'Relation of user perceived response time to error measurement',
 'The generation of random binary unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV Widths of trees and well quasi ordering',
 'Graph minors A survey']


In [13]:
# Sort
for idx, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(f"doc {idx}: similarity={score:4f}")

doc 3: similarity=0.718481
doc 2: similarity=0.417076
doc 1: similarity=0.324487
doc 0: similarity=0.000000
doc 4: similarity=0.000000
doc 5: similarity=0.000000
doc 6: similarity=0.000000
doc 7: similarity=0.000000
doc 8: similarity=0.000000


<hr><br>

#### Loading Data Effciently

- Streaming data from disk.

In [14]:
from smart_open import open  # for transparently opening remote files


# Copied from https://radimrehurek.com/gensim/tut1.html
class MyCorpus:
    def __iter__(self):
        for line in open("https://radimrehurek.com/mycorpus.txt"):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

In [15]:
class MyCustomCorpus:
    def __init__(self, text: str = "text") -> None:
        self.text = text
        self.data: pl.LazyFrame = pl.scan_csv("../data/test.csv").select([self.text])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"

    def __iter__(self):
        for row in self.data.collect().iter_rows(named=True):
            text: str = row[self.text]
            yield dictionary.doc2bow(text.lower().split())

In [16]:
corpus_memory_friendly: MyCustomCorpus = MyCustomCorpus(text="text")

print(corpus_memory_friendly)

MyCustomCorpus()


In [17]:
from gensim.corpora import Dictionary


stopwords: set[str] = set("i is for a of the and to in on my".split())
df: pl.DataFrame = pl.read_csv("../data/test.csv")
processed_corpus: list[list[str]] = [
    line.lower().split() for line in df["text"].to_list()
]

dictionary: Dictionary = Dictionary(processed_corpus)

In [18]:
for idx, vec in enumerate(corpus_memory_friendly):
    if idx < 3:
        print(vec)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1)]
[(5, 1), (6, 2), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]
[(6, 4), (8, 1), (20, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]


In [19]:
# Text preprocessing

df: pl.DataFrame = pl.read_csv("../data/test.csv")
processed_corpus: list[list[str]] = [
    line.lower().split() for line in df["text"].to_list()
]

dictionary: Dictionary = Dictionary(processed_corpus)

# Extract the ids of the stopwords
stop_ids: list[int] = [
    dictionary.token2id[s_word] for s_word in stopwords if s_word in dictionary.token2id
]

# Extract ids of words with frequency less than 2
low_frequency_ids: list[int] = [
    tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 2
]
# Remove the ids
dictionary.filter_tokens(stop_ids + low_frequency_ids)

# Remove gaps in id sequence after words are filtered
dictionary.compactify()
print(dictionary)

Dictionary<10 unique tokens: ['messi', 'soccer', 'jesus', 'team', 'has']...>


In [20]:
print(dictionary.__len__())

10


<hr>

### Corpus Formats

- Gensim serially processes `Vector Space` corpus files, reading/writing one document at a time without loading the entire corpus into memory.

#### Save Corpus

- One of the more notable file formats is the `Market Matrix` format. To save a corpus in the Matrix Market format:

```py
# Create a toy corpus of 2 documents, as a plain Python list
# make one document empty, for the heck of it
corpus: list[list[tuple[int, float]]] = [[(1, 0.5)], []]  

# Save the corpus in Matrix Market format
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
```

- Other formats include:
  - `Joachim’s SVMlight` format
  - `Blei’s LDA-C` format
  - `GibbsLDA++` format.

```py
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
```

#### Load The Corpus

- Conversely, to load a corpus iterator from a Matrix Market file:

```py
corpus = corpora.MmCorpus('/tmp/corpus.mm')
```

### Compatibility With NumPy And SciPy

- Gensim also contains efficient utility functions to help converting from/to numpy matrices.

```py
import gensim
import numpy as np
import gensim

# random matrix as an example
numpy_matrix: np.ndarray = np.random.randint(10, size=[5, 2])  
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)

# convert from gensim corpus to numpy matrix
numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
```
<br>

#### From/to scipy.sparse matrices


```py
import scipy.sparse

# random sparse matrix as an example
scipy_sparse_matrix = scipy.sparse.random(5, 2)  # random sparse matrix as example
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
```


### Loading And Saving Models

In [21]:
from gensim.interfaces import TransformedCorpus


doc_bow: list[list[tuple[int, int]]] = [
    dictionary.doc2bow(text) for text in processed_corpus
]
tfidf_model: models.TfidfModel = models.TfidfModel(dictionary=dictionary)
corpus_tfidf: TransformedCorpus = tfidf_model[doc_bow]

lsi_model: models.LsiModel = models.LsiModel(
    corpus=corpus_tfidf, id2word=dictionary, num_topics=2
)
corpus_lsi: TransformedCorpus = lsi_model[corpus_tfidf]

# lsi_model.show_topics()
console.print(lsi_model.print_topics(num_topics=2))

#### Comment

```text
(0, '0.600*"messi" + 0.557*"soccer" + 0.339*"jesus" + 0.290*"team" + 0.274*"has" + 0.234*"religion" + 0.000*"religious" + 0.000*"are" + 0.000*"prayer" + 0.000*"together."')

- This means that for topic 0, `messi`, `soccer`, `jesus`, `team`, `has`, `religion` are all related words and contribute the most to the topic.
```

In [24]:
for doc, as_text in zip(corpus_lsi, processed_corpus):
    print(f"{doc} | {as_text}")

[(0, 0.7975568794650282)] | ['messi', 'is', 'the', 'best', 'soccer', 'player', 'in', 'the', 'world.']
[(0, 0.5700679546502856)] | ['the', 'soccer', 'field', 'became', 'a', 'sacred', 'ground', 'for', 'the', 'local', 'community', 'and', 'a', 'way', 'to', 'glorify', 'jesus', 'christ.']
[(0, 0.28980153629729605)] | ['a', 'divine', 'goal', 'changed', 'the', 'fate', 'of', 'the', 'underdog', 'team', 'thanks', 'to', 'the', 'great', 'passion', 'of', 'the', 'players.']
[(0, 0.7975568794650282)] | ['soccer', 'players', 'prayed', 'together', 'before', 'a', 'crucial', 'match.', 'e.g.', 'messi']
[(0, 0.5782949666969803)] | ['soccer', 'has', 'a', 'global', 'community.', 'the', 'team', 'was', 'awarded', 'a', 'medal.']
[(0, 0.3596260221908575)] | ['religion', 'has', 'been', 'a', 'guiding', 'force', 'in', 'my', 'life,', 'helping', 'me', 'make', 'important', 'decisions.', 'thank', 'you', 'lord']
[(1, 0.8660254037844384)] | ['prayer', 'and', 'meditation', 'are', 'essential', 'parts', 'of', 'my', 'religiou

<br>

#### Comment

```text
Topic 0: Probably about soccer.
(0, 0.7975568794650282)] | ['messi', 'is', 'the', 'best', 'soccer', 'player', 'in', 'the', 'world.']
[(0, 0.5700679546502856)] | ['the', 'soccer', 'field', 'became', 'a', 'sacred', 'ground', 'for', 'the', 'local', 'community', 'and', 'a', 'way', 'to', 'glorify', 'jesus', 'christ.']

# Topic 1: Probably about prayer/religious practice.
[(1, 0.8660254037844384)] | ['prayer', 'and', 'meditation', 'are', 'essential', 'parts', 'of', 'my', 'religious', 'practice.']
[(1, 0.7071067811865475)] | ['prayer', 'is', 'the', 'key.', 'do', 'not', 'forget', 'to', 'pray', 'together.']
```

<br><br>


#### Save The Model



In [25]:
# Save LSI model
lsi_model.save("../models/lsi_model.lsi")

In [27]:
# Load the model
loaded_lsi_model = models.LsiModel.load("../models/lsi_model.lsi")
console.print(loaded_lsi_model.print_topics(num_topics=2))

In [None]:
processed_corpus

In [None]:
models.LsiModel??

In [None]:
for vec in corpus_memory_friendly:
    print(vec)
    break

### Load Data

In [None]:
pl.Config.set_fmt_str_lengths(20)
pl.set_random_seed(seed=123)


fp: str = "../data/articles_final_df.parquet"
articles_df: pl.DataFrame = pl.read_parquet(fp)
print(f"{articles_df.shape = }")
articles_df.head(2)

In [None]:
df: pl.DataFrame = pl.DataFrame(
    {
        "id": np.arange(0, 10),
        "text": [
            "Messi is the best soccer player in the world.",
            "The soccer field became a sacred ground for the local community and a way to glorify Jesus Christ.",
            "A divine goal changed the fate of the underdog team thanks to the great passion of the players.",
            "Soccer players prayed together before a crucial match. e.g. Messi",
            "Soccer has a global community. The team was awarded a medal.",
            "Religion has been a guiding force in my life, helping me make important decisions. Thank you Lord",
            "Prayer and meditation are essential parts of my religious practice.",
            "Prayer is the key. Do not forget to pray together.",
            "Every tongue will shout that Jesus is the Lord. Religion is the key to success.",
            "Religious holidays (religion) are times of joy and reflection, bringing families closer together.",
        ],
    }
)

df.write_csv("../data/test.csv")