In [10]:
from bertopic import BERTopic
import os
from scipy.cluster import hierarchy as sch

# add .. to the path so we can import from the parent directory
import sys
sys.path.append("..")

from modeling.bertopic_models import get_texts, EMBEDDING_MODELS


# set up the config variables
CORPUS_DIR = os.path.join("../corpora/UN General Debate Corpus/TXT")
MODELS_DIR = "../models"

# load in the corpus
all_texts = get_texts(CORPUS_DIR)
texts = [text[0] for text in all_texts]

In [30]:
# load in the pretrained models
TOPIC_MODELS = {}

for name, embedding_model in EMBEDDING_MODELS.items():
    saved_model_path = os.path.join(MODELS_DIR, name)
    if not os.path.exists(saved_model_path):
        print(f"Skipping model '{name}' because it does not exist")
        continue
    else:
        print(f"Loading model {name}")
    TOPIC_MODELS[name] = BERTopic.load(saved_model_path)



Loading model all-MiniLM-L6-v2
Loading model all-mpnet-base-v2
Loading model distilbert
Loading model all-MiniLM-L12-v2




Loading model roberta


In [31]:
t = TOPIC_MODELS["all-mpnet-base-v2"]

In [28]:
# get hierarchical topics for different linkage methods

linkages = ["complete", "average", "centroid", "median"]
linkage_functions = [lambda x: sch.linkage(x, method=linkage, optimal_ordering=True) for linkage in linkages]
hierarchical_topics = {linkage: t.hierarchical_topics(texts, linkage_function=linkage_function) for linkage, linkage_function in zip(linkages, linkage_functions)}


100%|██████████| 201/201 [00:06<00:00, 32.83it/s]
100%|██████████| 201/201 [00:04<00:00, 44.25it/s]
100%|██████████| 201/201 [00:04<00:00, 43.09it/s]
100%|██████████| 201/201 [00:04<00:00, 42.29it/s]


In [33]:
# visualize the hierarchy for complete linkage
t.visualize_hierarchy(hierarchical_topics=hierarchical_topics['complete'], orientation="bottom")

In [42]:
# analyze the topic tree for complete linkage
tree_complete = t.get_topic_tree(hierarchical_topics['complete'], tight_layout=True)
print(tree_complete)

.
├─■──azerbaijan_armenia_genocide_humanitarian_armenians ── Topic: 47
└─of_the_and_to_in
  ├─the_of_and_to_in
  │ ├─■──eritrea_ethiopia_sudan_eritreans_eritrean ── Topic: 176
  │ └─that_and_to_the_of
  │   ├─that_and_to_the_of
  │   │ ├─that_and_to_the_of
  │   │ │ ├─■──timor_timorese_asean_indonesia_myanmar ── Topic: 180
  │   │ │ └─the_of_and_to_in
  │   │ │   ├─the_united_of_to_states
  │   │ │   │ ├─■──abkhazian_tbilisi_abkhazia_georgians_nato ── Topic: 156
  │   │ │   │ └─the_of_and_to_in
  │   │ │   │   ├─and_to_the_in_of
  │   │ │   │   │ ├─the_of_and_to_in
  │   │ │   │   │ │ ├─and_to_the_in_of
  │   │ │   │   │ │ │ ├─and_to_the_in_of
  │   │ │   │   │ │ │ │ ├─the_of_and_to_in
  │   │ │   │   │ │ │ │ │ ├─■──korea_peaceful_sovereignty_reunification_kim ── Topic: 42
  │   │ │   │   │ │ │ │ │ └─and_in_to_the_of
  │   │ │   │   │ │ │ │ │   ├─and_in_to_the_of
  │   │ │   │   │ │ │ │ │   │ ├─the_of_and_to_in
  │   │ │   │   │ │ │ │ │   │ │ ├─and_in_to_the_of
  │   │ │   │   │ │ │ │ 

In [37]:
# visualize the hierarchy for average linkage
t.visualize_hierarchy(hierarchical_topics=hierarchical_topics['average'], orientation="bottom")

In [43]:
# analyze the topic tree for average linkage
tree_average = t.get_topic_tree(hierarchical_topics['average'], tight_layout=True)
print(tree_average)

.
├─■──azerbaijan_armenia_genocide_humanitarian_armenians ── Topic: 47
└─of_the_and_to_in
  ├─the_of_and_to_in
  │ ├─■──eritrea_ethiopia_sudan_eritreans_eritrean ── Topic: 176
  │ └─that_and_to_the_of
  │   ├─that_and_to_the_of
  │   │ ├─that_and_to_the_of
  │   │ │ ├─■──timor_timorese_asean_indonesia_myanmar ── Topic: 180
  │   │ │ └─the_of_and_to_in
  │   │ │   ├─the_united_of_to_states
  │   │ │   │ ├─■──abkhazian_tbilisi_abkhazia_georgians_nato ── Topic: 156
  │   │ │   │ └─the_of_and_to_in
  │   │ │   │   ├─and_to_the_in_of
  │   │ │   │   │ ├─the_of_and_to_in
  │   │ │   │   │ │ ├─and_to_the_in_of
  │   │ │   │   │ │ │ ├─and_to_the_in_of
  │   │ │   │   │ │ │ │ ├─the_of_and_to_in
  │   │ │   │   │ │ │ │ │ ├─■──korea_peaceful_sovereignty_reunification_kim ── Topic: 42
  │   │ │   │   │ │ │ │ │ └─and_in_to_the_of
  │   │ │   │   │ │ │ │ │   ├─and_in_to_the_of
  │   │ │   │   │ │ │ │ │   │ ├─the_of_and_to_in
  │   │ │   │   │ │ │ │ │   │ │ ├─and_in_to_the_of
  │   │ │   │   │ │ │ │ 