<a href="https://colab.research.google.com/github/cristianmejia00/clustering/blob/main/Topic_Models_using_BERTopic_HEATMAPS_v1_20241018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heatmap for Topic Modeling with BERTopic


# Requirements

## Packages installation and initialization

In [1]:
!pip install bertopic[visualization]

Collecting bertopic[visualization]
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
[0mCollecting hdbscan>=0.8.29 (from bertopic[visualization])
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic[visualization])
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic[visualization])
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━

In [2]:
import pandas as pd
import time
import math
from datetime import date
import uuid
import re
import os
import json
import pickle
from itertools import compress
from bertopic import BERTopic
from umap import UMAP
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# Change to the name of the folder in your Google Drive
root_folder_name = 'Bibliometrics_Drive'
# Absolute path below the Drive mount point ('/content/drive'), so every read and
# write keeps working even if the working directory changes (e.g. after `%cd`).
ROOT_FOLDER_PATH = f"/content/drive/MyDrive/{root_folder_name}"

## Connect your Google Drive

In [4]:
# Mount Google Drive at /content/drive so the project folders can be read and
# written through the local filesystem. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Serialize an object to disk with pickle
def save_object_as_pickle(obj, filename):
  """
  Write `obj` to `filename` in pickle format.

  The target is opened in binary write mode, so an existing file at that
  path is overwritten.

  Args:
      obj: The object to serialize.
      filename: Path of the pickle file to create.
  """
  with open(filename, "wb") as pickle_file:
    pickle.dump(obj, pickle_file)



In [6]:
# Read back an object stored in a pickle file
def load_pickle(path):
    """Return the object deserialized from the pickle file at `path`.

    The file is opened in binary read mode and closed before returning.
    Unpickling can execute code embedded in the file, so only load files
    that come from a trusted source.
    """
    with open(path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


In [7]:
def save_heatmap_settings_as_json(heatmap_settings, filename="heatmap_settings.json"):
  """Write the heatmap settings to disk as JSON indented by four spaces.

  An existing file at `filename` is overwritten.

  Args:
      heatmap_settings: Settings dictionary; it must be JSON-serializable.
      filename: Destination path of the JSON file. Defaults to
          "heatmap_settings.json", resolved against the working directory.
  """
  with open(filename, "w") as json_file:
    json.dump(heatmap_settings, json_file, indent=4)



---



## PART 3: Merging Topic Models

# 🔴 Input files and options



In [75]:
# Settings of one heatmap analysis. The whole dictionary is written to JSON by a
# later cell, so every value must be JSON-serializable and key order is preserved.
heatmap_settings = {
    # Descriptive information about this run.
    'metadata': {
      # Name of the output folder created under ROOT_FOLDER_PATH; also part of the
      # settings file name.
      'heatmap_analysis_id': 'H004',
      # Human-readable name; part of the settings file name.
      'heatmap_name': 'PIK_RIKEN_UTOKYO',
      # The remaining metadata fields are not read by the cells visible in this notebook.
      'date': '2024-11-02',
      'created_by': 'cristianmejia00@gmail.com',
      'notes': '',
      'input_directory': '',
      'output_directory': ''
    },
    # Options shared by all inputs.
    'global': {
                # Clusters with fewer documents than this are dropped before modelling.
                'min_cluster_size': 10,
                # Random state given to UMAP when computing the 2-D coordinates.
                'seed': 100,
                # Model name passed to SentenceTransformer.
                'transformer_model': 'all-MiniLM-L6-v2',
                # Not read by the cells visible in this notebook.
                'sankey_threshold': 0.8
              },
    # One entry per topic model to compare. For each entry:
    #   project_folder_name / analysis_folder_name / level_folder_name
    #       -> folder (under ROOT_FOLDER_PATH) holding article_report.csv
    #   project_folder_name / embeddings_folder_name
    #       -> folder (under ROOT_FOLDER_PATH) holding embeddings.pck and corpus.csv
    #   display_name   -> prefix of the unified cluster label "<display_name>-<cluster>"
    #   cluster_column -> column of article_report.csv that holds the cluster id
    #   heatmap_display_order, sankey_display_order, color
    #       -> not read by the cells visible in this notebook
    'inputs': [
        {
            'project_folder_name': 'Q281_pik',
            'analysis_folder_name': 'a01_tm__f01_e01__km01',
            'level_folder_name': 'level0',
            'embeddings_folder_name': 'f01/e01',
            'display_name': 'PIK',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 0,
            'sankey_display_order': 0,
            'color': "#E9571F"
        },
        {
            'project_folder_name': 'Q282c_riken_bio',
            'analysis_folder_name': 'a01_tm__f01_e01__km01',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level0',
            'display_name': 'RIKEN.Bio',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 1,
            'sankey_display_order': 2,
            'color': '#4C962F'
        },
        {
            'project_folder_name': 'Q282a_riken_sus',
            'analysis_folder_name': 'a01_tm__f01_e01__km01',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level0',
            'display_name': 'RIKEN.Sust',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 2,
            'sankey_display_order': 2,
            'color': '#150A9A'
        },
        {
            'project_folder_name': 'Q282b_riken_com',
            'analysis_folder_name': 'a01_tm__f01_e01__km01',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level0',
            'display_name': 'RIKEN.Com',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 3,
            'sankey_display_order': 2,
            'color': '#808080'
        },
        {
            'project_folder_name': 'Q312_utokyo',
            'analysis_folder_name': 'a01_tm__f01_e01__km01',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level0',
            'display_name': 'UTokyo',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 4,
            'sankey_display_order': 1,
            'color': '#F2BA05'
        }#,
        # Disabled input; to enable it, uncomment this entry and the comma after the
        # closing brace above.
        # {
        #     'project_folder_name': 'Q308_bio_plant',
        #     'analysis_folder_name': 'a01_tm__f01_e01__km01',
        #     'embeddings_folder_name': 'f01/e01',
        #     'level_folder_name': 'level0',
        #     'display_name': 'Plant',
        #     'cluster_column': 'Cluster Code',
        #     'heatmap_display_order': 4,
        #     'sankey_display_order': 1,
        #     'color': '#66FF00'
        # }
      ]
}

In [76]:
# Output folder of this analysis: <ROOT_FOLDER_PATH>/<heatmap_analysis_id>
analysis_folder = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}'

# Create the folder in pure Python instead of `!mkdir $analysis_folder`: the path is
# never interpolated into a shell command, missing parent folders are created too,
# and an already existing folder is not an error.
os.makedirs(analysis_folder, exist_ok=True)


# Save settings
save_heatmap_settings_as_json(
    heatmap_settings,
    filename=f'{analysis_folder}/heatmap_settings_{heatmap_settings["metadata"]["heatmap_analysis_id"]}_{heatmap_settings["metadata"]["heatmap_name"]}.json',
)

In [48]:
# Read the article-level cluster assignments of every input topic model
heatmap_input_dfs = []
for tm in heatmap_settings['inputs']:
  document_path = f'{ROOT_FOLDER_PATH}/{tm["project_folder_name"]}/{tm["analysis_folder_name"]}/{tm["level_folder_name"]}/article_report.csv'
  print(document_path)
  cluster_column = tm['cluster_column']
  # Each dataset can use a different clustering result (e.g. X_C, level0, level1), so the
  # cluster id is folded into one common `cluster` column before concatenation.
  input_df = (
      pd.read_csv(document_path, usecols=['ID', 'uuid', cluster_column])
      .assign(display_name=tm['display_name'])
      .assign(cluster=lambda frame: frame['display_name'] + "-" + frame[cluster_column].astype(str))
      .rename(columns={'ID': 'UT'})
      .drop(columns=[cluster_column])
  )
  heatmap_input_dfs.append(input_df)

drive/MyDrive/Bibliometrics_Drive/Q281_pik/a01_tm__f01_e01__km01/level0/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q282c_riken_bio/a01_tm__f01_e01__km01/level0/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q282a_riken_sus/a01_tm__f01_e01__km01/level0/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q282b_riken_com/a01_tm__f01_e01__km01/level0/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q312_utokyo/a01_tm__f01_e01__km01/level0/article_report.csv


In [49]:
# Stack the per-dataset tables into one frame with a fresh 0..n-1 index
document_info = pd.concat(heatmap_input_dfs, ignore_index=True)
print(len(document_info))
document_info.head()

24590


Unnamed: 0,UT,uuid,display_name,cluster
0,WOS:000071415800010,4bf5613f-f5fb-4a9f-8f5c-466f12e33461,PIK,PIK-20
1,WOS:000071604200043,0d733506-e76b-4a01-9408-411d8714c8b2,PIK,PIK-9
2,WOS:000071740800015,d2c2a60c-e484-42b3-8839-f8fdb7d8dafd,PIK,PIK-29
3,WOS:000072198600007,78239207-c2b0-4ec2-935e-feb73f5f2884,PIK,PIK-27
4,WOS:000072339800002,c68a089b-5260-42ec-8623-03a3cd829f83,PIK,PIK-14




---



## PART 5: Heatmap

In [52]:
# For firms we know, simply get the embeddings back.
embeddings_list = []
corpus_list = []
for tm in heatmap_settings['inputs']:
  print(f"=================Loading: {tm['project_folder_name']}")
  embeddings_folder = f"{ROOT_FOLDER_PATH}/{tm['project_folder_name']}/{tm['embeddings_folder_name']}"
  # NOTE: unpickling can execute code stored in the file; only load trusted embeddings.
  embeddings = load_pickle(f"{embeddings_folder}/embeddings.pck")
  # The pickle may hold either the embeddings themselves or a dict that keeps them
  # under the 'embeddings' key (isinstance also accepts dict subclasses).
  if isinstance(embeddings, dict):
    print('Dict type found')
    embeddings = embeddings['embeddings']
  corpus_tmp = pd.read_csv(f"{embeddings_folder}/corpus.csv")

  # Embedding rows are attached to corpus rows purely by position in a later cell.
  # A per-project mismatch could cancel out across projects and misalign every
  # following row without any error, so fail here instead.
  if len(embeddings) != len(corpus_tmp):
    raise ValueError(
        f"{tm['project_folder_name']}: {len(embeddings)} embeddings but "
        f"{len(corpus_tmp)} corpus rows in {embeddings_folder}"
    )

  embeddings_list.append(embeddings)
  corpus_list.append(corpus_tmp)

Dict type found


In [53]:
# Stack all projects: one embeddings matrix and one corpus table, in the same project order
embeddings_uploaded = np.vstack(embeddings_list)
corpus_uploaded = pd.concat(corpus_list, ignore_index=True)

In [54]:
embeddings_uploaded.shape

(24730, 384)

In [55]:
print(len(embeddings_uploaded))
print(len(corpus_uploaded))
print(len(document_info))

24730
24730
24590


In [56]:
# prompt: add `embeddings_uploaded` as a column to `corpus_uploaded`
# One matrix row per corpus row, matched by position
corpus_uploaded['embeddings'] = [vector for vector in embeddings_uploaded]

In [57]:
# prompt: remove  rows of corpus_uploaded where UT is duplicated

# Keep only the first row seen for each 'UT'; later repeats are discarded
corpus_uploaded = corpus_uploaded.loc[~corpus_uploaded.duplicated(subset=['UT'], keep='first')]

In [58]:
# prompt: merge `corpus_uploaded` and `document_info` by column UT. The merged data frame has as many rows and same sorting as corpus_uploaded. The merged data frame is named `full_corpus`.
# NOTE: the left join is driven by `document_info`, so `full_corpus` has one row per
# `document_info` row; documents with no match in `corpus_uploaded` get NaN in `text`
# and `embeddings`.
# validate='many_to_one' makes pandas raise a MergeError if a UT occurs more than once
# in `corpus_uploaded`, which would otherwise silently multiply rows.
full_corpus = pd.merge(
    document_info[['UT', 'uuid', 'cluster']],
    corpus_uploaded[['UT', 'text', 'embeddings']],
    on='UT',
    how='left',
    validate='many_to_one',
)

In [59]:
len(full_corpus)

24590

In [60]:
len(document_info)

24590

In [61]:
# prompt: Remove all rows in full_corpus where the size of column `embeddings` do not match 384
# The expected vector length is read from the stacked embeddings matrix instead of being
# hard-coded, so the filter does not silently drop every row when `transformer_model`
# (and with it the embedding size) changes. Rows without an embedding (NaN left by the
# merge) are neither a list nor an array and are removed.
embedding_dim = embeddings_uploaded.shape[1]
full_corpus = full_corpus[
    full_corpus['embeddings'].apply(
        lambda x: isinstance(x, (list, np.ndarray)) and len(x) == embedding_dim
    )
]

In [62]:
len(full_corpus)

24590

In [63]:
# Remodel the topic model
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sentence_transformers import SentenceTransformer

In [64]:
# Documents per cluster, keeping only the clusters that reach the configured minimum size
cluster_idx_mapping = (
    full_corpus['cluster']
    .value_counts()
    .loc[lambda counts: counts >= heatmap_settings['global']['min_cluster_size']]
)
cluster_idx_mapping

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
RIKEN.Sust-1,374
PIK-1,318
PIK-2,317
UTokyo-0,279
PIK-3,269
...,...
RIKEN.Sust-30,13
RIKEN.Bio-20,12
RIKEN.Bio-21,11
RIKEN.Bio-22,11


In [65]:
# Keep only the documents whose cluster survived the minimum-size filter
full_corpus = full_corpus.loc[full_corpus['cluster'].isin(cluster_idx_mapping.index)]

In [66]:
len(full_corpus)

24590

In [67]:
# Form the embeddings matrix: stack the per-document vectors row by row
my_embeddings = np.vstack(full_corpus['embeddings'].to_list())

In [68]:
# Texts and their cluster labels, both taken from the same rows of full_corpus
docs = full_corpus['text']
cluster_list = full_corpus['cluster']

In [69]:
# Integer position of each document's cluster within cluster_idx_mapping's index
cluster_position = {name: position for position, name in enumerate(cluster_idx_mapping.index)}
idx_cluster = [cluster_position[name] for name in cluster_list]

In [70]:
len(idx_cluster) == len(docs) == len(my_embeddings)

True

# 🟢🟢

In [71]:
# "Empty" dimensionality-reduction and clustering models
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()

# Sentence-transformer named in the settings, and the c-TF-IDF transformer
embedding_model = SentenceTransformer(heatmap_settings["global"]["transformer_model"])
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# BERTopic configured so that fitting does not actually perform any clustering
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=empty_dimensionality_model,
    hdbscan_model=empty_cluster_model,
    ctfidf_model=ctfidf_model,
)

In [77]:
# Fit on the precomputed embeddings; `idx_cluster` (one cluster position per document)
# is passed as `y`.
# NOTE(review): together with the "empty" reduction/clustering models this presumably
# makes the topics follow the given clusters — confirm against BERTopic's manual topic
# modeling documentation.
topics, probs = topic_model.fit_transform(docs, my_embeddings, y=idx_cluster)

In [78]:
tm_summary = topic_model.get_topic_info()
tm_summary

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,374,0_catalyst_catalyzed_scandium_alkenes,"[catalyst, catalyzed, scandium, alkenes, catal...",[scandium-catalyzed syndiospecific polymerizat...
1,1,318,1_oscillators_synchronization_delay_chimera,"[oscillators, synchronization, delay, chimera,...",[effect fractional derivatives amplitude chime...
2,2,317,2_vegetation_amazon_npp_terrestrial,"[vegetation, amazon, npp, terrestrial, albedo,...",[climate extreme versus carbon extreme: respon...
3,3,279,3_magnetic_spin_hall_magnetization,"[magnetic, spin, hall, magnetization, ferromag...",[fabrication single-crystalline yfeo<sub>3</su...
4,4,269,4_hydrological_river_runoff_catchment,"[hydrological, river, runoff, catchment, basin...",[propagation forcing model uncertainties hydro...
...,...,...,...,...,...
231,231,13,231_sg1_inkt_endothelin_traj18,"[sg1, inkt, endothelin, traj18, granulosum, st...",[super-sensitive auxin-inducible degron engine...
232,232,12,232_gibberellin_atgid1a_atgid1c_ga,"[gibberellin, atgid1a, atgid1c, ga, atgid1b, s...",[overview gibberellin metabolism enzyme genes ...
233,233,11,233_petal_erectile_abscission_aquatica,"[petal, erectile, abscission, aquatica, pub4, ...","[plant u-box protein, pub4, regulates asymmetr..."
234,234,11,234_1928_nakase_komagata_yana,"[1928, nakase, komagata, yana, banno, zsolt, f...","[kockovaella nakase, banno & y. yamada (1991) ..."


In [79]:
# Document information, including the topic assignment, reduced to one row per
# cluster (the first one found) to pair each cluster with its topic name
test = (
    topic_model.get_document_info(docs, df=full_corpus)
    .loc[:, ['cluster', 'Name']]
    .drop_duplicates(subset=['cluster'], keep='first')
)
print(test.shape)
test.head(10)

(236, 2)


Unnamed: 0,cluster,Name
0,PIK-20,61_monsoon_zonal_arctic_indian
1,PIK-9,15_glacial_cave_kyr_holocene
2,PIK-29,126_nan_editorial_crutzen_things
3,PIK-27,124_extinction_permafrost_sahara_biosphere
4,PIK-14,29_forest_forests_stand_bison
5,PIK-2,2_vegetation_amazon_npp_terrestrial
6,PIK-17,42_sdgs_nexus_sustainability_biodiversity
8,PIK-3,4_hydrological_river_runoff_catchment
10,PIK-4,5_paris_mitigation_emissions_ndcs
11,PIK-13,26_coastal_precipitation_downscaling_cclm




---



In [80]:
# Default
# Visualize topic similarity using heatmap (self similarity)
# `hm` is kept in a variable because later cells read the matrix from hm.data[0]['z'].
hm = topic_model.visualize_heatmap()
#hm.write_html(f"{ROOT_FOLDER_PATH}/heatmap_updated.html")
hm

In [None]:
#pd.DataFrame(hm.data[0]['z'], columns=hm.data[0]['x']).to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)



---



## Coordinates

In [81]:
import numpy as np
import umap

def reduce_dimensionality(data, n_components=2, random_state=None):
    """Project `data` into a low-dimensional space with UMAP.

    Args:
        data: Input accepted by `umap.UMAP.fit_transform`.
        n_components: Number of output dimensions. Defaults to 2.
        random_state: Seed given to UMAP. When None (the default), the seed
            configured in `heatmap_settings['global']['seed']` is used, which
            keeps existing single-argument calls unchanged.

    Returns:
        The result of `fit_transform` on `data`.
    """
    if random_state is None:
        # Fall back to the notebook-wide seed only when the caller did not choose one
        random_state = heatmap_settings['global']['seed']

    # Create a UMAP object with the desired settings
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)

    # Perform dimensionality reduction
    return reducer.fit_transform(data)

In [82]:
# Reduce dimensionality using UMAP
# The input is the heatmap's matrix (hm.data[0]['z']), not the document embeddings.
reduced_data = reduce_dimensionality(hm.data[0]['z'])
# Print the shape of the reduced data
print("Reduced data shape:", reduced_data.shape)

Reduced data shape: (236, 2)


In [83]:
# Coordinates table: x/y from the reduction, the topic name from the topic summary,
# and the source cluster looked up through the topic name
dms = pd.DataFrame(reduced_data, columns=['x', 'y'])
dms['label'] = tm_summary['Name']
dms['cluster'] = dms['label'].map(test.set_index('Name')['cluster'])
dms.head()

Unnamed: 0,x,y,label,cluster
0,8.127893,1.287215,0_catalyst_catalyzed_scandium_alkenes,RIKEN.Sust-1
1,5.216771,3.852225,1_oscillators_synchronization_delay_chimera,PIK-1
2,2.642149,6.99985,2_vegetation_amazon_npp_terrestrial,PIK-2
3,6.546183,1.36623,3_magnetic_spin_hall_magnetization,UTokyo-0
4,3.019911,6.80792,4_hydrological_river_runoff_catchment,PIK-3


In [84]:
# Write the coordinates table into the analysis folder, without the index column
coordinates_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/coordinates.csv'
dms.to_csv(coordinates_path, index=False)

In [85]:
# Write the heatmap matrix into the analysis folder, with cluster labels as column headers
heatmap_matrix = pd.DataFrame(hm.data[0]['z'], columns=dms["cluster"])
heatmap_matrix.to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)



---



## Melted

In [86]:
# prompt: `hm_test` is a squared matrix similarity matrix. This is a symmetric matrix so we only consider the lower triangle, without the diagonal.  Let's get the melted form as a data frame with 3 columns `Source`, `Target`, and `Similarity`. Then, sort it from the largest similarity to the lowest. Remove the pairs with value of zero.
hm_test = hm.data[0]['z']

# Wrap the matrix in a frame so its positional column labels can be reused
df = pd.DataFrame(hm_test)

# Index pairs of the strictly lower triangle (k=-1 excludes the diagonal)
rows, cols = np.tril_indices(df.shape[0], -1)

# Melt into Source / Target / Similarity, keep strictly positive values, sort descending.
# NOTE(review): `> 0` also discards negative similarities, not only zeros — confirm
# that this is intended.
similarity_df = (
    pd.DataFrame({
        'Source': df.columns[rows],
        'Target': df.columns[cols],
        'Similarity': df.values[rows, cols]
    })
    .loc[lambda pairs: pairs['Similarity'] > 0]
    .sort_values('Similarity', ascending=False)
)

similarity_df

Unnamed: 0,Source,Target,Similarity
25212,225,12,0.969097
17972,190,17,0.962980
595,35,0,0.958142
5278,103,25,0.949204
27051,233,23,0.946157
...,...,...,...
18912,194,191,0.000050
5634,106,69,0.000019
3009,78,6,0.000012
21622,208,94,0.000006


In [87]:
# prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the labels from `dms["cluster"]`

# Replace Source and Target with labels from dms["cluster"]
# Values are looked up by their position in `dms`. Anything that is not a known
# position (e.g. a label written by a previous run of this cell) is left unchanged,
# so re-running the cell is harmless instead of failing on int('<label>').
position_to_cluster = dms["cluster"].to_dict()
similarity_df['Source'] = similarity_df['Source'].map(lambda value: position_to_cluster.get(value, value))
similarity_df['Target'] = similarity_df['Target'].map(lambda value: position_to_cluster.get(value, value))


In [88]:
similarity_df.head(100)

Unnamed: 0,Source,Target,Similarity
25212,RIKEN.Bio-16,RIKEN.Sust-2,0.969097
17972,RIKEN.Bio-11,RIKEN.Sust-3,0.962980
595,UTokyo-9,RIKEN.Sust-1,0.958142
5278,UTokyo-45,RIKEN.Sust-5,0.949204
27051,RIKEN.Bio-21,RIKEN.Sust-4,0.946157
...,...,...,...
24110,RIKEN.Sust-26,RIKEN.Bio-2,0.844956
26331,RIKEN.Bio-18,RIKEN.Bio-16,0.844784
25223,RIKEN.Bio-16,RIKEN.Sust-4,0.844725
2713,RIKEN.Sust-14,RIKEN.Sust-2,0.844050


In [89]:
# # prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the corresponding "cluster" in the `dms` data frame. Use the column "label" in `dms` to find the matches.

# # Assuming you have a dataframe named 'dms' with 'label' and 'cluster' columns

# # Create a dictionary mapping 'label' to 'cluster' from the 'dms' dataframe
# label_to_cluster = dict(zip(dms['label'], dms['cluster']))

# # Replace 'Source' column values with corresponding 'cluster' values
# similarity_df['Source'] = similarity_df['Source'].map(label_to_cluster)

# # Replace 'Target' column values with corresponding 'cluster' values
# similarity_df['Target'] = similarity_df['Target'].map(label_to_cluster)

# similarity_df.head()

In [90]:
# Write the melted similarity table into the analysis folder, without the index column
melted_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_melted.csv'
similarity_df.to_csv(melted_path, index=False)