<a href="https://colab.research.google.com/github/cristianmejia00/clustering/blob/main/06_heatmap_sankey/01_heatmap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heatmap for Topic Modeling with BERTopic


# Requirements

## Packages installation and initialization

In [1]:
!pip install bertopic[visualization]

zsh:1: no matches found: bertopic[visualization]


In [1]:
import pandas as pd
import time
import math
from datetime import date
import uuid
import re
import os
import json
import pickle
from itertools import compress
from bertopic import BERTopic
from umap import UMAP
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Change to the name of the folder in your Google Drive
root_folder_name = 'Bibliometrics_Drive'
#ROOT_FOLDER_PATH = f"drive/MyDrive/{root_folder_name}" # <- Google Colab
# The default is the macOS Google Drive mount used so far. Set the BIBLIOMETRICS_ROOT
# environment variable to run the notebook on another machine without editing this cell.
ROOT_FOLDER_PATH = os.environ.get(
    "BIBLIOMETRICS_ROOT",
    f"/Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/{root_folder_name}",  #Mac
)

## Connect your Google Drive

In [22]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Persist any picklable Python object to disk
def save_object_as_pickle(obj, filename):
  """
  Serialize `obj` with pickle and write it to `filename`.

  Args:
      obj: The object to serialize.
      filename: Path of the pickle file to write (opened in binary write mode).
  """
  with open(filename, "wb") as handle:
    writer = pickle.Pickler(handle)
    writer.dump(obj)



In [4]:
# Read back an object that was stored with pickle
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        restored = pickle.Unpickler(handle).load()
    return restored


In [5]:
def save_heatmap_settings_as_json(heatmap_settings, filename="heatmap_settings.json"):
  """Saves heatmap settings as a JSON file with pretty indentation.

  The parent directory of `filename` is created when it does not exist yet, so the
  settings can be written into a brand-new analysis folder.

  Args:
      heatmap_settings: The heatmap settings dictionary (must be JSON-serializable).
      filename: The path of the JSON file to write.
  """
  parent_dir = os.path.dirname(filename)
  if parent_dir:  # a bare file name has no directory component to create
    os.makedirs(parent_dir, exist_ok=True)
  with open(filename, "w") as f:
    json.dump(heatmap_settings, f, indent=4)



---



## PART 3: Merging Topic Models

# 🔴 Input files and options



In [6]:
# Settings of this heatmap run: descriptive metadata, options shared by all
# datasets ("global") and the topic-model results to compare ("inputs").
heatmap_settings = {
    "metadata": {
        "heatmap_analysis_id": "H011",
        "heatmap_name": "EU_act-AI_libsci",
        "date": "2025-01-31",
        "created_by": "cristianmejia00@gmail.com",
        "notes": "",
        "input_directory": "",
        "output_directory": "",
    },
    "global": {
        "min_cluster_size": 10,
        "seed": 100,
        "transformer_model": "all-MiniLM-L6-v2",
        "sankey_threshold": 0.8,
    },
    "inputs": [
        {
            "project_folder_name": "Q324_EU_Act",
            "analysis_folder_name": "a01_tm__f01_e01__km01",
            "level_folder_name": "level0",
            "embeddings_folder_name": "f01/e01",
            "display_name": "EUact",
            "cluster_column": "Cluster Code",
            "heatmap_display_order": 0,
            "sankey_display_order": 1,
            "color": "#E9571F",
        },
        {
            "project_folder_name": "Q325_ai_libsci",
            "analysis_folder_name": "a01_tm__f01_e01__km01",
            "embeddings_folder_name": "f01/e01",
            "level_folder_name": "level0",
            "display_name": "AI",
            "cluster_column": "Cluster Code",
            "heatmap_display_order": 1,
            "sankey_display_order": 0,
            "color": "#808080",
        },
        # Disabled inputs, kept as templates:
        # {
        #     "project_folder_name": "Q282b_riken_com",
        #     "analysis_folder_name": "a01_tm__f01_e01__km01",
        #     "embeddings_folder_name": "f01/e01",
        #     "level_folder_name": "level0",
        #     "display_name": "RIKEN.Com",
        #     "cluster_column": "Cluster Code",
        #     "heatmap_display_order": 3,
        #     "sankey_display_order": 2,
        #     "color": "#808080",
        # },
        # {
        #     "project_folder_name": "Q312_utokyo",
        #     "analysis_folder_name": "a01_tm__f01_e01__km01",
        #     "embeddings_folder_name": "f01/e01",
        #     "level_folder_name": "level0",
        #     "display_name": "UTokyo",
        #     "cluster_column": "Cluster Code",
        #     "heatmap_display_order": 4,
        #     "sankey_display_order": 1,
        #     "color": "#F2BA05",
        # },
        # {
        #     "project_folder_name": "Q308_bio_plant",
        #     "analysis_folder_name": "a01_tm__f01_e01__km01",
        #     "embeddings_folder_name": "f01/e01",
        #     "level_folder_name": "level0",
        #     "display_name": "Plant",
        #     "cluster_column": "Cluster Code",
        #     "heatmap_display_order": 4,
        #     "sankey_display_order": 1,
        #     "color": "#66FF00",
        # },
    ],
}

In [12]:
# analysis_folder = ""

# if not os.path.exists(analysis_folder):
#   print("We are here!")
#   !mkdir $analysis_folder


# # Save settings
# save_heatmap_settings_as_json(heatmap_settings, filename=f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_settings_{heatmap_settings["metadata"]["heatmap_analysis_id"]}_{heatmap_settings["metadata"]["heatmap_name"]}.json')

In [7]:
# analysis_folder = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}'

# if not os.path.exists(analysis_folder):
#   !mkdir $analysis_folder

In [8]:

# Write the settings used for this run into the analysis folder
settings_json_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_settings_{heatmap_settings["metadata"]["heatmap_analysis_id"]}_{heatmap_settings["metadata"]["heatmap_name"]}.json'
save_heatmap_settings_as_json(heatmap_settings, filename=settings_json_path)

In [9]:
# Read the article report of every input and reduce it to the columns
# UT, uuid, display_name and cluster so the datasets can be concatenated.
heatmap_input_dfs = []
for tm in heatmap_settings['inputs']:
  document_path = f'{ROOT_FOLDER_PATH}/{tm["project_folder_name"]}/{tm["analysis_folder_name"]}/{tm["level_folder_name"]}/article_report.csv'
  print(document_path)
  # Each dataset may name its clustering column differently (e.g. X_C, level0, level1);
  # it is folded into a unified, dataset-prefixed `cluster` label and then dropped.
  cluster_col = tm['cluster_column']
  input_df = (
      pd.read_csv(document_path, usecols=['ID', 'uuid', cluster_col])
      .assign(display_name=tm['display_name'])
      .assign(cluster=lambda report: report['display_name'] + "-" + report[cluster_col].astype(str))
      .rename(columns={'ID': 'UT'})
      .drop(columns=[cluster_col])
  )
  heatmap_input_dfs.append(input_df)

/Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/Bibliometrics_Drive/Q324_EU_Act/a01_tm__f01_e01__km01/level0/article_report.csv
/Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/Bibliometrics_Drive/Q325_ai_libsci/a01_tm__f01_e01__km01/level0/article_report.csv


In [10]:
# Stack the per-dataset reports into one table with a fresh 0..n-1 index
document_info = pd.concat(heatmap_input_dfs, ignore_index=True)
print(document_info.shape[0])
document_info.head()

3800


Unnamed: 0,UT,uuid,display_name,cluster
0,id811064.185021328,0af04b20-1ed4-47c6-ace3-cea5b939dc6d,EUact,EUact-1
1,id703716.191055823,58d7dede-ea5e-480f-b8d5-0bd380c812b2,EUact,EUact-1
2,id501805.516222237,8ab4c1fd-4a3a-4688-ba47-054b946645ac,EUact,EUact-1
3,id918304.481728388,9c0b6f8e-0448-472b-a8ec-a3c5044fc602,EUact,EUact-1
4,id404984.461066991,d7710a62-56b8-41d5-9c76-5a57781193e6,EUact,EUact-1




---



## PART 5. Heatmap

In [11]:
# Load the pre-computed embeddings and the matching corpus of every input dataset.
embeddings_list = []
corpus_list = []
for tm in heatmap_settings['inputs']:
  print(f"=================Loading: {tm['project_folder_name']}")
  embeddings_dir = f"{ROOT_FOLDER_PATH}/{tm['project_folder_name']}/{tm['embeddings_folder_name']}"
  # NOTE: pickle files must come from a trusted source (unpickling can run code).
  embeddings = load_pickle(f"{embeddings_dir}/embeddings.pck")
  # Some pickles wrap the matrix in a dict under the 'embeddings' key.
  # isinstance also accepts dict subclasses, which `type(x) == dict` missed.
  if isinstance(embeddings, dict):
    print('Dict type found')
    embeddings = embeddings['embeddings']
  corpus_tmp = pd.read_csv(f"{embeddings_dir}/corpus.csv")

  # Embeddings are attached to corpus rows by position further down, so a length
  # mismatch inside one dataset would silently pair texts with the wrong vectors.
  if len(embeddings) != len(corpus_tmp):
    raise ValueError(
        f"{tm['project_folder_name']}: {len(embeddings)} embeddings but "
        f"{len(corpus_tmp)} corpus rows"
    )

  embeddings_list.append(embeddings)
  corpus_list.append(corpus_tmp)

Dict type found
Dict type found


In [13]:
# Stack the per-dataset embeddings and corpora, both in the order of the inputs
embeddings_uploaded = np.vstack(embeddings_list)
corpus_uploaded = pd.concat(corpus_list, ignore_index=True)

In [14]:
# Shape of the stacked embedding matrix
embeddings_uploaded.shape

(3800, 384)

In [15]:
# Row counts of the embeddings, the corpus and the document table, one per line
print(len(embeddings_uploaded), len(corpus_uploaded), len(document_info), sep="\n")

3800
3800
3800


In [16]:
# Store each row of `embeddings_uploaded` in a new `embeddings` column of
# `corpus_uploaded`; rows are paired by position.
corpus_uploaded['embeddings'] = list(embeddings_uploaded)

In [17]:
# Keep only the first row of every UT so each document appears once in the corpus
is_repeated_ut = corpus_uploaded.duplicated(subset=['UT'], keep='first')
corpus_uploaded = corpus_uploaded[~is_repeated_ut]

In [18]:
# Left-join the corpus text and embeddings onto the clustered documents by UT.
# `document_info` is the left table; documents without a corpus match get NaN
# in `text` and `embeddings`.
clustered_docs = document_info[['UT', 'uuid', 'cluster']]
corpus_payload = corpus_uploaded[['UT', 'text', 'embeddings']]
full_corpus = clustered_docs.merge(corpus_payload, on='UT', how='left')

In [19]:
# Number of rows after the merge
len(full_corpus)

3800

In [20]:
# Number of rows in the document table, for comparison with the merged corpus
len(document_info)

3800

In [21]:
# Keep only rows that carry a complete embedding vector. The expected length is read
# from the loaded embedding matrix instead of a hard-coded 384, so using a different
# `transformer_model` does not silently discard every document. Rows that are neither
# a list nor an array (e.g. NaN left by the merge) are dropped as well.
embedding_dim = embeddings_uploaded.shape[1]
has_valid_embedding = full_corpus['embeddings'].apply(
    lambda x: isinstance(x, (list, np.ndarray)) and len(x) == embedding_dim
)
full_corpus = full_corpus[has_valid_embedding]

In [22]:
# Rows remaining after the embedding filter
len(full_corpus)

3800

In [23]:
# Remodel the topic model
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sentence_transformers import SentenceTransformer

In [24]:
# Count documents per cluster and keep the clusters that reach `min_cluster_size`.
# The position of a cluster in this Series is later used as its numeric id.
min_docs = heatmap_settings['global']['min_cluster_size']
cluster_idx_mapping = (
    full_corpus['cluster']
    .value_counts()
    .loc[lambda counts: counts >= min_docs]
)
cluster_idx_mapping

cluster
AI-1        143
AI-2        120
AI-3        113
AI-4        110
AI-5        110
           ... 
EUact-28     15
EUact-30     15
EUact-31     14
EUact-33     10
EUact-32     10
Name: count, Length: 88, dtype: int64

In [25]:
# Drop documents whose cluster did not reach the minimum size
in_kept_cluster = full_corpus['cluster'].isin(cluster_idx_mapping.index)
full_corpus = full_corpus[in_kept_cluster]

In [26]:
# Rows remaining after removing the small clusters
len(full_corpus)

3798

In [27]:
# Form the embeddings matrix: stack the per-document vectors into one 2-D array
my_embeddings = np.vstack(full_corpus['embeddings'].tolist())

In [28]:
# Get the document texts and their cluster labels (both pandas Series taken from
# `full_corpus`, so they share its row order)
docs = full_corpus.text
cluster_list = full_corpus.cluster

In [29]:
# Translate every document's cluster label into its position in `cluster_idx_mapping`
cluster_position = {label: pos for pos, label in enumerate(cluster_idx_mapping.index)}
idx_cluster = [cluster_position[label] for label in cluster_list]

In [30]:
# Sanity check: ids, texts and embeddings must all have the same number of entries
len(idx_cluster) == len(docs) == len(my_embeddings)

True

# 🟢🟢

In [31]:
# Build BERTopic from base ("empty") dimensionality-reduction and clustering models,
# so that no new clustering is performed when the model is fitted.
embedding_model = SentenceTransformer(heatmap_settings["global"]["transformer_model"])
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

bertopic_components = dict(
    embedding_model=embedding_model,
    umap_model=empty_dimensionality_model,
    hdbscan_model=empty_cluster_model,
    ctfidf_model=ctfidf_model,
)
topic_model = BERTopic(**bertopic_components)

In [32]:
# Fit on the pre-computed embeddings; `y` passes the numeric cluster id of every document
topics, probs = topic_model.fit_transform(docs, my_embeddings, y=idx_cluster)

In [33]:
# Per-topic summary table (its `Name` column is reused further down as the topic label)
tm_summary = topic_model.get_topic_info()
tm_summary

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,143,0_libraries_library_librarians_academic,"[libraries, library, librarians, academic, uni...",[Exploring the implementation of artificial in...
1,1,120,1_nan_age_editorial_guest,"[nan, age, editorial, guest, informationscienc...",[Ethics in artificial intelligence introductio...
2,2,113,2_news_media_journalism_disinformation,"[news, media, journalism, disinformation, fake...",[Contrasted media frames of AI during the COVI...
3,3,110,3_healthcare_care_health_physicians,"[healthcare, care, health, physicians, clinica...",[A framework to identify ethical concerns with...
4,4,110,4_artificialintelligence_nan_divergent_informa...,"[artificialintelligence, nan, divergent, infor...","[ARTIFICIALINTELLIGENCE A SURVEY nan, ARTIFIC..."
...,...,...,...,...,...
83,83,15,83_certificates_suspension_restriction_notified,"[certificates, suspension, restriction, notifi...",[Where the Commission ascertains that a notifi...
84,84,15,84_prosecute_manipulated_obligation_corrective,"[prosecute, manipulated, obligation, correctiv...",[Providers shall ensure that AI systems intend...
85,85,14,85_postmarket_immigration_asylum_monitoring,"[postmarket, immigration, asylum, monitoring, ...",[The postmarket monitoring system shall active...
86,86,10,86_logs_highrisk_recording_logging,"[logs, highrisk, recording, logging, automatic...",[c provide a competent authority upon a reaso...


In [60]:
# One row per cluster: the BERTopic name assigned to it, a 7-character prefix of that
# name (`short_name`) and the dataset part of the cluster label (text before the first "-").
test = (
    topic_model.get_document_info(docs, df = full_corpus)[['cluster', 'Name']]
    .drop_duplicates(subset=['cluster'], keep='first')
    .assign(
        short_name=lambda frame: frame['Name'].str[:7],
        dataset=lambda frame: frame['cluster'].str.split('-').str[0],
    )
)
print(test.shape)
test.head(10)

(88, 4)


Unnamed: 0,cluster,Name,short_name,dataset
0,EUact-1,19_notified_bodies_notifying_body,19_noti,EUact
56,EUact-2,22_surveillance_authorities_market_confidentia...,22_surv,EUact
108,EUact-3,24_sandboxes_sandbox_regulatory_competent,24_sand,EUact
157,EUact-4,27_advisory_forum_board_commission,27_advi,EUact
205,EUact-5,29_conformity_highrisk_harmonisation_section,29_conf,EUact
251,EUact-6,30_generalpurpose_systemic_risks_risk,30_gene,EUact
297,EUact-7,31_highrisk_intended_appropriate_sets,31_high,EUact
342,EUact-8,33_highrisk_importer_importers_name,33_high,EUact
385,EUact-9,35_mandate_authorised_representative_obligations,35_mand,EUact
427,EUact-10,38_harm_persons_adverse_safety,38_harm,EUact




---



In [59]:
# Display the cluster-to-topic lookup table
test

Unnamed: 0,cluster,Name,short_name,dataset
0,EUact-1,19_notified_bodies_notifying_body,19_notified_bodies_notify,EUact
56,EUact-2,22_surveillance_authorities_market_confidentia...,22_surveillance_authoriti,EUact
108,EUact-3,24_sandboxes_sandbox_regulatory_competent,24_sandboxes_sandbox_regu,EUact
157,EUact-4,27_advisory_forum_board_commission,27_advisory_forum_board_c,EUact
205,EUact-5,29_conformity_highrisk_harmonisation_section,29_conformity_highrisk_ha,EUact
...,...,...,...,...
3704,AI-51,74_embodiment_social_metaphors_spacetime,74_embodiment_social_meta,AI
3725,AI-52,76_cataloging_bibliographic_readers_libraries,76_cataloging_bibliograph,AI
3745,AI-53,77_chatbots_graphs_graphic_dialogue,77_chatbots_graphs_graphi,AI
3764,AI-54,80_big_bda_heuristics_ra,80_big_bda_heuristics_ra,AI


In [35]:
# Default
# Visualize topic similarity using heatmap (self similarity).
# `hm.data[0]['x']` (labels) and `hm.data[0]['z']` (similarity matrix) are reused below.
hm = topic_model.visualize_heatmap()
#hm.write_html(f"{ROOT_FOLDER_PATH}/heatmap_updated.html")
# Inline rendering of the figure needs nbformat>=4.2.0 installed in the kernel environment.
hm

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [36]:
#pd.DataFrame(hm.data[0]['z'], columns=hm.data[0]['x']).to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)



---



## Coordinates (2-D UMAP projection of the topics)

In [90]:
import numpy as np
import umap

def reduce_dimensionality(data, seed=None):
    """Project `data` to two dimensions with UMAP using the cosine metric.

    Args:
        data: Array-like with one row per item to project.
        seed: Random state passed to UMAP. When omitted, the notebook-wide
            heatmap_settings['global']['seed'] is used, as before.

    Returns:
        The UMAP projection with one row per input row and two columns.
    """
    if seed is None:
        # Fall back to the global setting so existing calls keep their behaviour
        seed = heatmap_settings['global']['seed']

    # Create a UMAP object with the desired settings
    reducer = umap.UMAP(n_components=2, random_state=seed, metric='cosine')

    # Perform dimensionality reduction
    return reducer.fit_transform(data)

In [91]:
# Dataset of every heatmap label. Labels are matched to `test` on the first 7
# characters of the topic name; a lookup table replaces one DataFrame scan per label.
dataset_by_prefix = (
    test.drop_duplicates(subset='short_name', keep='first')
    .set_index('short_name')['dataset']
    .to_dict()
)
label_dataset = [dataset_by_prefix[label[:7]] for label in hm.data[0]['x']]

# Zero every similarity between two topics of the same dataset, so that only
# cross-dataset similarities remain in the matrix.
dataset_codes = np.asarray(label_dataset)
same_dataset = dataset_codes[:, None] == dataset_codes[None, :]
updated_matrix = np.where(same_dataset, 0, np.asarray(hm.data[0]['z'], dtype=float))
updated_matrix


array([[0.        , 0.        , 0.        , ..., 0.43050772, 0.29384995,
        0.22819856],
       [0.        , 0.        , 0.        , ..., 0.37149024, 0.38568619,
        0.21283455],
       [0.        , 0.        , 0.        , ..., 0.49406415, 0.37622219,
        0.36051553],
       ...,
       [0.43050772, 0.37149024, 0.49406415, ..., 0.        , 0.        ,
        0.        ],
       [0.29384995, 0.38568619, 0.37622219, ..., 0.        , 0.        ,
        0.        ],
       [0.22819856, 0.21283455, 0.36051553, ..., 0.        , 0.        ,
        0.        ]])

In [92]:
# Reduce dimensionality using UMAP. The input is the matrix with within-dataset
# similarities zeroed; the commented alternative is the raw heatmap matrix.
reduced_data = reduce_dimensionality(updated_matrix) #hm.data[0]['z']
# Print the shape of the reduced data
print("Reduced data shape:", reduced_data.shape)

Reduced data shape: (88, 2)


In [93]:
# 2-D coordinates per topic, labelled with the topic name and its source cluster
name_to_cluster = test.set_index('Name')['cluster']
dms = (
    pd.DataFrame(reduced_data, columns=['x', 'y'])
    .assign(label=tm_summary['Name'])
    .assign(cluster=lambda coords: coords['label'].map(name_to_cluster))
)
dms.head()

Unnamed: 0,x,y,label,cluster
0,8.744691,-1.912183,0_libraries_library_librarians_academic,AI-1
1,6.970324,-3.576364,1_nan_age_editorial_guest,AI-2
2,8.104703,-1.694053,2_news_media_journalism_disinformation,AI-3
3,8.166542,-2.979819,3_healthcare_care_health_physicians,AI-4
4,6.612433,-3.618106,4_artificialintelligence_nan_divergent_informa...,AI-5


In [94]:
# Write the topic coordinates table to the analysis folder
coordinates_csv_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/coordinates.csv'
dms.to_csv(coordinates_csv_path, index = False)

In [95]:
# Write the full topic-by-topic similarity matrix, with cluster labels as column headers
heatmap_matrix_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv'
heatmap_matrix = pd.DataFrame(hm.data[0]['z'], columns=dms["cluster"])
heatmap_matrix.to_csv(heatmap_matrix_path, index=False)



---



## Melted similarity table (topic pairs)

In [96]:
# Melt the square, symmetric similarity matrix into a long table with the columns
# `Source`, `Target` and `Similarity`, sorted from the most to the least similar pair.
hm_test = hm.data[0]['z']
df = pd.DataFrame(hm_test)

# Index pairs strictly below the diagonal: every unordered pair once, no self-pairs
rows, cols = np.tril_indices(df.shape[0], -1)

similarity_df = (
    pd.DataFrame({
        'Source': df.columns[rows],
        'Target': df.columns[cols],
        'Similarity': df.values[rows, cols],
    })
    .loc[lambda pairs: pairs['Similarity'] > 0]   # keep strictly positive similarities
    .sort_values('Similarity', ascending=False)   # largest similarity first
)

similarity_df

Unnamed: 0,Source,Target,Similarity
1924,62,33,0.912006
1920,62,29,0.904949
557,33,29,0.895954
2007,63,54,0.892012
50,10,5,0.888293
...,...,...,...
3420,83,17,0.070195
1242,50,17,0.063481
1352,52,26,0.055826
612,35,17,0.055556


In [97]:
# Replace the positional Source/Target ids with the cluster labels from dms["cluster"].
# The replacement only runs while a column still holds integer positions, so executing
# this cell a second time is a no-op instead of failing on int('EUact-20').
position_to_cluster = dms["cluster"]
for endpoint in ("Source", "Target"):
    if pd.api.types.is_integer_dtype(similarity_df[endpoint]):
        # .loc raises KeyError for a position that is missing from dms
        similarity_df[endpoint] = position_to_cluster.loc[similarity_df[endpoint]].to_numpy()


In [98]:
# Preview the 100 most similar pairs (the table is sorted by descending similarity)
similarity_df.head(100)

Unnamed: 0,Source,Target,Similarity
1924,EUact-20,EUact-8,0.912006
1920,EUact-20,EUact-5,0.904949
557,EUact-8,EUact-5,0.895954
2007,EUact-22,EUact-16,0.892012
50,AI-11,AI-6,0.888293
...,...,...,...
3229,AI-54,AI-49,0.772018
3340,EUact-28,EUact-1,0.771775
1717,AI-42,AI-7,0.771356
335,AI-23,AI-11,0.771295


In [99]:
# # prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the corresponding "cluster" in the `dms` data frame. Use the column "label" in `dms` to find the matches.

# # Assuming you have a dataframe named 'dms' with 'label' and 'cluster' columns

# # Create a dictionary mapping 'label' to 'cluster' from the 'dms' dataframe
# label_to_cluster = dict(zip(dms['label'], dms['cluster']))

# # Replace 'Source' column values with corresponding 'cluster' values
# similarity_df['Source'] = similarity_df['Source'].map(label_to_cluster)

# # Replace 'Target' column values with corresponding 'cluster' values
# similarity_df['Target'] = similarity_df['Target'].map(label_to_cluster)

# similarity_df.head()

In [100]:
# Write the melted similarity table to the analysis folder
melted_csv_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_melted.csv'
similarity_df.to_csv(melted_csv_path, index = False)