<a href="https://colab.research.google.com/github/cristianmejia00/clustering/blob/main/06_heatmap_sankey/01_heatmap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heatmap for Topic Modeling with BERTopic


# Requirements

## Packages installation and initialization

In [None]:
#!pip install bertopic[visualization]

In [None]:
import pandas as pd
import time
import math
from datetime import date
import uuid
import re
import os
import json
import pickle
from itertools import compress
from bertopic import BERTopic
from umap import UMAP
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
from sklearn.cluster import KMeans

In [None]:
# Change to the name of the folder in your Google Drive
root_folder_name = 'Bibliometrics_Drive'
# Root of the project tree that every input/output path in this notebook is built from.
# Set the BIBLIOMETRICS_ROOT_FOLDER environment variable to run on another machine
# without editing this cell; when it is unset the Mac path below is used as before.
#ROOT_FOLDER_PATH = f"drive/MyDrive/{root_folder_name}" # <- Google Colab
ROOT_FOLDER_PATH = os.environ.get(
    "BIBLIOMETRICS_ROOT_FOLDER",
    f"/Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/{root_folder_name}",  # Mac
)

## Connect your Google Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Function to save object to a pickle file
def save_object_as_pickle(obj, filename):
  """
  Serialize an object with pickle and write it to disk.

  Args:
      obj: Any picklable Python object.
      filename: Destination path of the pickle file. The file is opened in
          binary write mode, so an existing file is overwritten.
  """
  with open(filename, "wb") as pickle_file:
    pickle.dump(obj, pickle_file)



In [None]:
# Function to load pickle object given a path
def load_pickle(path):
    """Read the pickle file at ``path`` and return the object stored in it.

    NOTE(review): unpickling can execute arbitrary code, so only load files
    that come from a trusted source.
    """
    with open(path, 'rb') as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object


In [None]:
def save_heatmap_settings_as_json(heatmap_settings, filename="heatmap_settings.json"):
  """Saves heatmap settings as a JSON file with pretty indentation.

  The parent directory of ``filename`` is created when it does not exist yet,
  so the first save of a new analysis does not fail with FileNotFoundError.

  Args:
      heatmap_settings: The heatmap settings dictionary (must be JSON-serializable).
      filename: The path of the JSON file. An existing file is overwritten.
  """
  parent_dir = os.path.dirname(filename)
  # dirname is '' for a bare file name (e.g. the default); nothing to create then.
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  with open(filename, "w") as f:
    json.dump(heatmap_settings, f, indent=4)



---



## PART 3: Merging Topic Models

# 🔴 Input files and options



In [None]:
# Single configuration object for this run. It is saved as JSON in the next
# cell and read by the loading / modelling cells further down.
heatmap_settings = {
    # 'heatmap_analysis_id' names the sub-folder of ROOT_FOLDER_PATH that receives the
    # outputs; together with 'heatmap_name' it also forms the saved settings file name.
    # The remaining metadata keys are stored in the JSON but not read by this notebook.
    'metadata': {
      'heatmap_analysis_id': 'H013',
      'heatmap_name': 'Brain_Heath_Social_Issues',
      'date': '2025-05-07',
      'created_by': 'cristianmejia00@gmail.com',
      'notes': '',
      'input_directory': '',
      'output_directory': ''
    },
    # 'seed' is used as the UMAP random_state and 'transformer_model' is the
    # SentenceTransformer model name. 'min_cluster_size' is only referenced in
    # commented-out code and 'sankey_threshold' is not read in this notebook.
    'global': {
                'min_cluster_size': 10,
                'seed': 100,
                'transformer_model': 'all-MiniLM-L6-v2',
                'sankey_threshold': 0.8
              },
    # One entry per dataset to combine.
    #   project_folder_name / analysis_folder_name / level_folder_name:
    #       path components of the dataset's article_report.csv
    #   embeddings_folder_name: folder (inside the project folder) holding
    #       embeddings.pck and corpus.csv
    #   display_name: prefix of the unified cluster label "<display_name>-<cluster code>"
    #   cluster_column: column of article_report.csv that holds the cluster code
    #   heatmap_display_order / sankey_display_order / color: not read in this
    #       notebook — presumably consumed by downstream heatmap/sankey steps (TODO confirm)
    'inputs': [
        {
            'project_folder_name': 'Q10_brain_health_ts_20250501',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'level_folder_name': 'level1',
            'embeddings_folder_name': 'f01/e01',
            'display_name': 'BH_lv1',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 0,
            'sankey_display_order': 0,
            'color': "#E9571F"
        },
        {
            'project_folder_name': 'Q6_wellbeing_ti_20250501',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level1',
            'display_name': 'WB_lv1',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 1,
            'sankey_display_order': 1,
            'color': '#808080'
        },
        {
            'project_folder_name': 'Q7_qol_ti_20250501',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level1',
            'display_name': 'QoL_lv1',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 2,
            'sankey_display_order': 1,
            'color': '#89CFF0'
        },
        {
            'project_folder_name': 'Q8_sustainability_ti_20250501',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level1',
            'display_name': 'Sust_lv1',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 3,
            'sankey_display_order': 1,
            'color': '#F2BA05'
        },
        {
            'project_folder_name': 'Q9_happiness_ti_5y_20250501',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level1',
            'display_name': 'H_lv1',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 4,
            'sankey_display_order': 1,
            'color': '#66FF00'
        }
      ]
}

In [None]:
# Save settings inside the analysis folder:
# <ROOT_FOLDER_PATH>/<analysis id>/heatmap_settings_<analysis id>_<heatmap name>.json
_analysis_id = heatmap_settings["metadata"]["heatmap_analysis_id"]
_heatmap_name = heatmap_settings["metadata"]["heatmap_name"]
_settings_path = f'{ROOT_FOLDER_PATH}/{_analysis_id}/heatmap_settings_{_analysis_id}_{_heatmap_name}.json'
save_heatmap_settings_as_json(heatmap_settings, filename=_settings_path)

In [None]:
f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}'

In [None]:
# Read the article report of every input dataset and reduce it to the columns
# UT / uuid / display_name / cluster so that the frames can be concatenated.
heatmap_input_dfs = []
for tm in heatmap_settings['inputs']:
  document_path = f'{ROOT_FOLDER_PATH}/{tm["project_folder_name"]}/{tm["analysis_folder_name"]}/louvain/0.9/{tm["level_folder_name"]}/article_report.csv'
  print(document_path)
  cluster_col = tm['cluster_column']
  input_df = (
      pd.read_csv(document_path, usecols=['ID', 'uuid', cluster_col])
      .assign(display_name=tm['display_name'])
      # Each dataset can use a different clustering result (e.g. X_C, level0, level1),
      # so a unified "<display_name>-<cluster code>" label replaces the original column.
      .assign(cluster=lambda report: report['display_name'] + "-" + report[cluster_col].astype(str))
      .rename(columns={'ID': 'UT'})
      .drop(columns=[cluster_col])
  )
  heatmap_input_dfs.append(input_df)

In [None]:
# Stack the per-dataset reports into one frame with a fresh 0..n-1 index.
document_info = pd.concat(heatmap_input_dfs, ignore_index=True)
print(len(document_info))
document_info.head()



---



## PART 5: Heatmap

In [None]:
# Load the precomputed embeddings and the matching corpus of every input dataset.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
embeddings_list = []
corpus_list = []
for tm in heatmap_settings['inputs']:
  print(f"=================Loading: {tm['project_folder_name']}")
  embeddings_dir = f"{ROOT_FOLDER_PATH}/{tm['project_folder_name']}/{tm['embeddings_folder_name']}"
  embeddings = load_pickle(f"{embeddings_dir}/embeddings.pck")
  # Some pickles wrap the vectors in a dict under the 'embeddings' key.
  if isinstance(embeddings, dict):
    print('Dict type found')
    embeddings = embeddings['embeddings']
    print(len(embeddings))
  corpus_tmp = pd.read_csv(f"{embeddings_dir}/corpus.csv")
  print(len(corpus_tmp))

  # Embeddings are attached to the corpus rows by position in a later cell, so a
  # per-dataset length mismatch could pair vectors with the wrong documents.
  if len(embeddings) != len(corpus_tmp):
    raise ValueError(
        f"{tm['project_folder_name']}: {len(embeddings)} embeddings "
        f"but {len(corpus_tmp)} corpus rows"
    )

  embeddings_list.append(embeddings)
  corpus_list.append(corpus_tmp)

In [None]:
corpus_list[1].head()


In [None]:
# Combine the datasets: corpora are concatenated and embedding matrices are
# stacked row-wise, both in the order of heatmap_settings['inputs'].
corpus_uploaded = pd.concat(corpus_list, ignore_index=True)
embeddings_uploaded = np.vstack(embeddings_list)

In [None]:
# Count df lengths
print(len(embeddings_uploaded))
print(len(corpus_uploaded))
print(len(document_info))

In [None]:
# Add `embeddings_uploaded` as a column of `corpus_uploaded`: list() splits the
# stacked matrix into one vector per row, and the assignment is positional, so
# both objects must hold the same documents in the same order.
corpus_uploaded['embeddings'] = list(embeddings_uploaded)
corpus_uploaded.head()

In [None]:
# Remove rows where 'uuid' is duplicated, keeping the first occurrence
corpus_uploaded = corpus_uploaded.drop_duplicates(subset=['uuid'], keep='first')
corpus_uploaded.tail()

In [None]:
# Left-join the text and embeddings from `corpus_uploaded` onto `document_info` by
# uuid. `document_info` is the left side, so the merged frame `full_corpus` keeps
# its rows and order; uuids missing from `corpus_uploaded` get NaN text/embeddings.
full_corpus = pd.merge(document_info[['UT', 'uuid', 'cluster']], corpus_uploaded[['uuid', 'text', 'embeddings']], on='uuid', how='left')
print(len(full_corpus))

In [None]:
len(full_corpus) == len(document_info)

In [None]:
# Record the embedding size of every row. No rows are removed here; the filter
# that would drop rows whose size is not 384 is commented out at the end of the cell.
# NOTE: the bare expression below is not the last one in the cell, so it displays nothing.
len(embeddings_uploaded[0])

# Add a new column 'vector_length' to store the size of the embeddings.
# Anything that is not a list / ndarray (e.g. NaN after the left merge) counts as 0.
full_corpus['vector_length'] = full_corpus['embeddings'].apply(lambda x: len(x) if isinstance(x, (list, np.ndarray)) else 0)

# Display the first few rows to verify
full_corpus.head()

#full_corpus = full_corpus[full_corpus['embeddings'].apply(lambda x: len(x) == 384 if isinstance(x, list) or isinstance(x, np.ndarray) else False)]

In [None]:
full_corpus.vector_length.value_counts()


In [None]:
len(full_corpus)

In [None]:
# Remodel the topic model
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sentence_transformers import SentenceTransformer

In [None]:
# This part is optional when we have datasets with small clusters
# cluster_idx_mapping = full_corpus.cluster.value_counts()
# #cluster_idx_mapping = cluster_idx_mapping[cluster_idx_mapping >= heatmap_settings['global']['min_cluster_size']]
# full_corpus = full_corpus[full_corpus.cluster.isin(cluster_idx_mapping.index.to_list())]
# cluster_idx_mapping

In [None]:
# Form the embeddings matrix: one row per document, in the row order of `full_corpus`.
# np.vstack needs every entry to be a vector of the same length, so rows with a
# 'vector_length' of 0 (missing embeddings) must be handled before this cell.
my_embeddings = np.vstack(full_corpus['embeddings'].tolist())

In [None]:
# get text and topics
docs = full_corpus.text
cluster_list = full_corpus.cluster

In [None]:
# Encode every cluster label as an integer id; `idx_cluster` is used by the
# following cells (length check and `y=` of fit_transform), so it must be defined.
# Ids follow the value_counts() order of the labels, i.e. 0 is the largest cluster.
cluster_idx_mapping = cluster_list.value_counts()
idx_cluster = [cluster_idx_mapping.index.get_loc(i) for i in cluster_list]

In [None]:
len(idx_cluster) == len(docs) == len(my_embeddings)

# 🟢🟢

In [None]:
# Init "empty" models
embedding_model = SentenceTransformer(heatmap_settings["global"]["transformer_model"])
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model
)

In [None]:
# Fit on the documents with the precomputed embeddings; the integer cluster ids
# are supplied through `y`.
# NOTE(review): with the "empty" cluster model, BERTopic is expected to use `y`
# as the topic assignment instead of clustering — confirm against the BERTopic docs.
topics, probs = topic_model.fit_transform(docs, my_embeddings, y=idx_cluster)

In [None]:
tm_summary = topic_model.get_topic_info()
tm_summary

In [None]:
# Lookup table with one row per cluster: the topic name assigned by the model,
# its first 7 characters (short_name) and the dataset prefix of the cluster label.
test = (
    topic_model.get_document_info(docs, df = full_corpus)[['cluster', 'Name']]
    .drop_duplicates(subset=['cluster'], keep='first')
    .assign(
        short_name=lambda clusters: clusters['Name'].str[:7],
        dataset=lambda clusters: clusters['cluster'].str.split('-').str[0],
    )
)
print(test.shape)
test.head(10)



---



In [None]:
# Default
# Visualize topic similarity using heatmap (self similarity)
hm = topic_model.visualize_heatmap()
#hm.write_html(f"{ROOT_FOLDER_PATH}/heatmap_updated.html")
hm

In [None]:
#pd.DataFrame(hm.data[0]['z'], columns=hm.data[0]['x']).to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)



---



## Coordinates

In [None]:
# Zero out the similarity between topics that belong to the same dataset, so that
# only cross-dataset similarities remain in `updated_matrix`.

# Dataset of every heatmap label, found through the first 7 characters of the label.
label_dataset = [
    test[test['short_name'] == heatmap_label[:7]]['dataset'].iloc[0]
    for heatmap_label in hm.data[0]['x']
]

updated_matrix = np.array([
    [
        0 if label_dataset[col_pos] == label_dataset[row_pos] else similarity
        for col_pos, similarity in enumerate(similarity_row)
    ]
    for row_pos, similarity_row in enumerate(hm.data[0]['z'])
])
updated_matrix


In [None]:
import numpy as np
import umap

def reduce_dimensionality(data, n_components=2, seed=None, metric='cosine',
                          min_dist=0.65, n_neighbors=25, n_epochs=1500, verbose=True):
    """Project ``data`` to a low-dimensional space with UMAP.

    Calling it with ``data`` only behaves as before; the keyword arguments
    expose the UMAP settings that used to be hard-coded.

    Args:
        data: Array-like with one row per item to embed.
        n_components: Number of output dimensions.
        seed: UMAP ``random_state``. When None, the value of
            ``heatmap_settings['global']['seed']`` is used.
        metric: Distance metric passed to UMAP.
        min_dist: UMAP ``min_dist``.
        n_neighbors: UMAP ``n_neighbors``.
        n_epochs: UMAP ``n_epochs``.
        verbose: Whether UMAP logs its progress.

    Returns:
        The array returned by ``UMAP.fit_transform``: one row per input row
        and ``n_components`` columns.
    """
    if seed is None:
        seed = heatmap_settings['global']['seed']

    # Create a UMAP object with the desired settings
    reducer = umap.UMAP(
        n_components=n_components,
        random_state=seed,
        metric=metric,
        min_dist=min_dist,
        n_neighbors=n_neighbors,
        n_epochs=n_epochs,
        verbose=verbose,
    )

    # Perform dimensionality reduction
    return reducer.fit_transform(data)

---

In [None]:
# Reduce dimensionality using UMAP. The input is the heatmap similarity matrix,
# so every topic is described by its row of similarities to all topics and the
# rows of `reduced_data` follow the heatmap order.
reduced_data = reduce_dimensionality(hm.data[0]['z'])
# Alternative input: the matrix with same-dataset similarities set to 0.
#reduced_data = reduce_dimensionality(updated_matrix)
# Print the shape of the reduced data
print("Reduced data shape:", reduced_data.shape)

In [None]:
# Attach the topic names to the 2-D coordinates. The rows of `reduced_data` follow
# the heatmap order while the names come from `tm_summary`, so both orders are
# verified before they are paired by position. If the heatmap order changes (e.g.
# a clustered heatmap) this cell stops instead of silently mislabeling the points.
heatmap_labels = list(hm.data[0]['x'])
topic_names = tm_summary['Name'].tolist()

if len(heatmap_labels) != len(topic_names):
    raise ValueError(
        f"The heatmap has {len(heatmap_labels)} topics but tm_summary has {len(topic_names)}"
    )

# Same convention as the `short_name` lookup: a heatmap label and a topic name
# refer to the same topic when their first 7 characters agree.
order_mismatches = [
    (position, heatmap_label, topic_name)
    for position, (heatmap_label, topic_name) in enumerate(zip(heatmap_labels, topic_names))
    if str(heatmap_label)[:7] != str(topic_name)[:7]
]
if order_mismatches:
    raise ValueError(
        "Heatmap labels and tm_summary names are not in the same order; "
        f"first mismatch (position, heatmap label, topic name): {order_mismatches[0]}"
    )

dms = pd.DataFrame(reduced_data)
dms.columns = ['x', 'y']
dms['label'] = topic_names
dms['cluster'] = dms['label'].map(test.set_index('Name')['cluster'])
dms.head()

In [None]:
# Save dms
dms.to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/coordinates.csv', index = False)

In [None]:
# Save heatmap
pd.DataFrame(hm.data[0]['z'], columns=dms["cluster"]).to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)

In [None]:
# Cluster labels are built as "<display_name>-<cluster code>", so the dataset is
# the part before the first '-'. Splitting on '_' only returned a fragment of the
# display name and breaks down when a display name contains no underscore.
dms['dataset'] = dms['cluster'].str.split('-').str[0]
dms.head(10)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The seaborn style only affects axes created afterwards, so it is set before the
# figure is created (it used to be set after drawing and had no effect on the first run).
sns.set_style("whitegrid")

# Scatter plot of the 2-D topic coordinates, one color per dataset
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=dms, x='x', y='y', s=100, hue='dataset', palette='Set1', alpha=0.7, ax=ax)
ax.set_title('Scatter Plot by Dataset')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.grid(True, linestyle='--', alpha=0.7)
# Place the legend outside the plotting area
ax.legend(title='Dataset', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
# Show the plot
plt.show()



---



## Melted

In [None]:
# prompt: `hm_test` is a squared matrix similarity matrix. This is a symmetric matrix so we only consider the lower triangle, without the diagonal.  Let's get the melted form as a data frame with 3 columns `Source`, `Target`, and `Similarity`. Then, sort it from the largest similarity to the lowest. Remove the pairs with value of zero.
hm_test = hm.data[0]['z']

# Square similarity matrix as a frame; its columns are the positions 0..n-1
df = pd.DataFrame(hm_test)
similarity_values = df.values

# Positions of the lower triangle, diagonal excluded (the matrix is symmetric)
lower_rows, lower_cols = np.tril_indices(similarity_values.shape[0], -1)

# Long form with one row per topic pair. Only pairs with a similarity above 0 are
# kept (this also drops negative values), sorted from most to least similar.
similarity_df = (
    pd.DataFrame({
        'Source': lower_rows,
        'Target': lower_cols,
        'Similarity': similarity_values[lower_rows, lower_cols]
    })
    .loc[lambda pairs: pairs['Similarity'] > 0]
    .sort_values('Similarity', ascending=False)
)

similarity_df

In [None]:
# prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the labels from `dms["cluster"]`

# Replace Source and Target with labels from dms["cluster"]
# Source / Target hold matrix positions; swap each for the cluster label that
# `dms` stores at that position. Re-running this cell fails because the columns
# then already contain labels instead of integers.
cluster_by_position = dms["cluster"]
for endpoint in ('Source', 'Target'):
    similarity_df[endpoint] = similarity_df[endpoint].map(lambda position: cluster_by_position[int(position)])


In [None]:
similarity_df.head(100)

In [None]:
# # prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the corresponding "cluster" in the `dms` data frame. Use the column "label" in `dms` to find the matches.

# # Assuming you have a dataframe named 'dms' with 'label' and 'cluster' columns

# # Create a dictionary mapping 'label' to 'cluster' from the 'dms' dataframe
# label_to_cluster = dict(zip(dms['label'], dms['cluster']))

# # Replace 'Source' column values with corresponding 'cluster' values
# similarity_df['Source'] = similarity_df['Source'].map(label_to_cluster)

# # Replace 'Target' column values with corresponding 'cluster' values
# similarity_df['Target'] = similarity_df['Target'].map(label_to_cluster)

# similarity_df.head()

In [None]:
similarity_df.to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_melted.csv', index = False)