<a href="https://colab.research.google.com/github/cristianmejia00/clustering/blob/main/06_heatmap_sankey/01_heatmap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heatmap for Topic Modeling with BERTopic


# Requirements

## Packages installation and initialization

In [1]:
!pip install bertopic[visualization]

Collecting bertopic[visualization]
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
[0mCollecting hdbscan>=0.8.29 (from bertopic[visualization])
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic[visualization])
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic[visualization])
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━

In [2]:
import pandas as pd
import time
import math
from datetime import date
import uuid
import re
import os
import json
import pickle
from itertools import compress
from bertopic import BERTopic
from umap import UMAP
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
from sklearn.cluster import KMeans

In [3]:
# Name of the project folder inside "MyDrive" -- change it to match your own Google Drive
root_folder_name = 'Bibliometrics_Drive'
# Relative path: it is resolved against the kernel's current working directory
ROOT_FOLDER_PATH = "drive/MyDrive/" + root_folder_name

## Connect your Google Drive

In [4]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): ROOT_FOLDER_PATH is the relative path "drive/MyDrive/...", which only points
# into this mount when the working directory is /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Persist a Python object to disk with pickle
def save_object_as_pickle(obj, filename):
  """
  Serialize `obj` with pickle and write it to `filename`.

  The file is opened in binary write mode, so existing content is replaced.

  Args:
      obj: The object to serialize.
      filename: Destination path of the pickle file.
  """
  with open(filename, mode="wb") as pickle_file:
    pickle.Pickler(pickle_file).dump(obj)



In [6]:
# Read back an object stored in a pickle file
def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Unpickling can execute arbitrary code, so only use this on files you trust.

    Args:
        path: Location of the pickle file to read.
    """
    with open(path, mode='rb') as pickle_file:
        return pickle.Unpickler(pickle_file).load()


In [7]:
def save_heatmap_settings_as_json(heatmap_settings, filename="heatmap_settings.json"):
  """Write the heatmap settings to a JSON file, indented with 4 spaces.

  The target file is opened in write mode, so existing content is replaced.

  Args:
      heatmap_settings: The settings dictionary to serialize.
      filename: Path of the JSON file to write.
  """
  with open(filename, mode="w") as settings_file:
    json.dump(heatmap_settings, fp=settings_file, indent=4)



---



## PART 3: Merging Topic Models

# 🔴 Input files and options



In [53]:
# Configuration of this heatmap analysis; the notebook saves it as JSON and reads it in the cells below.
heatmap_settings = {
    # Bookkeeping. 'heatmap_analysis_id' also names the output folder under ROOT_FOLDER_PATH.
    'metadata': {
        'heatmap_analysis_id': 'H008',
        'heatmap_name': 'Human_Aug-Innovation-Innovativeness',
        'date': '2024-11-18',
        'created_by': 'cristianmejia00@gmail.com',
        'notes': '',
        'input_directory': '',
        'output_directory': '',
    },
    # Options shared by all inputs
    'global': {
        'min_cluster_size': 10,                   # clusters with fewer documents are dropped
        'seed': 100,                              # random_state passed to UMAP
        'transformer_model': 'all-MiniLM-L6-v2',  # SentenceTransformer model name
        'sankey_threshold': 0.8,
    },
    # One entry per dataset to compare. To add a dataset, append a dict with the same keys.
    'inputs': [
        {
            'project_folder_name': 'Q310_innovation_CORRECT',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'level_folder_name': 'level1',
            'embeddings_folder_name': 'f01/e01',
            'display_name': 'INVN',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 0,
            'sankey_display_order': 1,
            'color': "#E9571F",
        },
        {
            'project_folder_name': 'Q311_innovativeness_CORRECT',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level0',
            'display_name': 'INVTNS',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 1,
            'sankey_display_order': 1,
            'color': '#4C962F',
        },
        {
            'project_folder_name': 'Q318_human_augmentation',
            'analysis_folder_name': 'a01_cn__f01_dc__c01_lv',
            'embeddings_folder_name': 'f01/e01',
            'level_folder_name': 'level1',
            'display_name': 'HA',
            'cluster_column': 'Cluster Code',
            'heatmap_display_order': 2,
            'sankey_display_order': 0,
            'color': '#808080',
        },
    ],
}

In [54]:
# Output folder of this heatmap analysis: <ROOT_FOLDER_PATH>/<heatmap_analysis_id>
analysis_id = heatmap_settings["metadata"]["heatmap_analysis_id"]
analysis_folder = f'{ROOT_FOLDER_PATH}/{analysis_id}'

# The folder must exist before the settings file is written into it.
# os.makedirs also creates missing parent folders and does nothing when the folder already exists.
os.makedirs(analysis_folder, exist_ok=True)

# Save settings
save_heatmap_settings_as_json(
    heatmap_settings,
    filename=f'{analysis_folder}/heatmap_settings_{analysis_id}_{heatmap_settings["metadata"]["heatmap_name"]}.json'
)

In [None]:
# Make sure the output folder of this analysis exists
analysis_folder = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}'

# Pure-Python replacement for `!mkdir`: the path is not passed through a shell,
# missing parent folders are created, and an already existing folder is not an error.
os.makedirs(analysis_folder, exist_ok=True)

In [55]:
# Read the article report of every input and bring them to a common layout
heatmap_input_dfs = []
for tm in heatmap_settings['inputs']:
  source_cluster_column = tm['cluster_column']
  document_path = f'{ROOT_FOLDER_PATH}/{tm["project_folder_name"]}/{tm["analysis_folder_name"]}/{tm["level_folder_name"]}/article_report.csv'
  print(document_path)
  # Inputs may name their cluster column differently (e.g. X_C, level0, level1), so each
  # one is rewritten to a shared 'cluster' column prefixed with the input's display name.
  input_df = (
      pd.read_csv(document_path, usecols=['ID', 'uuid', source_cluster_column])
      .assign(display_name=tm['display_name'])
      .assign(cluster=lambda report: report['display_name'] + "-" + report[source_cluster_column].astype(str))
      .rename(columns={'ID': 'UT'})
      .drop(columns=[source_cluster_column])
  )
  heatmap_input_dfs.append(input_df)

drive/MyDrive/Bibliometrics_Drive/Q310_innovation_CORRECT/a01_cn__f01_dc__c01_lv/level1/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q311_innovativeness_CORRECT/a01_cn__f01_dc__c01_lv/level0/article_report.csv
drive/MyDrive/Bibliometrics_Drive/Q318_human_augmentation/a01_cn__f01_dc__c01_lv/level1/article_report.csv


In [56]:
# Stack the per-input frames into one table with a fresh 0..n-1 index
document_info = pd.concat(heatmap_input_dfs, ignore_index=True)
print(len(document_info))
document_info.head()

37157


Unnamed: 0,UT,uuid,display_name,cluster
0,WOS:000074597100003,abb8d7c2-0ebe-4d16-ad61-7bb4ae967736,INVN,INVN-1-1---
1,WOS:A1993KH92100002,3f57a7b6-9c13-427d-a2b0-7a69c9086904,INVN,INVN-1-1---
2,WOS:000177365900005,74fd2662-98de-485f-bcd5-ac7f5512d0e8,INVN,INVN-1-1---
3,WOS:000076481600003,81d15331-8909-4063-b43f-c79f08c183f5,INVN,INVN-1-1---
4,WOS:000228151500004,37f717fb-12fd-4bf7-b765-fa8fa2e36a16,INVN,INVN-1-1---




---



## PART 5. Heatmap

In [57]:
# Load the stored document embeddings and the matching corpus of every input project.
embeddings_list = []
corpus_list = []
for tm in heatmap_settings['inputs']:
  print(f"=================Loading: {tm['project_folder_name']}")
  embeddings_folder = f"{ROOT_FOLDER_PATH}/{tm['project_folder_name']}/{tm['embeddings_folder_name']}"

  # NOTE: unpickling can execute arbitrary code -- only load embeddings files you trust.
  embeddings = load_pickle(f"{embeddings_folder}/embeddings.pck")
  # Some pickles wrap the matrix in a dict, under the 'embeddings' key
  if isinstance(embeddings, dict):
    print('Dict type found')
    embeddings = embeddings['embeddings']
  corpus_tmp = pd.read_csv(f"{embeddings_folder}/corpus.csv")

  # Embeddings are attached to the corpus by row position further down, so a row-count
  # mismatch would pair documents with the wrong vectors. Fail here, naming the project.
  if len(embeddings) != len(corpus_tmp):
    raise ValueError(
        f"{tm['project_folder_name']}: {len(embeddings)} embeddings but "
        f"{len(corpus_tmp)} corpus rows; they must match row by row"
    )

  embeddings_list.append(embeddings)
  corpus_list.append(corpus_tmp)

Dict type found
Dict type found
Dict type found


In [58]:
# Stack the per-project embedding matrices and corpora in the same order,
# so that row i of the matrix belongs to row i of the corpus
embeddings_uploaded = np.vstack(embeddings_list)
corpus_uploaded = pd.concat(corpus_list, ignore_index=True)

In [59]:
# Shape of the stacked embedding matrix
embeddings_uploaded.shape

(59079, 384)

In [60]:
# Row counts, one per line: stacked embeddings, stacked corpus, cluster assignments
print(len(embeddings_uploaded), len(corpus_uploaded), len(document_info), sep="\n")

59079
59079
37157


In [61]:
# Store each row of the embedding matrix next to its document (matched by row position)
corpus_uploaded = corpus_uploaded.assign(embeddings=list(embeddings_uploaded))

In [62]:
# A document (UT) can appear more than once in the stacked corpus;
# keep its first occurrence only, so that UT is unique
is_repeated_ut = corpus_uploaded['UT'].duplicated(keep='first')
corpus_uploaded = corpus_uploaded.loc[~is_repeated_ut]

In [63]:
# Attach text and embeddings to every clustered document.
# This is a LEFT merge on document_info, so `full_corpus` keeps document_info's rows and
# order (not corpus_uploaded's). Documents without a match in corpus_uploaded get NaN in
# 'text' and 'embeddings'.
full_corpus = pd.merge(
    document_info[['UT', 'uuid', 'cluster']],
    corpus_uploaded[['UT', 'text', 'embeddings']],
    on='UT',
    how='left',
    # Fail loudly if UT is not unique in corpus_uploaded; duplicates would multiply rows
    validate='many_to_one',
)

In [64]:
# Row count of the merged frame
len(full_corpus)

37157

In [65]:
# Row count of document_info, for comparison with the merged frame
len(document_info)

37157

In [66]:
# Keep only the rows that carry an embedding vector of the expected size. Rows without a
# match in the left merge hold NaN instead of a vector and are dropped here.
# The size is read from the loaded matrix rather than hard-coded, so switching
# 'transformer_model' to a model with another dimension does not silently drop every row.
embedding_dim = embeddings_uploaded.shape[1]
has_valid_embedding = full_corpus['embeddings'].apply(
    lambda vector: isinstance(vector, (list, np.ndarray)) and len(vector) == embedding_dim
)
full_corpus = full_corpus[has_valid_embedding]

In [67]:
# Row count after filtering on embedding size
len(full_corpus)

37157

In [68]:
# Remodel the topic model
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sentence_transformers import SentenceTransformer

In [69]:
# Documents per cluster (largest first), keeping clusters with at least 'min_cluster_size' documents
min_cluster_size = heatmap_settings['global']['min_cluster_size']
cluster_idx_mapping = (
    full_corpus['cluster']
    .value_counts()
    .loc[lambda counts: counts >= min_cluster_size]
)
cluster_idx_mapping

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
INVN-2-1---,882
INVN-2-2---,807
INVN-99-99---,731
INVN-1-1---,684
INVN-4-1---,664
...,...
HA-7-2---,46
HA-14-1---,46
HA-13-2---,46
HA-2-4---,45


In [70]:
# Drop the documents whose cluster did not pass the size filter
in_kept_cluster = full_corpus['cluster'].isin(cluster_idx_mapping.index)
full_corpus = full_corpus.loc[in_kept_cluster]

In [71]:
# Row count after dropping clusters below 'min_cluster_size'
len(full_corpus)

37157

In [72]:
# Form the embeddings: stack the per-document vectors back into one 2-D matrix
my_embeddings = np.vstack(list(full_corpus['embeddings']))

In [73]:
# Document texts and their cluster labels, both in full_corpus row order
docs = full_corpus['text']
cluster_list = full_corpus['cluster']

In [74]:
# A cluster's position in cluster_idx_mapping is used as its numeric id
cluster_position = {
    cluster_name: position
    for position, cluster_name in enumerate(cluster_idx_mapping.index)
}
idx_cluster = [cluster_position[cluster_name] for cluster_name in cluster_list]

In [75]:
# True when ids, documents and embeddings all have the same number of entries
len({len(idx_cluster), len(docs), len(my_embeddings)}) == 1

True

# 🟢🟢

In [76]:
# Init "empty" models: the dimensionality-reduction and clustering steps are the library's
# base classes, because embeddings and cluster ids are supplied when the model is fitted
embedding_model = SentenceTransformer(heatmap_settings["global"]["transformer_model"])
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Assemble BERTopic from these parts; it is fitted in a later cell without actually performing any clustering
topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model
)

In [77]:
# Fit on the stored embeddings, passing the existing cluster ids as `y`
topics, probs = topic_model.fit_transform(
    documents=docs,
    embeddings=my_embeddings,
    y=idx_cluster,
)

In [78]:
# Per-topic summary table from the fitted model; its 'Name' column is used below to label the coordinates
tm_summary = topic_model.get_topic_info()
tm_summary

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,882,0_transitions_transition_niche_regime,"[transitions, transition, niche, regime, socio...",[NGOs fostering transitions towards sustainabl...
1,1,807,1_tis_wind_systemic_functions,"[tis, wind, systemic, functions, sectoral, pv,...",[From lagging to leading? Technological innova...
2,2,731,2_ir_land_journalism_reporting,"[ir, land, journalism, reporting, foresight, r...",[Integrated reporting: On the need for broaden...
3,3,684,3_orientation_marketing_mo_orientations,"[orientation, marketing, mo, orientations, mar...",[Mediating effect of innovation capability bet...
4,4,664,4_dynamic_capabilities_dcs_capability,"[dynamic, capabilities, dcs, capability, dc, m...",[How does intellectual capital drive firm perf...
...,...,...,...,...,...
201,201,46,201_brain_fnirs_eog_bci,"[brain, fnirs, eog, bci, physiological, notifi...",[A Novel Safety Evaluation Approach of Transfe...
202,202,46,202_strain_mxene_sensor_sensors,"[strain, mxene, sensor, sensors, stretchable, ...",[Constructing conductive titanium carbide nano...
203,203,46,203_tomato_fruit_malate_crispr,"[tomato, fruit, malate, crispr, genes, ma1, ge...",[A dramatic decline in fruit citrate induced b...
204,204,45,204_sleeve_exosuit_soft_shoulder,"[sleeve, exosuit, soft, shoulder, elbow, upper...",[Design and Control of an Assistive Device for...


In [79]:
# Document information including the topic assignment, reduced to one row per
# input cluster: the cluster label and the topic name of its first document
test = (
    topic_model.get_document_info(docs, df=full_corpus)
    .loc[:, ['cluster', 'Name']]
    .drop_duplicates(subset=['cluster'], keep='first')
)
print(test.shape)
test.head(10)

(206, 2)


Unnamed: 0,cluster,Name
0,INVN-1-1---,3_orientation_marketing_mo_orientations
684,INVN-1-2---,7_co_creation_service_logic
1276,INVN-1-3---,18_creativity_creative_team_employee
1711,INVN-1-4---,23_npd_portfolio_ffe_product
2094,INVN-1-5---,37_libraries_tics_capability_library
2382,INVN-1-6---,44_crowdsourcing_gamification_crowd_contests
2639,INVN-1-7---,54_tqm_qm_quality_scqm
2858,INVN-1-8---,58_nsd_service_kibs_hospice
3073,INVN-1-9---,61_culture_controls_esop_clan
3271,INVN-1-10---,76_ddi_launch_salesperson_brand




---



In [None]:
# Default
# Visualize topic similarity using heatmap (self similarity)
# `hm` is reused below: hm.data[0]['z'] is read as the topic-by-topic similarity matrix
hm = topic_model.visualize_heatmap()
#hm.write_html(f"{ROOT_FOLDER_PATH}/heatmap_updated.html")
hm

In [81]:
#pd.DataFrame(hm.data[0]['z'], columns=hm.data[0]['x']).to_csv(f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv', index=False)



---



## Coordinates

In [82]:
import numpy as np
import umap

def reduce_dimensionality(data, n_components=2, random_state=None):
    """Project `data` onto a low-dimensional space with UMAP.

    Args:
        data: Array-like with one row per item to embed.
        n_components: Number of output dimensions (default 2).
        random_state: Seed for UMAP. When None, the notebook-wide seed
            heatmap_settings['global']['seed'] is used.

    Returns:
        The reduced coordinates returned by UMAP's fit_transform, one row per input row.
    """
    if random_state is None:
        # Fall back to the seed from the settings so results are repeatable
        random_state = heatmap_settings['global']['seed']

    # Create a UMAP object with the desired settings
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)

    # Perform dimensionality reduction
    return reducer.fit_transform(data)

In [83]:
# Reduce dimensionality using UMAP: the heatmap's similarity matrix is the input,
# so each of its rows becomes one low-dimensional point
reduced_data = reduce_dimensionality(hm.data[0]['z'])
# Print the shape of the reduced data
print("Reduced data shape:", reduced_data.shape)

Reduced data shape: (206, 2)


In [84]:
# Coordinates table: one row per topic with its 2-D position, topic name and cluster label
dms = pd.DataFrame(reduced_data, columns=['x', 'y'])
# Aligned on the row index of tm_summary
dms['label'] = tm_summary['Name']
# Look up the cluster label through the topic name
cluster_by_topic_name = test.set_index('Name')['cluster']
dms['cluster'] = dms['label'].map(cluster_by_topic_name)
dms.head()

Unnamed: 0,x,y,label,cluster
0,10.140092,4.610168,0_transitions_transition_niche_regime,INVN-2-1---
1,11.565411,4.141211,1_tis_wind_systemic_functions,INVN-2-2---
2,12.196259,3.655769,2_ir_land_journalism_reporting,INVN-99-99---
3,13.745502,2.638422,3_orientation_marketing_mo_orientations,INVN-1-1---
4,13.214653,3.268765,4_dynamic_capabilities_dcs_capability,INVN-4-1---


In [85]:
# Write the coordinates table into this analysis' output folder
coordinates_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/coordinates.csv'
dms.to_csv(coordinates_path, index=False)

In [86]:
# Write the similarity matrix with the cluster labels of `dms` as column headers
heatmap_matrix_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_matrix.csv'
heatmap_matrix = pd.DataFrame(hm.data[0]['z'], columns=dms["cluster"])
heatmap_matrix.to_csv(heatmap_matrix_path, index=False)



---



## Melted similarity matrix (long format)

In [87]:
# Long ("melted") form of the similarity matrix: one row per pair with the columns
# Source, Target and Similarity, sorted from the most to the least similar pair.
hm_test = hm.data[0]['z']

# Wrap the matrix in a frame; its default integer column labels identify items by position
df = pd.DataFrame(hm_test)

# Positions strictly below the diagonal: every unordered pair once, no self-pairs
rows, cols = np.tril_indices(df.shape[0], k=-1)

similarity_df = (
    pd.DataFrame({
        'Source': df.columns[rows],
        'Target': df.columns[cols],
        'Similarity': df.values[rows, cols],
    })
    # Keep strictly positive similarities only (zero and negative values are dropped)
    .loc[lambda pairs: pairs['Similarity'] > 0]
    # Most similar pairs first
    .sort_values('Similarity', ascending=False)
)

similarity_df

Unnamed: 0,Source,Target,Similarity
20877,204,171,0.976255
56,11,1,0.973396
5256,103,3,0.972523
5632,106,67,0.972091
12072,155,137,0.971466
...,...,...,...
12956,161,76,0.000121
7315,121,55,0.000080
20389,202,88,0.000077
13264,163,61,0.000063


In [88]:
# Replace the integer ids in Source and Target with the labels from dms["cluster"].
# Run once per freshly built similarity_df: int() fails on values that are already labels.
cluster_by_position = dms["cluster"].to_dict()
for endpoint in ('Source', 'Target'):
  similarity_df[endpoint] = [
      cluster_by_position[int(topic_id)]  # Convert the id to integer
      for topic_id in similarity_df[endpoint]
  ]


In [89]:
# Preview of the first 100 rows of similarity_df
similarity_df.head(100)

Unnamed: 0,Source,Target,Similarity
20877,HA-2-4---,HA-2-3---,0.976255
56,INVN-2-99---,INVN-2-2---,0.973396
5256,INVTNS-1,INVN-1-1---,0.972523
5632,INVN-9-5---,INVN-9-1---,0.972091
12072,HA-1-6---,HA-1-2---,0.971466
...,...,...,...
9515,INVN-5-11---,INVN-5-4---,0.926539
13583,INVN-8-11---,INVN-8-99---,0.926066
2853,INVN-1-10---,INVN-1-1---,0.925354
1425,INVN-8-99---,INVN-11-99---,0.925264


In [90]:
# # prompt: Using dataframe similarity_df: Replace the values of columns Source and Target with the corresponding "cluster" in the `dms` data frame. Use the column "label" in `dms` to find the matches.

# # Assuming you have a dataframe named 'dms' with 'label' and 'cluster' columns

# # Create a dictionary mapping 'label' to 'cluster' from the 'dms' dataframe
# label_to_cluster = dict(zip(dms['label'], dms['cluster']))

# # Replace 'Source' column values with corresponding 'cluster' values
# similarity_df['Source'] = similarity_df['Source'].map(label_to_cluster)

# # Replace 'Target' column values with corresponding 'cluster' values
# similarity_df['Target'] = similarity_df['Target'].map(label_to_cluster)

# similarity_df.head()

In [91]:
# Write the melted similarity table into this analysis' output folder
heatmap_melted_path = f'{ROOT_FOLDER_PATH}/{heatmap_settings["metadata"]["heatmap_analysis_id"]}/heatmap_melted.csv'
similarity_df.to_csv(heatmap_melted_path, index=False)