In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from pathlib import Path
from collections import defaultdict
from itertools import cycle

# ML / NLP
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from bertopic import BERTopic

from sklearn.manifold import TSNE

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
# --- CONFIGURATION ---
# ROOT_PATH can be redirected without editing the notebook by setting the
# BIBLIOMETRICS_ROOT environment variable; when it is unset the original
# Google Drive location below is used.
DEFAULT_ROOT_PATH = "/Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/Bibliometrics_Drive"
ROOT_PATH = Path(os.environ.get("BIBLIOMETRICS_ROOT", DEFAULT_ROOT_PATH)).expanduser()
PROJECT_FOLDER = 'Q339_igem'            # folder under ROOT_PATH
ANALYSIS_ID = 'a01_tm__f01_e01__hdbs'   # analysis sub-folder inside the project folder
SETTINGS_FILE = "settings_analysis_directive_2025-08-04-16-40.json"  # JSON file inside the analysis folder
LEVEL = '0'                             # used to build the 'level{LEVEL}' sub-folder name

In [23]:
# --- HELPER FUNCTIONS ---

def load_settings(path):
    """
    Read a JSON settings file and return its parsed content.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    The object produced by ``json.load`` for the file content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_pickle(obj, path):
    """Serialize ``obj`` with pickle and write it to ``path`` (overwriting any existing file)."""
    with open(path, mode="wb") as handle:
        pickle.dump(obj, handle)

def load_pickle(path):
    """
    Read the file at ``path`` and return the object unpickled from it.

    Caution: unpickling can execute arbitrary code, so only use this on
    files that come from a trusted source.
    """
    with open(path, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored

In [82]:
def plot_clusters_improved(df_clean, rcs, output_path, min_label_points=20):
    """
    Scatter-plot every document coloured by cluster and label the larger clusters.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must provide the columns 'x', 'y', 'cluster_code' and 'color'.
        Rows with a missing 'x' or 'y' are ignored.
    rcs : pandas.DataFrame
        Must provide 'cluster_code' and 'cluster_label'; used to look up the
        text shown for each cluster.
    output_path : str or pathlib.Path
        Directory in which "plot_clusters_tsne.png" is written.
    min_label_points : int, optional
        A cluster is labelled only when it has MORE than this many plotted
        points (default 20, the previously hard-coded threshold).
    """
    output_path = Path(output_path)

    # Keep only rows that can actually be drawn.
    has_coords = df_clean['x'].notna() & df_clean['y'].notna()
    df_plot = df_clean[has_coords]

    # Create cluster name mapping from rcs
    cluster_name_map = dict(zip(rcs['cluster_code'], rcs['cluster_label']))

    # A cluster code that is absent from rcs leaves 'color' empty, which
    # matplotlib cannot draw; use the same neutral grey fallback as
    # plot_centroid_bubbles.
    point_colors = df_plot['color'].fillna('#666666')

    fig, ax = plt.subplots(figsize=(12, 12), dpi=300)

    # Plot points using colors from df_clean
    ax.scatter(df_plot['x'], df_plot['y'],
               c=point_colors, s=2.0, alpha=0.5, edgecolors='none')

    # One pass over the data gives the median centre and point count of every
    # cluster (groups come back sorted by cluster code).
    cluster_stats = df_plot.groupby('cluster_code').agg(
        x_center=('x', 'median'),
        y_center=('y', 'median'),
        n_points=('x', 'size'),
    )

    # Add Text Labels
    for row in cluster_stats.itertuples():
        # Only label if the cluster is somewhat large to avoid clutter
        if row.n_points <= min_label_points:
            continue

        # Get cluster name from rcs (fall back to the code itself)
        cluster_name = cluster_name_map.get(row.Index, str(row.Index))

        # Shorten underscore-separated names: with three or more parts keep the
        # first three joined by spaces, otherwise keep only the first part.
        words = str(cluster_name).split('_')
        short_name = " ".join(words[:3]) if len(words) > 2 else words[0]

        ax.text(row.x_center, row.y_center, short_name, fontsize=5, fontweight='bold',
                ha='center', va='center', alpha=0.9, color='black',
                bbox=dict(boxstyle="round,pad=0.1", fc="white", alpha=0.6, ec="none"))

    ax.axis('off')
    fig.tight_layout()
    fig.savefig(output_path / "plot_clusters_tsne.png")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

def plot_years_improved(df_clean, output_path):
    """
    Scatter-plot every document coloured by its year.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must provide the columns 'x', 'y' and 'year'. Rows with a missing
        coordinate, or whose year is missing, non-numeric or not greater than
        1900, are ignored.
    output_path : str or pathlib.Path
        Directory in which "plot_years_tsne.png" is written.
    """
    output_path = Path(output_path)

    # Coerce to numbers first: a year column read as text would otherwise make
    # the comparison below fail. Unparseable values become NaN and are dropped
    # by the `> 1900` test (NaN compares False).
    year = pd.to_numeric(df_clean['year'], errors='coerce')
    valid_mask = df_clean['x'].notna() & df_clean['y'].notna() & (year > 1900)

    # Sort by year so new papers are plotted ON TOP of old ones
    df_plot = (
        df_clean.loc[valid_mask, ['x', 'y']]
        .assign(year=year[valid_mask])
        .sort_values('year')
    )

    fig, ax = plt.subplots(figsize=(12, 12), dpi=300)

    # Using 'turbo' for a more "scientific" and impactful look
    sc = ax.scatter(df_plot['x'], df_plot['y'],
                    c=df_plot['year'], cmap='turbo',
                    s=2.0, alpha=0.5, edgecolors='none')

    cbar = fig.colorbar(sc, ax=ax, fraction=0.03, pad=0.04)
    cbar.set_label('Year', rotation=270, labelpad=15)

    ax.axis('off')
    fig.tight_layout()
    fig.savefig(output_path / "plot_years_tsne.png")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

In [47]:
# 1. SETUP
# Resolve the project folder, locate the settings JSON inside the analysis
# folder, and load it.
base_dir = Path(ROOT_PATH, PROJECT_FOLDER)
settings_path = base_dir.joinpath(ANALYSIS_ID, SETTINGS_FILE)
settings = load_settings(settings_path)

In [48]:
# Load RCS
rcs = pd.read_csv(base_dir / ANALYSIS_ID / f'level{LEVEL}' / 'rcs_merged.csv').reset_index(drop=True)

# Define color palette
fukan_colors = ["#f00f15", "#2270e7", "#e5e510", "#ff8103", "#4f3dd1", "#26cc3a", "#ec058e", "#9cb8c2", "#fffdd0", "#b40e68"]
fukan_colors_extended = fukan_colors + ["#5afb5a", "#beaed4", "#fdc086", "#99fdff", "#c430ff", "#e4dbe0", "#bf5b17", "#666666"]

# Add color column to rcs if it doesn't exist
if 'color' not in rcs.columns:
    # Get the number of rows in rcs (equivalent to length(id_com) in R)
    n_rows = len(rcs)
    # Recycle the palette to match the length of rcs. `cycle` comes from the
    # notebook's top import cell, so no import is needed in this cell; zip()
    # stops after n_rows items.
    color_palette = [color for color, _ in zip(cycle(fukan_colors_extended), range(n_rows))]
    rcs['color'] = color_palette
    print(f"Added 'color' column to rcs with {n_rows} colors (recycled from {len(fukan_colors_extended)} base colors)")
else:
    print("'color' column already exists in rcs")

Added 'color' column to rcs with 68 colors (recycled from 18 base colors)


In [93]:
# Inspect the columns available in rcs.
# NOTE(review): the saved output of this cell already lists 'cluster_label',
# which this notebook only assigns in the next cell; the output presumably
# comes from an out-of-order run. Re-run top to bottom to refresh it.
rcs.columns

Index(['cluster', 'cluster_code', 'main_cluster', 'cluster_name', 'documents',
       'documents_percent', 'documents_cummulative', 'PY_Min.', 'PY_X1st.Qu.',
       'PY_Median', 'PY_Mean', 'PY_X3rd.Qu.', 'PY_Max.', 'PY_sd', 'Z9_Min.',
       'Z9_X1st.Qu.', 'Z9_Median', 'Z9_Mean', 'Z9_X3rd.Qu.', 'Z9_Max.',
       'Z9_sd', 'participation', 'growth_rate', 'rcs_label', 'hub_title',
       'hub_year', 'hub_degree', 'hub_id', 'hub_type1', 'hub_type2', 'AU',
       'WC', 'Countries', 'Institutions', 'DE', 'ID', 'color',
       'cluster_label'],
      dtype='object')

In [94]:
# Build the text shown on the plots: "<cluster_code>-<hub_title>".
# When hub_title is missing the plain concatenation would produce NaN (drawn
# as the text "nan" by the plotting functions), so those rows fall back to the
# cluster code alone.
cluster_code_str = rcs['cluster_code'].astype(str)
code_and_title = cluster_code_str + "-" + rcs['hub_title'].astype(str)
rcs['cluster_label'] = code_and_title.where(rcs['hub_title'].notna(), cluster_code_str)
rcs.head()

Unnamed: 0,cluster,cluster_code,main_cluster,cluster_name,documents,documents_percent,documents_cummulative,PY_Min.,PY_X1st.Qu.,PY_Median,...,hub_type1,hub_type2,AU,WC,Countries,Institutions,DE,ID,color,cluster_label
0,1,1,1,,150,3.57,3.57,2009,2016.0,2021.0,...,ARTICLE,ARTICLE,georgia_state; bioplus-shanghai; frankfurt; gd...,fabrics; bioplastics; clean dyes and pigments;...,chn; usa; deu; can; jpn,bio+ (guangdong) institute of science and tech...,biosynthesis; production; yeast; coli; cerevisiae,manufacturing; high school; biomanufacturing; ...,#f00f15,"1-Cannabis \""\""Vaccine\""\"""
1,2,2,2,,122,2.91,6.48,2009,2017.0,2019.0,...,ARTICLE,ARTICLE,nju-china; afcm-egypt; cpu_china; fudan; lzu-c...,"biofilms; fertility, contraceptives, uterine h...",chn; usa; hkg; deu; egy,"nanjing university\n\n\nnanjing, jiangsu, chin...",cancer; therapy; system; cells; delivery,therapeutics; oncology; health & medicine; hig...,#2270e7,2-ErythrO2
2,3,3,3,,121,2.88,9.36,2009,2016.0,2019.0,...,ARTICLE,ARTICLE,washu_stlouis; sorbonne_u_paris; china-fafu; g...,antimicrobial resistance; algal blooms; clean ...,chn; usa; deu; fra; gbr,"tacoma, wa, usa; sorbonne université, \n\n\nfa...",; engineering; nitrogen; production; water,environment; high school; bioremediation; food...,#e5e510,3-Salt Vault
3,4,4,4,,120,2.86,12.22,2009,2011.0,2014.0,...,ARTICLE,ARTICLE,cambridge; macquarie_australia; bonn; brocku; ...,advancements in dna assembly; biosafety concer...,chn; usa; gbr; jpn; fra,tokyo university of agriculture and technology...,light; coli; ; e; control,new application; foundational advance; manufac...,#ff8103,4-Snow White
4,5,5,5,,115,2.74,14.96,2009,2015.5,2019.0,...,ARTICLE,ARTICLE,fdr-hb_peru; hbut-china; hust-china; cornell; ...,heavy metals; household and industrial waste; ...,chn; usa; can; deu; gbr,"university of lethbridge\n\n\nlethbridge, albe...",heavy; metal; cadmium; bioremediation; system,environment; high school; bioremediation; new ...,#4f3dd1,5-Highway To Platinum


In [83]:
# Load coords
coords_path = Path(base_dir, ANALYSIS_ID, 'document_coords_tsne.csv')
coords = pd.read_csv(coords_path)

# Rename columns to x and y.
# The rename is positional: the file must contain exactly four columns, in the
# order document uuid, UT, x coordinate, y coordinate.
coords.columns = ['uuid', 'UT', 'x', 'y']

coords.head()

Unnamed: 0,uuid,UT,x,y
0,791aefbd-0fde-417e-8e1b-2f7210647032,173,-38.240646,-27.92819
1,84d56c08-f62f-4bfd-986c-71a44c1af2ed,174,-54.59228,76.619385
2,8a69e820-ef66-4438-8ed2-08943ab2c6b1,175,15.818377,-33.132805
3,572f8dd4-bb30-4d59-a319-2a5e650cce90,176,15.20262,-69.07794
4,be21c9a1-0aac-4e87-86f5-f6023d374461,177,67.60592,-4.861532


In [84]:
# Load article report (the file is decoded as cp1250)
article_report_path = Path(base_dir, ANALYSIS_ID, f'level{LEVEL}', 'article_report.csv')
article_report = pd.read_csv(article_report_path, encoding='cp1250')
article_report = article_report.reset_index(drop=True)
article_report.head()


Unnamed: 0,Cluster Index,Cluster Code,Authors,Publication Years,DOI,Title,Abstract,Citations,Degree,Author Keywords,Categories,Countries,ID,uuid
0,1,1,BNU-China,2022,https://doi.org/https://2022.igem.wiki/bnu-china,"Cannabis \""\""Vaccine\""\""","In 2020, the United nations officially recogni...",1,1.0,cannabis; vaccine,RNAi/siRNA viral vectors; Biofilms; Gene thera...,CHN,4117,74aebc93-6208-436a-b088-06835e1e41ab
1,1,1,JSNU-China,2016,https://doi.org/https://2016.igem.org/Team:JSN...,Little strokes fell great oaks,Citrus fruits and vegetables contain much ant...,1,0.843277,little; strokes; fell; great; oaks,,CHN,2000,d1bbe07c-b05e-4974-8ad4-180df422bc76
2,1,1,JLU-CP,2024,https://doi.org/https://2024.igem.wiki/jlu-cp,PalettEmo,"PalettEmo is composed of \""\""Palette\""\"" and \...",1,0.821557,palettemo,Bio-based dyes and pigments; Protein- and cell...,CHN,5354,8a8c35f5-8d4f-48fb-bb00-a7cbbd56f729
3,1,1,CUHKSZ,2021,https://doi.org/https://2021.igem.org/Team:CUHKSZ,EthaNO,Alcoholic beverages usually play an important ...,1,0.804218,ethano,Addressing local issues facing farmers; Agricu...,CHN,4101,fa25db7c-8c4d-4854-8e77-006042aa4ba6
4,1,1,Tuebingen,2018,https://doi.org/https://2018.igem.org/Team:Tue...,BoNT C - Licence to enter,In modern medicine treatment options involve m...,1,0.764912,bont; c; licence; enter,,DEU,2689,e0250cbd-f610-4e45-a5d9-e2f44122b9cb


In [86]:
# Create df_clean
# 1. Select and rename columns from article_report (explicit mapping instead of
#    a positional column-list assignment)
df_clean = (
    article_report[['uuid', 'ID', 'Publication Years', 'Citations', 'Degree', 'Cluster Code']]
    .rename(columns={
        'ID': 'UT',
        'Publication Years': 'year',
        'Citations': 'citations',
        'Degree': 'degree',
        'Cluster Code': 'cluster_code',
    })
    # Normalise uuid: ensure strings and strip whitespace
    .assign(uuid=lambda d: d['uuid'].astype(str).str.strip())
)
coords_fixed = coords.assign(uuid=coords['uuid'].astype(str).str.strip())

# 2. Merge with coords to add x and y columns.
#    The join key is UT; because both frames carry 'uuid', the result holds
#    'uuid_x' (article_report) and 'uuid_y' (coords).
#    validate='many_to_one' makes pandas raise if a UT occurs more than once in
#    coords, which would otherwise silently duplicate documents.
df_clean = df_clean.merge(coords_fixed, on='UT', how='left', validate='many_to_one')

# 3. Assign colors by matching cluster with rcs
# Create a mapping from cluster to color from rcs
cluster_color_map = dict(zip(rcs['cluster_code'], rcs['color']))
df_clean['color'] = df_clean['cluster_code'].map(cluster_color_map)

# 4. Diagnostics
# NOTE(review): in the previously saved output of this cell, uuid_x and uuid_y
# differ on every displayed row even though UT matched. Confirm that UT really
# identifies the same document in article_report and in the coords file.
matched = df_clean['uuid_y'].notna()
n_uuid_mismatch = int((df_clean.loc[matched, 'uuid_x'] != df_clean.loc[matched, 'uuid_y']).sum())

print(f"df_clean shape: {df_clean.shape}")
print(f"Rows with coordinates: {df_clean['x'].notna().sum()}")
if n_uuid_mismatch:
    print(f"WARNING: {n_uuid_mismatch} rows matched on UT have different uuids in article_report and coords")
df_clean.head()

df_clean shape: (4199, 10)


Unnamed: 0,uuid_x,UT,year,citations,degree,cluster_code,uuid_y,x,y,color
0,74aebc93-6208-436a-b088-06835e1e41ab,4117,2022,1,1.0,1,8fca4f1a-823a-42de-8563-ed156dd37486,21.8012,74.85305,#f00f15
1,d1bbe07c-b05e-4974-8ad4-180df422bc76,2000,2016,1,0.843277,1,e4fa35cc-cc4a-4803-b61d-0a5d40446766,-30.392696,1.19809,#f00f15
2,8a8c35f5-8d4f-48fb-bb00-a7cbbd56f729,5354,2024,1,0.821557,1,f705cd80-ce8a-4c01-be23-27be15bab56c,-25.896702,-39.042973,#f00f15
3,fa25db7c-8c4d-4854-8e77-006042aa4ba6,4101,2021,1,0.804218,1,c34499a4-be94-4ece-9565-b7696411e818,-36.98501,-1.759328,#f00f15
4,e0250cbd-f610-4e45-a5d9-e2f44122b9cb,2689,2018,1,0.764912,1,4e6d566a-e6a3-423c-9ca0-f7b63bf610ed,21.506678,75.58264,#f00f15


## Visualize with TSNE

In [95]:
# Render the per-document t-SNE figures (cluster-coloured and year-coloured)
# into the level folder of this analysis.
# Note: point size (s=2.0) and alpha (0.5) are set inside the plotting functions.
output_dir = Path(base_dir, ANALYSIS_ID, f'level{LEVEL}')

plot_clusters_improved(df_clean=df_clean, rcs=rcs, output_path=output_dir)

plot_years_improved(df_clean=df_clean, output_path=output_dir)

print("Pipeline Finished Successfully.")

Pipeline Finished Successfully.


---

# Centroid-based plots
Instead of plotting individual data points, we can visualize the centroids of clusters to get a clearer overview of the data distribution. The size of each centroid represents the number of points in that cluster.

In [96]:
def plot_centroid_bubbles(df_clean, rcs, output_path):
    """
    Plots 1 bubble per cluster at the centroid location using df_clean and rcs.

    Size = number of rows of the cluster that have coordinates.
    Color = from the rcs dataframe (grey '#666666' when the cluster is not in rcs).

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must provide the columns 'x', 'y' and 'cluster_code'. Rows with a
        missing 'x' or 'y' are ignored.
    rcs : pandas.DataFrame
        Must provide 'cluster_code', 'color' and 'cluster_label'.
    output_path : str or pathlib.Path
        Directory in which "plot_centroids_sized.png" is written.

    Raises
    ------
    ValueError
        If no row has both coordinates and a cluster code, i.e. there is
        nothing to plot.
    """
    output_path = Path(output_path)

    # 1. Setup Data - Filter out rows without coordinates
    has_coords = df_clean['x'].notna() & df_clean['y'].notna()
    df_plot = df_clean[has_coords]

    # 2. Aggregate Data: mean position and row count per cluster in a single
    #    pass (groups come back sorted by cluster code).
    cluster_stats = df_plot.groupby('cluster_code').agg(
        x_center=('x', 'mean'),
        y_center=('y', 'mean'),
        n_points=('x', 'size'),
    )
    if cluster_stats.empty:
        # Without this guard the size normalisation below fails with an opaque
        # numpy error about reducing a zero-size array.
        raise ValueError("plot_centroid_bubbles: no rows with x/y coordinates and a cluster_code to plot")

    # 3. Create mappings from rcs
    cluster_color_map = dict(zip(rcs['cluster_code'], rcs['color']))
    cluster_name_map = dict(zip(rcs['cluster_code'], rcs['cluster_label']))

    colors = [cluster_color_map.get(code, '#666666') for code in cluster_stats.index]

    labels = []
    for code in cluster_stats.index:
        cluster_name = cluster_name_map.get(code, str(code))
        # Shorten underscore-separated names: with three or more parts keep the
        # first three joined by spaces (e.g. "10_dna_rna_gene" -> "10 dna rna"),
        # otherwise keep only the first part.
        words = str(cluster_name).split('_')
        labels.append(" ".join(words[:3]) if len(words) > 2 else words[0])

    centroids = cluster_stats[['x_center', 'y_center']].to_numpy()
    sizes = cluster_stats['n_points'].to_numpy()

    # 4. Scaling the Bubble Size
    # The largest cluster gets a marker area of 'scale_factor' (points^2);
    # every other bubble is proportional to it, with a floor of 50 so tiny
    # clusters stay visible.
    # Adjust 'scale_factor' to make bubbles bigger/smaller overall.
    scale_factor = 5000
    norm_sizes = np.maximum((sizes / sizes.max()) * scale_factor, 50)

    # 5. Plotting
    fig, ax = plt.subplots(figsize=(12, 12), dpi=300)

    # Scatter with transparency and white edges for "Bubble" look
    ax.scatter(centroids[:, 0], centroids[:, 1],
               s=norm_sizes, c=colors, alpha=0.7,
               edgecolors='white', linewidth=1.5)

    # 6. Labeling
    # We create a hierarchy: labels on huge bubbles are big, small are small
    largest_bubble = norm_sizes.max()
    label_threshold = sizes.max() * 0.02
    for (x_center, y_center), txt, n_points, bubble in zip(centroids, labels, sizes, norm_sizes):
        # Only label if cluster is significant enough to not clutter
        if n_points <= label_threshold:
            continue

        # Font size grows with bubble size: just above 6pt for the smallest
        # bubbles, 14pt for the largest.
        f_size = 6 + (bubble / largest_bubble) * 8

        ax.text(x_center, y_center, txt,
                ha='center', va='center',
                fontsize=f_size, fontweight='bold', color='black',
                bbox=dict(boxstyle="round,pad=0.1", fc="white", alpha=0.3, ec="none"))

    ax.axis('off')
    fig.tight_layout()
    fig.savefig(output_path / "plot_centroids_sized.png")
    print(f"Saved centroid bubble plot to {output_path}")
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

In [97]:
    
# 3. Centroid Bubbles
# Write the one-bubble-per-cluster figure into the same output folder.
plot_centroid_bubbles(df_clean=df_clean, rcs=rcs, output_path=output_dir)

print("Pipeline Finished Successfully.")

Saved centroid bubble plot to /Users/cristian/Library/CloudStorage/GoogleDrive-cristianmejia00@gmail.com/My Drive/Bibliometrics_Drive/Q339_igem/a01_tm__f01_e01__hdbs/level0
Pipeline Finished Successfully.
