Load packages & define utils

In [30]:
# Seed passed as `random_state` to the PCA, UMAP and t-SNE estimators in this notebook.
random_state = 42 # notebook consistency

from typing import List
import time
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

# For unsupervised hyperparameter tuning
def mean_jaccard(X_high, X_low, k=10):
    """
    Mean Jaccard overlap between the k-nearest-neighbour sets of every sample
    in the original space and in a low-dimensional embedding.

    For each sample, the indices of its k nearest neighbours (the sample
    itself excluded) are found once in X_high and once in X_low; the two
    index sets are compared with the Jaccard index and the per-sample
    scores are averaged.

    ==== Formula ====
    J = |A and B| / |A or B|

    ==== Example ====
    A,B = ([1,2,3],[2,3,4])
    A and B = {2,3}     -> |A and B| = 2
    A or B  = {1,2,3,4} -> |A or B| = 4

    Hence,
    J = 2 / 4 = 0.5

    Parameters
    ----------
    X_high : array-like of shape (n_samples, n_features)
        Samples in the original space.
    X_low : array-like of shape (n_samples, n_components)
        Embedding of the same samples, in the same row order.
    k : int, default=10
        Number of neighbours per sample; must be at least 1.

    Returns
    -------
    float
        Average of the per-sample Jaccard indices (0 = no shared
        neighbours, 1 = identical neighbour sets).

    Raises
    ------
    ValueError
        If k < 1 or if X_high and X_low have different numbers of rows.
    """
    if k < 1:
        # k = 0 would produce empty neighbour sets and a 0 / 0 below.
        raise ValueError(f"k must be >= 1, got {k}")
    if len(X_high) != len(X_low):
        # zip() further down would otherwise silently drop the extra rows.
        raise ValueError(
            f"X_high and X_low must have the same number of rows, "
            f"got {len(X_high)} and {len(X_low)}"
        )

    # Calling kneighbors() without a query returns the neighbours of the
    # fitted points and does not count a point as its own neighbour.
    # Querying with the data itself and slicing off column 0 is not
    # equivalent: when rows are duplicated, the point itself is not
    # guaranteed to come first, so a true neighbour could be dropped instead.
    neigh_high = NearestNeighbors(n_neighbors=k).fit(X_high).kneighbors(return_distance=False)
    neigh_low = NearestNeighbors(n_neighbors=k).fit(X_low).kneighbors(return_distance=False)

    overlaps = [
        len(set(h) & set(l)) / len(set(h) | set(l))
        for h, l in zip(neigh_high, neigh_low)
    ]
    return np.mean(overlaps)

Import data

In [2]:
# Column position at which the table is split: columns before it go to X,
# the remaining columns go to y.
# NOTE(review): presumably X holds the gene-expression features and y the
# label/annotation columns -- confirm against the CSV header.
N_FEATURE_COLUMNS = 353

# The first CSV column is used as the row index.
data = pd.read_csv('BRCA_data.csv', index_col=0)

# Positional split; .copy() gives X and y their own data, independent of `data`.
X = data.iloc[:, :N_FEATURE_COLUMNS].copy()
y = data.iloc[:, N_FEATURE_COLUMNS:].copy()

Goal: find latent groupings or patterns in the gene-expression data.

# PCA

In [3]:
# Project X onto its first two principal components.
pca = PCA(n_components=2, random_state=random_state)
X_pca = pca.fit_transform(X)

# Interactive scatter of the projection; trace and layout styling are chained
# onto the figure returned by px.scatter.
fig = (
    px.scatter(
        x=X_pca[:, 0],
        y=X_pca[:, 1],
        labels={'x': 'PC1', 'y': 'PC2'},
        title="PCA Projection of BRCA Gene Expression (2 Components)",
        width=800,
        height=600,
        opacity=0.8,
    )
    .update_traces(marker={'size': 6, 'color': 'royalblue', 'line': {'width': 0}})
    .update_layout(
        template='plotly_white',
        font={'size': 14},
        title_x=0.5,
        title_font={'size': 18},
        margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
    )
)

fig.show()


In [4]:
# Score the PCA projection: mean Jaccard overlap of the 10-nearest-neighbour
# sets computed in X and in X_pca.
method = 'PCA'
params_dict = dict(X_high=X, X_low=X_pca, k=10)
J = mean_jaccard(**params_dict)
print(f'{method}: Jaccard = {J:.4f}')

PCA: Jaccard = 0.0910


# UMAP

In [7]:
# UMAP search space; the grid search below tries every combination of these values.
hyperparameter_grid = dict(
    n_neighbors=[5, 10, 15, 30, 50],
    min_dist=[0.0, 0.1, 0.3, 0.5],
    metric=['euclidean', 'cosine', 'correlation'],
)

In [None]:
# ---------------------------------
# Generate all parameter combinations
# ---------------------------------
# Full Cartesian product of the grid: one (n_neighbors, min_dist, metric) tuple per fit.
param_combos = [
    (n, d, m)
    for n in hyperparameter_grid['n_neighbors']
    for d in hyperparameter_grid['min_dist']
    for m in hyperparameter_grid['metric']
]

# One dict per combination; converted to a DataFrame once the loop has finished.
results = []

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start = time.perf_counter()

# ---------------------------------
# Grid Search
# ---------------------------------
for (n_neighbors, min_dist, metric) in tqdm(param_combos, desc="Tuning UMAP", unit="combo"):
    umap = UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
        n_jobs=1
    )

    X_umap = umap.fit_transform(X)

    # Score this embedding by how well it keeps each sample's 10 nearest neighbours.
    # NOTE(review): mean_jaccard is given no metric, so the neighbours in X are
    # found the same way for every candidate `metric` -- confirm this is the
    # intended yardstick for the 'cosine' and 'correlation' runs.
    params_dict = {
        'X_high': X,
        'X_low': X_umap,
        'k': 10,   # fixed neighborhood size
    }
    J = mean_jaccard(**params_dict)

    results.append({
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'metric': metric,
        'jaccard': J
    })

end = time.perf_counter()
runtime = end - start

print(f"\nGrid search completed in {runtime:.2f} seconds")

# ---------------------------------
# Analyze Results
# ---------------------------------
results_df = pd.DataFrame(results)
# idxmax() gives the label of the first row with the highest score.
best_row = results_df.loc[results_df['jaccard'].idxmax()]
print("\nBest UMAP configuration:")
print(best_row)

Tuning UMAP: 100%|██████████| 60/60 [00:32<00:00,  1.87combo/s]


Grid search completed in 32.05 seconds

Best UMAP configuration:
n_neighbors           10
min_dist             0.1
metric         euclidean
jaccard         0.224755
Name: 15, dtype: object





In [None]:
# ---------------------------------
# Fit Best Model
# ---------------------------------
# Refit UMAP with the three tuned values read from `best_row`.
umap = UMAP(
    n_components=2,
    random_state=random_state,
    n_jobs=1,
    **{name: best_row[name] for name in ('n_neighbors', 'min_dist', 'metric')}
)
X_umap = umap.fit_transform(X)

# ---------------------------------
# Visualization
# ---------------------------------
# Interactive scatter of the embedding; styling is chained onto the figure.
fig = (
    px.scatter(
        x=X_umap[:, 0],
        y=X_umap[:, 1],
        labels={'x': 'UMAP1', 'y': 'UMAP2'},
        title="UMAP Projection of BRCA Gene Expression (2 Components)",
        width=800,
        height=600,
        opacity=0.8,
    )
    .update_traces(marker={'size': 6, 'color': 'tomato', 'line': {'width': 0}})
    .update_layout(
        template='plotly_white',
        font={'size': 14},
        title_x=0.5,
        title_font={'size': 18},
        margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
    )
)
fig.show()

# t-SNE

In [31]:
# t-SNE search space; the grid search below tries every combination of these values.
hyperparameter_grid = dict(
    perplexity=[5, 10, 20, 30, 40],
    learning_rate=[50, 100, 200],
    metric=['euclidean', 'cosine'],
)

In [33]:
# ---------------------------------
# Generate all parameter combinations
# ---------------------------------
# Full Cartesian product of the grid: one (perplexity, learning_rate, metric) tuple per fit.
param_combos = [
    (p, lr, m)
    for p in hyperparameter_grid['perplexity']
    for lr in hyperparameter_grid['learning_rate']
    for m in hyperparameter_grid['metric']
]

# One dict per combination; converted to a DataFrame once the loop has finished.
results = []

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start = time.perf_counter()

# ---------------------------------
# Grid Search
# ---------------------------------
for (perplexity, learning_rate, metric) in tqdm(param_combos, desc="Tuning t-SNE", unit="combo"):

    tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        metric=metric,
        random_state=random_state,
        init='pca',
        max_iter=1000,
        verbose=0
    )

    X_tsne = tsne.fit_transform(X)

    # Score this embedding by how well it keeps each sample's 10 nearest neighbours.
    # NOTE(review): mean_jaccard is given no metric, so the neighbours in X are
    # found the same way for every candidate `metric` -- confirm this is the
    # intended yardstick for the 'cosine' runs.
    params_dict = {
        'X_high': X,
        'X_low': X_tsne,
        'k': 10,    # fixed local neighborhood
    }
    J = mean_jaccard(**params_dict)

    results.append({
        'perplexity': perplexity,
        'learning_rate': learning_rate,
        'metric': metric,
        'jaccard': J
    })

end = time.perf_counter()
runtime = end - start

print(f"\nGrid search completed in {runtime:.2f} seconds")

# ---------------------------------
# Analyze Results
# ---------------------------------
results_df = pd.DataFrame(results)
# idxmax() gives the label of the first row with the highest score.
best_row = results_df.loc[results_df['jaccard'].idxmax()]
print("\nBest t-SNE configuration:")
print(best_row)

Tuning t-SNE: 100%|██████████| 30/30 [00:35<00:00,  1.18s/combo]


Grid search completed in 35.27 seconds

Best t-SNE configuration:
perplexity              30
learning_rate           50
metric           euclidean
jaccard           0.253707
Name: 18, dtype: object





In [35]:

# ---------------------------------
# Fit Best Model
# ---------------------------------
# Refit t-SNE with the three tuned values read from `best_row`.
tsne = TSNE(
    n_components=2,
    random_state=random_state,
    init='pca',
    max_iter=1000,
    **{name: best_row[name] for name in ('perplexity', 'learning_rate', 'metric')}
)
X_tsne = tsne.fit_transform(X)

# ---------------------------------
# Visualization
# ---------------------------------
# Interactive scatter of the embedding; styling is chained onto the figure.
fig = (
    px.scatter(
        x=X_tsne[:, 0],
        y=X_tsne[:, 1],
        labels={'x': 't-SNE1', 'y': 't-SNE2'},
        title="t-SNE Projection of BRCA Gene Expression (2 Components)",
        width=800,
        height=600,
        opacity=0.8,
    )
    .update_traces(marker={'size': 6, 'color': 'mediumseagreen', 'line': {'width': 0}})
    .update_layout(
        template='plotly_white',
        font={'size': 14},
        title_x=0.5,
        title_font={'size': 18},
        margin={'l': 60, 'r': 60, 't': 80, 'b': 60},
    )
)
fig.show()
