# Annotation Embedding Comparisons

### UMAP vs t-SNE vs VAE

In this notebook we compare embeddings of the annotation-transformed expression data with embeddings of the untransformed data across different non-linear embedding methods (UMAP, t-SNE, and a VAE).

In [1]:
import jscatter
import numpy as np
import pandas as pd
import re
import transformation
import colors
from glob import glob

In [2]:
# Name prefix of the dataset to analyse. It selects the input file under
# `data/mair-2022/` and is part of every embedding file name that the cells
# below pass as `save_as` or read back from `data/`.
dataset_name = 'TUMOR_006'

**Data Loading:**

In [3]:
# Resolve the dataset file from its name prefix. The matches are sorted so the
# choice is deterministic when several files share the prefix (glob returns
# results in arbitrary order), and a missing file is reported explicitly
# instead of surfacing as an opaque IndexError.
dataset_matches = sorted(glob(f'data/mair-2022/{dataset_name}*'))
if not dataset_matches:
    raise FileNotFoundError(
        f'No dataset file matching "data/mair-2022/{dataset_name}*" was found'
    )

dataset = dataset_matches[0]
df = pd.read_parquet(dataset)

**Data Preparation:**

In [4]:
from importlib import reload
import transformation

# Re-import the local module so that edits to it take effect without
# restarting the kernel.
reload(transformation)

# `prepare` returns three values, each of which is reused by later cells.
markers, expression_levels, raw_expressions = transformation.prepare(df)

# Summarise what was extracted from the data frame.
print('Markers:', ', '.join(markers))
print('Expression Levels:', ' and '.join(expression_levels.keys()))

Markers: CD4, CD8, CD3, CD45RA, CD27, CD19, CD103, CD28, CD69, PD1, HLADR, GranzymeB, CD25, ICOS, TCRgd, CD38, CD127, Tim3
Expression Levels: + and -


**Data Transformation:**

In [6]:
from importlib import reload
import transformation

# Refresh the local module so that recent edits to it are used.
reload(transformation)

# Slow step: the recorded run took roughly 2.5 minutes in total.
# `log=True` presumably enables the per-batch progress lines shown in the
# output — confirm in transformation.py.
transformed_expressions = transformation.transform(
    df,
    markers,
    expression_levels,
    log=True,
)

Transform 0-999 of 5388 clusters... done! (29s)
Transform 1000-1999 of 5388 clusters... done! (28s)
Transform 2000-2999 of 5388 clusters... done! (29s)
Transform 3000-3999 of 5388 clusters... done! (29s)
Transform 4000-4999 of 5388 clusters... done! (29s)
Transform 5000-5999 of 5388 clusters... done! (12s)


**UMAP Embedding:**

In [7]:
from sklearn.decomposition import PCA
from umap import UMAP

# 2D PCA of the winsorized marker columns, used as the shared initial layout
# for both UMAP runs so the two embeddings start from the same configuration.
# `random_state` is fixed because, depending on the scikit-learn version, the
# automatically selected SVD solver can be a randomized one; without a seed
# the initial layout (and hence both embeddings) could vary between runs even
# though UMAP itself is seeded.
pca = PCA(n_components=2, random_state=42).fit_transform(
    df[[f'{m}_Windsorized' for m in markers]].values
)

# Embedding of the annotation-transformed expressions.
df_ann_embed_umap = transformation.embed(
    df,
    transformed_expressions,
    UMAP(init=pca, random_state=42),
    save_as=f'{dataset_name}_umap_ann'
)

# Embedding of the untransformed (raw) expressions, with identical settings.
df_raw_embed_umap = transformation.embed(
    df,
    raw_expressions,
    UMAP(init=pca, random_state=42),
    save_as=f'{dataset_name}_umap_raw'
)

## Visualize Embedding

In [19]:
from importlib import reload
import colors

# Refresh the local `colors` module so that recent edits to it are used.
reload(colors)

# Two phenotypes that are identical except for CD38 (+ vs -)
phenotypes_cd38 = [
    'CD4-CD8+CD3+CD45RA-CD27+CD19-CD103+CD28+CD69+PD1+HLADR-GranzymeB-CD25-ICOS-TCRgd-CD38+CD127-Tim3-',
    'CD4-CD8+CD3+CD45RA-CD27+CD19-CD103+CD28+CD69+PD1+HLADR-GranzymeB-CD25-ICOS-TCRgd-CD38-CD127-Tim3-',
]

# Two CD8 T cell phenotypes that are identical except for CD69 (+ vs -)
phenotypes_cd8_t_cells = [
    # activated CD8 T cell (CD69+)
    'CD4-CD8+CD3+CD45RA-CD27-CD19-CD103-CD28+CD69+PD1+HLADR-GranzymeB-CD25-ICOS-TCRgd-CD38-CD127+Tim3-',
    # non-activated CD8 T cell (CD69-)
    'CD4-CD8+CD3+CD45RA-CD27-CD19-CD103-CD28+CD69-PD1+HLADR-GranzymeB-CD25-ICOS-TCRgd-CD38-CD127+Tim3-',
]

# Dark gray first, followed by three repetitions of the Glasbey light palette.
color_map = [colors.gray_dark] + colors.glasbey_light * 3

# Keyword arguments shared by every scatter plot in this notebook.
view_config = {
    'x': 'x',
    'y': 'y',
    'color_by': 'cellType',
    'color_map': color_map,
    'background_color': 'black',
    'axes': False,
    'opacity_unselected': 0.05,
}

# Keyword arguments shared by every composed (side-by-side) view.
compose_config = {
    'sync_selection': True,
    'sync_hover': True,
    'row_height': 640,
}

In [13]:
# Load the saved UMAP embeddings from disk. This rebinds `df_ann_embed_umap`
# and `df_raw_embed_umap`, replacing any frames computed earlier in the session.
df_ann_embed_umap = pd.read_parquet(f'data/{dataset_name}_umap_ann.pq')
df_raw_embed_umap = pd.read_parquet(f'data/{dataset_name}_umap_raw.pq')

# One scatter plot per embedding, both using the shared view settings.
plot_ann_embed_umap, plot_raw_embed_umap = (
    jscatter.Scatter(data=embedding, **view_config)
    for embedding in (df_ann_embed_umap, df_raw_embed_umap)
)

# Show the annotation-based and the raw embedding next to each other.
jscatter.compose([plot_ann_embed_umap, plot_raw_embed_umap], **compose_config)

GridBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(wi…

In [14]:
# Rows whose cell type is one of the two CD38 phenotypes
cd38_cells = df_ann_embed_umap.query('cellType in @phenotypes_cd38')
cell_idxs = cd38_cells.index

# Select those cells in the annotation-based UMAP plot.
plot_ann_embed_umap.selection(cell_idxs)

<jscatter.jscatter.Scatter at 0x104ad9460>

**t-SNE embeddings:**

In [90]:
import transformation
from openTSNE.sklearn import TSNE
from importlib import reload
reload(transformation)

# Fixed seed so that the t-SNE layouts are reproducible, matching the seeded
# UMAP embeddings (an unseeded run yields a different layout every time).
tsne_seed = 42

# The `save_as` names correspond to the files read by the t-SNE loading cell,
# so a top-to-bottom run visualises what was just computed (assuming `embed`
# writes `data/<save_as>.pq` — confirm in transformation.py).
df_ann_embed_tsne = transformation.embed(
    df,
    transformed_expressions,
    TSNE(n_jobs=-1, random_state=tsne_seed),
    save_as=f'{dataset_name}_tsne_ann',
)
df_raw_embed_tsne = transformation.embed(
    df,
    raw_expressions,
    TSNE(n_jobs=-1, random_state=tsne_seed),
    save_as=f'{dataset_name}_tsne_raw',
)


KeyboardInterrupt



In [20]:
# Load the saved t-SNE embeddings from disk. This rebinds `df_ann_embed_tsne`
# and `df_raw_embed_tsne`, replacing any frames computed earlier in the session.
df_ann_embed_tsne = pd.read_parquet(f'data/{dataset_name}_tsne_ann.pq')
df_raw_embed_tsne = pd.read_parquet(f'data/{dataset_name}_tsne_raw.pq')

# One scatter plot per embedding, both using the shared view settings.
plot_ann_embed_tsne, plot_raw_embed_tsne = (
    jscatter.Scatter(data=embedding, **view_config)
    for embedding in (df_ann_embed_tsne, df_raw_embed_tsne)
)

# Show the annotation-based and the raw embedding next to each other.
jscatter.compose([plot_ann_embed_tsne, plot_raw_embed_tsne], **compose_config)

GridBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(wi…

In [21]:
# Index of the cells with one of the two CD38 phenotypes, looked up in the
# annotation-based frame and applied to both plots.
cd38_idxs = df_ann_embed_tsne.query('cellType in @phenotypes_cd38').index

plot_ann_embed_tsne.selection(cd38_idxs)
plot_raw_embed_tsne.selection(cd38_idxs)

<jscatter.jscatter.Scatter at 0x1ba73e5e0>

In [22]:
# Recolour both t-SNE plots by the winsorized CD38 column.
cd38_coloring = dict(by='CD38_Windsorized', map='viridis')

plot_ann_embed_tsne.color(**cd38_coloring)
plot_raw_embed_tsne.color(**cd38_coloring)

<jscatter.jscatter.Scatter at 0x1ba73e5e0>

In [23]:
# Select the cells with one of the two CD8 T cell phenotypes in the
# annotation-based t-SNE plot.
cd8_t_cell_idxs = df_ann_embed_tsne.query('cellType in @phenotypes_cd8_t_cells').index
plot_ann_embed_tsne.selection(cd8_t_cell_idxs)

<jscatter.jscatter.Scatter at 0x1ba95ae50>

In [24]:
# Recolour both t-SNE plots by the winsorized CD69 column.
cd69_coloring = dict(by='CD69_Windsorized', map='viridis')

plot_ann_embed_tsne.color(**cd69_coloring)
plot_raw_embed_tsne.color(**cd69_coloring)

<jscatter.jscatter.Scatter at 0x1ba73e5e0>

**VAE:**

In [43]:
from importlib import reload
from sklearn.preprocessing import MinMaxScaler

import transformation
import vae

# Refresh both local modules so that recent edits to them are used.
reload(vae)
reload(transformation)

# Winsorized marker columns as a plain array (third input besides the
# annotation-transformed and the raw expressions).
winsorized_expression = df[[f'{m}_Windsorized' for m in markers]].values

# Rescale each input independently with its own min-max scaler.
transformed_expressions_norm = MinMaxScaler().fit_transform(transformed_expressions)
winsorized_expression_norm = MinMaxScaler().fit_transform(winsorized_expression)
raw_expression_norm = MinMaxScaler().fit_transform(raw_expressions)

# One model per input, each created with the number of markers.
n_markers = len(markers)
vae_ann = vae.create(n_markers)
vae_win = vae.create(n_markers)
vae_raw = vae.create(n_markers)

# Train all three models with identical settings.
fit_options = dict(epochs=50, batch_size=256)
vae_ann.fit(transformed_expressions_norm, **fit_options)
vae_win.fit(winsorized_expression_norm, **fit_options)
vae_raw.fit(raw_expression_norm, **fit_options)

# The third output of each encoder is kept as the embedding
# (presumably the sampled latent vector — confirm in vae.py).
df_ann_embed_vae = transformation.to_df(
    df,
    vae_ann.encoder.predict(transformed_expressions_norm)[2],
    save_as=f'{dataset_name}_vae_ann',
)
df_win_embed_vae = transformation.to_df(
    df,
    vae_win.encoder.predict(winsorized_expression_norm)[2],
    save_as=f'{dataset_name}_vae_win',
)
df_raw_embed_vae = transformation.to_df(
    df,
    vae_raw.encoder.predict(raw_expression_norm)[2],
    save_as=f'{dataset_name}_vae_raw',
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
# Load the saved VAE embeddings from disk. This rebinds the three
# `df_*_embed_vae` frames, replacing any computed earlier in the session.
df_ann_embed_vae = pd.read_parquet(f'data/{dataset_name}_vae_ann.pq')
df_win_embed_vae = pd.read_parquet(f'data/{dataset_name}_vae_win.pq')
df_raw_embed_vae = pd.read_parquet(f'data/{dataset_name}_vae_raw.pq')

# One scatter plot per embedding, all using the shared view settings.
plot_ann_embed_vae, plot_win_embed_vae, plot_raw_embed_vae = (
    jscatter.Scatter(data=embedding, **view_config)
    for embedding in (df_ann_embed_vae, df_win_embed_vae, df_raw_embed_vae)
)

# Show the annotation-based, winsorized, and raw embeddings side by side.
jscatter.compose(
    [plot_ann_embed_vae, plot_win_embed_vae, plot_raw_embed_vae],
    **compose_config,
)

GridBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(wi…