# tSNE Template

This colab is designed to create tSNE plots of the well mean embedding of hypnozoites to see which wells are like other wells.

* The data for this notebook is the "well mean embedding of hypnozoites":
  * The schizonts and artifacts are ignored.
  * The embeddings for all the hypnozoites in a well are averaged by simply taking the mean in each of the 192 dimensions.
  * By default, the DAPI stain (the first 64 of the 192 dimensions) is dropped because we want to look at the parasite-specific stains.
  * Thus the result for each well is a single 128-dimensional embedding vector. So the plots have one dot for each well in the graph.
  * [tSNE](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) is a method for reducing dots in a high dimensional space into 2 dimensions for better visualization.
  * The tSNE plot is only a visualization. If you want to reason about or measure the true distance between the dots, go back to the embeddings themselves; for example, you can run [k-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) on the well mean embeddings.

===========

This colab is a TEMPLATE! To get started, CREATE A COPY, delete this text, and describe your goal for this colab.



In [None]:
#@title Run this cell only the FIRST time you connect to the colab kernel
# Installs gcsfs into the kernel. Presumably this is what lets fsspec open the
# gs:// parquet paths loaded later in this notebook -- TODO confirm.
!pip install gcsfs

In [None]:
#@title (Not required for connection to the Google Research Bucket) Run this cell after restarting your kernel
# Runs the Colab user authentication flow. Presumably needed before the Cloud
# Storage calls made later in this notebook -- TODO confirm for your bucket.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Root location of your data.
# NOTE(review): no other visible cell reads DATA_ROOT -- confirm it is still needed.
DATA_ROOT = 'gs://path/to/your/data/'

In [None]:
# Cloud Storage settings. GCS_BUCKET is the bucket that glob_cloud() lists when
# the parquet paths are generated.
# NOTE(review): GCS_PROJECT and GCS_REGION are not read by any visible cell.
GCS_PROJECT = 'your_project_name'
GCS_BUCKET = 'your_bucket_name'
GCS_REGION = 'your_region_name'

In [None]:
# Names of the batches to load. Each name is substituted into EMB_PREFIX when
# the parquet file paths are generated.
# NOTE(review): BATCHES_TO_LOAD is assigned again in the "Generate parquet file
# paths" section; when cells run top to bottom that later value wins.
BATCHES_TO_LOAD = ['your-batch-names']


In [None]:
#@title Imports and code setup
import os
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.spatial.distance as distance
from sklearn import ensemble, metrics, preprocessing, manifold, decomposition, mixture, linear_model, neighbors
import altair as alt
import fsspec
from google.cloud import storage
alt.data_transformers.disable_max_rows()

def make_tsne(df, random_state=10, perplexity=30.0):
  """Project the rows of `df` to 2D with tSNE on pairwise cosine distances.

  Args:
    df: DataFrame with one numeric embedding vector per row. Its index is
      carried through so every point can be traced back to its row.
    random_state: Seed passed to TSNE. The default (10) reproduces the layout
      of earlier runs of this notebook.
    perplexity: TSNE perplexity. 30.0 is the scikit-learn default, so leaving
      it alone keeps the previous behaviour. Lower it for small subsets:
      recent scikit-learn versions reject perplexity >= number of rows.

  Returns:
    DataFrame with the index levels of `df` as leading columns followed by
    the tSNE coordinates in columns 'x' and 'y'.
  """
  # pdist returns the condensed distance vector; TSNE with
  # metric="precomputed" is given the full square matrix.
  dist_square = distance.squareform(distance.pdist(df, metric="cosine"))
  tsne = manifold.TSNE(
      n_components=2,
      random_state=random_state,
      perplexity=perplexity,
      metric="precomputed",
      init="random",
  )
  coords = tsne.fit_transform(dist_square)
  emb_df = pd.DataFrame(coords, index=df.index, columns=['x', 'y'])
  # Append x/y to the index, then flatten every index level into columns so
  # the result is a plain table: original index levels first, then x and y.
  return emb_df.set_index(['x', 'y'], append=True).reset_index()


def create_parquet_filter(field, values_to_keep):
  """Build an OR filter that keeps rows whose `field` equals any given value.

  Conditions placed together in one inner list are ANDed, while the inner
  lists themselves are ORed. Each value therefore gets its own
  single-condition inner list, giving a list of lists of tuples.

  Args:
    field: Name of the column to filter on.
    values_to_keep: Iterable of values; a row is kept if it matches any one.

  Returns:
    A list with one `[(field, '=', value)]` entry per value, in input order.
  """
  return [[(field, '=', value)] for value in values_to_keep]


def load_parquet_to_emb_df(filepath_list, filter_list=None,
                           embedding_name='mean_embedding'):
  """Load embedding parquet files into one dataframe with a column per dimension.

  Args:
    filepath_list: Paths of the parquet files to read; each is opened with
      fsspec.
    filter_list: Optional row filter passed to `pd.read_parquet(filters=...)`,
      e.g. the output of create_parquet_filter. Falsy means no filtering.
    embedding_name: Name of the column that holds the embedding vector.

  Returns:
    DataFrame whose columns are the embedding dimensions, named '0', '1', ...
    as strings, and whose index is made of every other column (including the
    index the parquet data was loaded with).

  Raises:
    ValueError: If no paths are given or the loaded data has no rows.
  """
  if not filepath_list:
    # pd.concat([]) would fail with a much less helpful message.
    raise ValueError('No parquet file paths were given, nothing to load.')

  # Only pass `filters` when one was requested so the unfiltered read is
  # exactly the plain pd.read_parquet(f) call.
  read_kwargs = {'filters': filter_list} if filter_list else {}
  frames = []
  for path in filepath_list:
    with fsspec.open(path) as f:
      frames.append(pd.read_parquet(f, **read_kwargs))

  emb_df = pd.concat(frames)
  if len(emb_df) == 0:
    raise ValueError('The embedding dataframe is empty, did you use the right filters?')

  # These columns are not wanted in the result; drop them when present.
  unwanted = [col for col in ['image', 'channel_order'] if col in emb_df.columns]
  emb_df = emb_df.drop(columns=unwanted)

  # Expand the embedding column into one column per dimension. The number of
  # dimensions is taken from the data rather than assumed.
  expanded = pd.DataFrame([pd.Series(x) for x in emb_df[embedding_name]])
  dim_cols = [str(i) for i in range(expanded.shape[1])]
  expanded.columns = dim_cols

  # reset_index() gives both frames the same 0..n-1 index so the column-wise
  # concat lines the rows up, and keeps the loaded index as regular columns.
  emb_df = pd.concat([emb_df.reset_index(), expanded], axis=1)
  emb_df = emb_df.drop(columns=[embedding_name])

  # Everything that is not an embedding dimension becomes part of the index.
  dim_col_set = set(dim_cols)
  index_cols = [c for c in emb_df.columns if c not in dim_col_set]
  return emb_df.set_index(index_cols)


# Generate parquet file paths

By default, we assume that you want to load the canonical data for one or more batches. Use this code to create the parquet file paths to load.

If you have different parquet paths that you want to load, you can always skip these steps and manually define PARQUET_EMB_PATHS as a list of parquet files to open.

In [None]:
# Batches to load; each name replaces the '%s' placeholder in EMB_PREFIX.
BATCHES_TO_LOAD = ['your-batch-names']
# Dataset location; it is joined into EMB_PREFIX below.
DATASET_TO_USE = 'gs://path/to/your/dataset'

# The prefix for any well mean files to load. The trailing '%s/' is a
# placeholder that is filled in per batch (EMB_PREFIX % batch) when the files
# are listed.
# NOTE(review): 'you_folder' looks like a typo for 'your_folder'; it is a
# placeholder to replace either way.
EMB_PREFIX = os.path.join(
    'you_folder/emb', DATASET_TO_USE, '%s/')
# we require any well_mean parquet file paths to contain this
WELL_MEAN_STR = 'canon/well_mean_hypnozoite.parquet'

In [None]:
def glob_cloud(bucket, prefix, must_contain_str=None):
  """List gs:// URLs of the blobs in `bucket` whose names start with `prefix`.

  Args:
    bucket: Name of the bucket to list, without the gs:// scheme.
    prefix: Only blobs whose name starts with this prefix are listed.
    must_contain_str: If given and non-empty, only blob names that contain
      this substring are returned.

  Returns:
    List of 'gs://[bucket]/[blob name]' strings, in listing order.
  """
  client = storage.Client()
  blob_names = [blob.name for blob in client.list_blobs(bucket, prefix=prefix)]
  if must_contain_str:
    blob_names = [name for name in blob_names if must_contain_str in name]

  # Build the URL with plain string formatting rather than os.path.join:
  # os.path.join uses the local OS separator and throws away the bucket part
  # when a blob name starts with '/'.
  return ['gs://%s/%s' % (bucket, name) for name in blob_names]

In [None]:
#@title Create the list of parquets to load

# Use exactly one of the two options below; keep the other commented out.

# Option 1: list the canonical well-mean parquet files of every batch in
# BATCHES_TO_LOAD.
PARQUET_EMB_PATHS = []
for batch_name in BATCHES_TO_LOAD:
  # EMB_PREFIX carries a '%s' placeholder for the batch name.
  batch_prefix = EMB_PREFIX % batch_name
  PARQUET_EMB_PATHS += glob_cloud(GCS_BUCKET, batch_prefix, WELL_MEAN_STR)

# Option 2: spell out the paths to load by hand.
#PARQUET_EMB_PATHS = [
#    'gs://path/to/your/dataset/well_mean_parquet/filename',
#    'gs://path/to/your/dataset/well_mean_parquet/filename',
#]

# Show the final selection, one path per line.
path_listing = '\n\t'.join(PARQUET_EMB_PATHS)
print('Planning to load parquet files:\n\t%s' % path_listing)

# Load embedding dataframes, generate tSNE coordinates

In [None]:
#@title Optional: filter to specific plates

# In practice, these well mean dataframes are fairly small so it's easiest to
# just load the whole dataframe. With filter_list = None,
# load_parquet_to_emb_df reads every row.
filter_list = None
print('No filters, loading the whole embedding dataframe.')

# BUT, you can always limit the data you load to make things faster.
# To do so, uncomment one of the examples below (it then overrides the None
# assigned above). create_parquet_filter builds an OR filter over the values
# you pass in.
# Some examples of filters
# Limit to only plate my_plate
#filter_list = create_parquet_filter('plate', ['my_plate'])
# Limit to only well my_well (on all the plates)
#filter_list = create_parquet_filter('well', ['my_well'])
# More on filters here:
# https://stackoverflow.com/questions/56522977/using-predicates-to-filter-rows-from-pyarrow-parquet-parquetdataset
# You can use AND and OR filters, etc.
# Uncomment to print the filter that will be applied:
#print(f'Filtering to {filter_list}')

In [None]:
#@title Load all the embeddings into a dataframe
# Reads every path in PARQUET_EMB_PATHS (applying filter_list when it is set)
# and returns a single dataframe with one column per embedding dimension.
emb_df = load_parquet_to_emb_df(PARQUET_EMB_PATHS, filter_list=filter_list)

In [None]:
# Preview the first rows of the loaded embeddings.
emb_df.head()
# The index is the batch/plate/well and the values are all the dimensions
# of the mean embedding per well.
# There's no metadata yet (like blinded_code or actives)
# NOTE(review): the subsetting cell further down queries on `actives`, but no
# visible cell adds that metadata -- confirm where it gets joined in.

# Calculate tSNE coordinates and graph

This section filters to a subset of data then calculates the tSNE coordinates (i.e. projects the high dimensional embeddings down to 2D in a way to best visualize the distance between elements).

Generally you want to calculate the coordinates once, so each well has a fixed position on the graph. Then you want to display the same graph multiple times, colored by different attributes, so you can judge how things are clustered.

As an example, below we:
  * Subset on all the wells in two batches except the "uninfected control" wells.
  * Show the same plot colored by batch, hep_lot, and actives
  * Then we can judge the meaning of each 'island' in the plot.

Example Conclusion: There are three islands. Using the plots below, I determine that the top right island has the active compounds from both batches. The left island is the infected controls and inactive compounds from batch 1 (which is hep_lot 1), while the bottom island is the infected controls and inactive compounds from batch 2 (which is hep_lot 2).

Note that the tSNE calculation can take a while with a lot of points. You can take a subset to reduce the tSNE calculation time.

If you want to play with different subsets of the data, you can copy and paste this section multiple times into the colab.

(I recommend deleting this text and instead typing about your data subset, what you are trying to achieve, and (after you finish) your conclusions.)

In [None]:
#@title Subset the full embedding dataframe
# Drop DAPI (the first 64 of the 192 dimensions) and any rows with NaNs.
subset_emb_df = emb_df[[str(x) for x in range(64,192)]].copy()
subset_emb_df = subset_emb_df.dropna()

# NOTE(review): the queries below need `actives` to be available on the
# dataframe (e.g. as an index level) -- confirm the metadata has been added.
# For example, you could drop the uninfected control wells:
subset_emb_df = subset_emb_df.query('actives != "uninfected control"')

# If you have a lot of wells, you can mix & match your selections.
# (Be careful that you stay statistically valid).
# One example: Use all the infected control and active control wells but only
# a subset of the sample wells. Make sure you sample randomly.
# Here I build both datasets and then concatenate them.
N_SAMPLE_WELLS = 1000  # upper bound on the number of sample wells to keep
SAMPLE_SEED = 0  # fixed seed so re-running this cell picks the same wells
control_emb_df = subset_emb_df.query('actives in ["infected control", "active control"]')
sample_emb_df = subset_emb_df.query('actives == "sample"')
# Cap n at the number of available wells: DataFrame.sample raises if asked for
# more rows than exist (without replacement).
some_sample_emb_df = sample_emb_df.sample(
    n=min(N_SAMPLE_WELLS, len(sample_emb_df)), random_state=SAMPLE_SEED)
subset_emb_df = pd.concat([control_emb_df, some_sample_emb_df])

In [None]:
# do a quick validation - is the subset what you expect?
# reset_index() turns the index levels into columns, so `actives` can be read
# as a column and counted per value (assumes `actives` is an index level).
subset_emb_df.reset_index().actives.value_counts()

# in this case: yep, this has the same number of control wells except the
# uninfected control wells are gone, and I've got the 1000 sample wells I asked for.

In [None]:
# Compute the 2D tSNE coordinates once; every plotting cell below reads the
# same tsne_df, so the points keep their positions across plots.
tsne_df = make_tsne(subset_emb_df)

In [None]:
#@title tSNE
plot_title = 'Pick an informative name: ex: Color by Batch' #@param
plot_width = 1000 #@param
plot_height = 600 #@param
color_by = 'batch' #@param
size_by = 'cp_nuclei' #@param
metadata_to_display = ['batch','well', 'plate', 'actives', 'concentration', 'hep_lot','blinded_concept', 'flag_IQCH30', 'flag_Nuclei', 'cp_nuclei','cp_hypnozoite', ] #@param

# Rows are ordered by the colour column (descending) before plotting.
chart_data = tsne_df.sort_values(by=color_by, ascending=False)

# Quantitative x/y positions with the grid lines turned off.
x_encoding = alt.X('x:Q', axis=alt.Axis(grid=False), title='X')
y_encoding = alt.Y('y:Q', axis=alt.Axis(grid=False), title='Y')
# Dot size follows the `size_by` column on a linear scale.
size_encoding = alt.Size(size_by, scale=alt.Scale(type='linear'))
# NOTE(review): the sort labels use underscores, while the subsetting cell
# queries 'active control' / 'infected control' with spaces -- confirm which
# spelling the data uses before relying on this order.
color_encoding = alt.Color(
    color_by,
    title=color_by,
    sort=['sample', 'active_control', 'infected_control'],
)
# One tooltip entry per metadata column, labelled with the column name.
tooltip_encoding = [
    alt.Tooltip(column, title=column) for column in metadata_to_display]

base = alt.Chart(
    chart_data,
    title=plot_title,
    width=plot_width,
    height=plot_height,
).mark_circle().encode(
    x_encoding,
    y_encoding,
    size=size_encoding,
    color=color_encoding,
    tooltip=tooltip_encoding,
)
base.interactive()

In [None]:
#@title tSNE
plot_title = 'Pick an informative name: ex: Color by Hep_Lot' #@param
plot_width = 1000 #@param
plot_height = 600 #@param
color_by = 'hep_lot' #@param
size_by = 'cp_nuclei' #@param
metadata_to_display = ['batch','well', 'plate', 'actives', 'concentration', 'hep_lot','blinded_concept', 'flag_IQCH30', 'flag_Nuclei', 'cp_nuclei','cp_hypnozoite', ] #@param

# Rows are ordered by the colour column (descending) before plotting.
chart_data = tsne_df.sort_values(by=color_by, ascending=False)

# Quantitative x/y positions with the grid lines turned off.
x_encoding = alt.X('x:Q', axis=alt.Axis(grid=False), title='X')
y_encoding = alt.Y('y:Q', axis=alt.Axis(grid=False), title='Y')
# Dot size follows the `size_by` column on a linear scale.
size_encoding = alt.Size(size_by, scale=alt.Scale(type='linear'))
# NOTE(review): the sort labels use underscores, while the subsetting cell
# queries 'active control' / 'infected control' with spaces -- confirm which
# spelling the data uses before relying on this order.
color_encoding = alt.Color(
    color_by,
    title=color_by,
    sort=['sample', 'active_control', 'infected_control'],
)
# One tooltip entry per metadata column, labelled with the column name.
tooltip_encoding = [
    alt.Tooltip(column, title=column) for column in metadata_to_display]

base = alt.Chart(
    chart_data,
    title=plot_title,
    width=plot_width,
    height=plot_height,
).mark_circle().encode(
    x_encoding,
    y_encoding,
    size=size_encoding,
    color=color_encoding,
    tooltip=tooltip_encoding,
)
base.interactive()

In [None]:
#@title tSNE
plot_title = 'Pick an informative name: ex: Color by Actives' #@param
plot_width = 1000 #@param
plot_height = 600 #@param
color_by = 'actives' #@param
size_by = 'cp_nuclei' #@param
metadata_to_display = ['batch','well', 'plate', 'actives', 'concentration', 'hep_lot','blinded_concept', 'flag_IQCH30', 'flag_Nuclei', 'cp_nuclei','cp_hypnozoite', ] #@param

# Plot the tSNE coordinates: one circle per row of tsne_df, coloured by
# `color_by`, sized by `size_by`, with the listed metadata shown on hover.
base = alt.Chart(
    # Rows are ordered by the colour column (descending) before plotting.
    tsne_df.sort_values(by=color_by,ascending=False),
    title=plot_title,
    width=plot_width,
    height=plot_height,
).mark_circle().encode(
    alt.X(
        'x:Q',
        axis=alt.Axis(grid=False),
        title='X',
    ),
    alt.Y(
        'y:Q',
        axis=alt.Axis(grid=False),
        title='Y',
    ),
    size=alt.Size(size_by, scale=alt.Scale(type='linear')),
    color=alt.Color(
        color_by,
        title=color_by,
        # Labels are spelled with spaces to match the `actives` values used
        # when subsetting ('active control', 'infected control'); the
        # underscore spellings never matched, so the order had no effect.
        # NOTE(review): confirm the spelling against your metadata.
        sort=['sample', 'active control', 'infected control'],
    ),
    tooltip=[
        alt.Tooltip(metadata, title=metadata) for metadata in metadata_to_display]
)
base.interactive()