# Embedding Creation Only

The standard open source pipeline takes in whole images, performs object detection and lifecycle stage classification, and saves extracted patches with embeddings.

The "Emb Creation Only" version of the pipeline takes the whole site images as input, but also takes a CSV file with parasite centers identified in another application (like CellProfiler). This pipeline extracts the patches and saves the patches and their embeddings.

Because the ML model used to create embeddings from images is publicly available on TensorFlow Hub, this version of the embedding creation pipeline requires no custom ML models.

Both pipelines perform the crop/center/rotate function, which reduces noise from uncentered parasite patches. (Our work showed that if you cluster parasites that are uncentered, you'll see clusters with different parasite locations - so instead of one "healthy parasites" group, you might see three groups of healthy parasites, each at a different location within the patch.)

In [None]:
#@title Run this cell only the FIRST time you connect to the colab kernel
# gcsfs is installed so that gs:// paths can be read (presumably by pandas
# when loading the input CSVs - confirm).
!pip install gcsfs
# Fetch the cell_img source and install it in editable mode.
!git clone https://github.com/google/cell_img
!pip install --quiet -e cell_img

In [None]:
import os
import pandas as pd
# Passing None removes pandas' row/column display limits, so DataFrames shown
# in this colab are not truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import cell_img
from cell_img.common import data_utils
from cell_img.common import image_lib
from cell_img.malaria_liver import metadata_lib
from cell_img.malaria_liver.parasite_emb import make_embeddings_main_lib

In [None]:
#@title Run this cell after restarting your kernel. It will pop up a window to grant access.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Locations of the example inputs used by this colab.
DATA_ROOT = 'gs://path/to/public/data'
_EXAMPLE_METADATA_SUBDIR = 'tensorstore/metadata/test_emb_only_creation'
INPUT_IMAGE_CSV = os.path.join(
    DATA_ROOT, _EXAMPLE_METADATA_SUBDIR + '/input_images.csv')
INPUT_OBJECT_CSV = os.path.join(
    DATA_ROOT, _EXAMPLE_METADATA_SUBDIR + '/input_objects.csv')

# For this example the output is written locally, inside this colab.
# For anything other than an example, this path should be a cloud bucket.
OUTPUT_DIR = 'example_output_dir'

# Run the pipeline within the colab

This example runs the model within this colab for this small test dataset.

To run a cloud dataflow job for larger datasets, use the pipeline options and docker installation in the "Create Embeddings for Parasites" colab.

In [None]:
def run_pipeline(image_csv_path, object_metadata_csv_path, output_dir,
                 whole_image_size, num_output_shards=10, options=None):
  """Runs the "embedding creation only" pipeline with our standard settings.

  Thin wrapper around
  make_embeddings_main_lib.run_emb_creation_only_pipeline: the arguments
  below are forwarded and every other setting is hardcoded here.

  Args:
    image_csv_path: Forwarded as `image_csv`.
    object_metadata_csv_path: Forwarded as `object_metadata_csv`.
    output_dir: Forwarded as `output_dir`.
    whole_image_size: Forwarded as `whole_image_size`.
    num_output_shards: Forwarded as `num_output_shards`. Defaults to 10.
    options: Forwarded as `options`. Defaults to None. (Presumably pipeline
      options for running on cloud dataflow - confirm against the library.)

  Returns:
    The result object returned by run_emb_creation_only_pipeline.
  """
  # For most of the options, we hardcode the correct values for our
  # pipeline. Most of these options shouldn't ever change.
  pipeline_result = make_embeddings_main_lib.run_emb_creation_only_pipeline(
    image_csv=image_csv_path,
    object_metadata_csv=object_metadata_csv_path,
    raw_channel_order=['w1', 'w2', 'w3'],
    channel_order=['DAPI', 'PVM', 'HSP70'],
    whole_image_size=whole_image_size,
    log_brightness_min=[-7., -7., -7.],
    log_brightness_max=[0., 0., 0.],
    output_dir=output_dir,
    pixels_per_side=64,
    num_output_shards=num_output_shards,
    count_csv_dir='count_csvs',
    crop_size=32,
    stain_indices=[2, 1],
    do_rotate=True,
    do_center=True,
    batch_size=128,
    embedding_model_path='https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_s/feature_vector/2',
    embedding_model_output_size=64,
    embedding_model_output_seed=2342343,
    options=options)

  # A result carrying a job handle was launched on cloud dataflow. getattr
  # with a None default skips both a missing and an empty (None) handle, so
  # building the URL below cannot fail with an AttributeError.
  job = getattr(pipeline_result, '_job', None)
  if job is not None:
    # Print the URL for the job.
    url = ('https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
           (job.location, job.id, job.projectId))
    print(url)
  return pipeline_result

In [None]:
# Run on the small example dataset inside this colab, writing two output
# shards.
run_pipeline(
    image_csv_path=INPUT_IMAGE_CSV,
    object_metadata_csv_path=INPUT_OBJECT_CSV,
    output_dir=OUTPUT_DIR,
    whole_image_size=[2048, 2048],
    num_output_shards=2)

# Validating input data and results

In [None]:
input_df = pd.read_csv(INPUT_OBJECT_CSV)

# Reading a CSV requires re-formatting some columns to match our
# expectations: plate and site are zero-padded strings of width 5.
# For plate, anything after the first '.' is discarded before padding.
plate_as_text = input_df['plate'].astype(str)
input_df['plate'] = plate_as_text.map(
    lambda text: text.split('.')[0].zfill(5))
input_df['plate_uid'] = input_df['plate']
input_df['site'] = input_df['site'].map(lambda value: str(value).zfill(5))

# Drop any 'Unnamed:' columns (presumably a header-less index column that
# was written into the CSV - confirm).
unnamed_c = [
    name for name in input_df.columns if name.startswith('Unnamed:')]
if unnamed_c:
  input_df = input_df.drop(columns=unnamed_c)

In [None]:
# Spot-check five randomly chosen rows of the input table.
input_df.sample(5)

In [None]:
# Read in the sharded results to compare against our input.
# NUM_OUTPUT_SHARDS must match the num_output_shards value that was passed to
# run_pipeline; it is used for both the loop and the "-of-NNNNN" file suffix.
NUM_OUTPUT_SHARDS = 2
output_df_list = [
    pd.read_parquet(os.path.join(
        OUTPUT_DIR,
        'patches.parquet-%05d-of-%05d' % (shard_index, NUM_OUTPUT_SHARDS)))
    for shard_index in range(NUM_OUTPUT_SHARDS)
]
output_df = pd.concat(output_df_list)
output_df.sample(3)

In [None]:
# Compare how many parasites went in with how many came out.
num_input_parasites = len(input_df)
num_output_parasites = len(output_df)
print('There were %d parasites in the input, and there are %d in the output' %
      (num_input_parasites, num_output_parasites))

In [None]:
# Select parasites whose center lies near an image border. (The thresholds
# presumably assume the 2048x2048 example images - adjust for other sizes.)
edge_query_str = 'center_row < 60 or center_row > 2000 or center_col < 60 or center_col > 2000'

input_on_edge_df = input_df.query(edge_query_str)
output_on_edge_df = output_df.query(edge_query_str)

print('In the input, there are %d parasites on the edge of images but in the output there are only %d' %
      (len(input_on_edge_df), len(output_on_edge_df)))

# If the two differences match, every parasite missing from the output was
# an edge parasite.
print('There are %d more parasites in input than output, and %d more edge parasites' % (
    len(input_df) - len(output_df),
    len(input_on_edge_df) - len(output_on_edge_df)))

In [None]:
# The centers will not match exactly because of the crop/center/rotate, but
# we can sort both tables the same way and then compare them directly.
input_on_edge_df.sort_values(by=['center_row', 'center_col']).loc[
    :, ['batch', 'plate', 'well', 'site', 'center_row', 'center_col']]

In [None]:
output_on_edge_df.sort_values(by=['center_row', 'center_col']).loc[
    :, ['batch', 'plate', 'well', 'site', 'center_row', 'center_col']]

# This makes it clear that the parasites with center_row < 53 or
# center_row > 2015 are listed above on the input but not below
# on the output.

## Validation: Look at parasite centers

We don't expect these to change, so we are just validating that they look the same.

In [None]:
# Path for where to find the CSVs with the information needed to create the
# tensorstore: image paths and where, within the image_grid, to place each image.
TENSORSTORE_PATH = os.path.join(DATA_ROOT, 'tensorstore')
METADATA_ROOT = os.path.join(TENSORSTORE_PATH, 'metadata')
TS_CSV_FOLDER = 'tensorstore'

# Set up the alignment between our channel names and RGB.
# (Defined once here; it is passed straight to MetadataIndex below.)
CHANNEL_TO_RGB = ['w3', 'w2', 'w1']
meta_ts = metadata_lib.MetadataIndex(
    TENSORSTORE_PATH, CHANNEL_TO_RGB, METADATA_ROOT)

In [None]:
# Pick a set of parasites to look at: all input rows for one well and site.
input_df.query('well == "J12" and site == "00013"')

In [None]:
# Show the chosen input parasites, ordered by their center position.
site_query = 'well == "J12" and site == "00013"'
df_to_show = input_df.query(site_query).sort_values(
    by=['center_row', 'center_col'])
fig = meta_ts.contact_sheet_for_df(
    example_df=df_to_show,
    patch_size=40,
    ncols=3,
    nrows=1,
    name_for_x_col='center_col',
    name_for_y_col='center_row')

In [None]:
# Show the same well and site again, this time from the output dataframe,
# ordered by center position.
site_query = 'well == "J12" and site == "00013"'
df_to_show = output_df.query(site_query).sort_values(
    by=['center_row', 'center_col'])
fig = meta_ts.contact_sheet_for_df(
    example_df=df_to_show,
    patch_size=40,
    ncols=3,
    nrows=1,
    name_for_x_col='center_col',
    name_for_y_col='center_row')