# Create Embeddings for Parasites

This colab shows how to run the code to process whole site images into parasite patch embeddings.

Note that this pipeline runs ML models for finding parasites within images and then determining the lifecycle stage of the parasites. These models are not included in the GitHub repo, as they vary between experimental setups (microscopes, etc.).

If you have a different setup that already has parasite centers and stages, you may want to use the "Embeddings Only" colab example, which reads in this data from a csv and only does the patch extraction and embedding creation portions of the pipeline.

In [None]:
#@title Run this cell only the FIRST time you connect to the colab kernel
!pip install gcsfs
!pip install keras==2.11.0
!git clone https://github.com/google/cell_img
!pip install -e cell_img
!pip3 install --upgrade tensorflow

In [None]:
import gcsfs
import fsspec
import os
import datetime
import math

from google.cloud import storage

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import cell_img
from cell_img.common import io_lib
from cell_img.common import data_utils
from cell_img.malaria_liver.parasite_emb import counts_lib
from cell_img.malaria_liver.parasite_emb import make_embeddings_main_lib

In [None]:
# Not required for connection to the Google Research Bucket
from google.colab import auth
auth.authenticate_user()

In [None]:
DATA_ROOT = 'gs://path/to/your/data/'

In [None]:
GCS_PROJECT = 'your_project_name'
GCS_BUCKET = 'your_bucket_name'
GCS_REGION = 'your_region_name'

In [None]:
# Set up the paths to the ML models for finding and staging parasites
FINDING_MODEL_PATH = 'gs://path/to/your/object/detection/model'
STAGING_MODEL_PATH = 'gs://path/to/your/staging_model/'
ARTIFACT_MODEL_PATH = 'gs://path/to/your/artifact_model/'
ARTIFACT_SCALING_PATH = 'gs://path/to/your/artifact_scaling/'

# constants for the pipeline
DEFAULT_NUM_OUTPUT_SHARDS = 10

## Set up to run pipeline

In [None]:
def get_pipeline_options(project, bucket, region, job_name):
  """Returns cloud dataflow pipeline options.

  Args:
    project: Google Cloud project id. Also used to build the gcr.io URI of
      the SDK container image the workers run.
    bucket: GCS bucket name (without the gs:// prefix) under which the temp
      and staging locations are created.
    region: Region assigned to the GoogleCloudOptions.
    job_name: Label for the job. It is lowercased and every character outside
      [-a-z0-9] is replaced with '-'; the final job name is
      'n-<job_name>-<timestamp>'.

  Returns:
    A pipeline_options.PipelineOptions set up for the DataflowRunner.
  """
  import re  # Local import so this cell does not depend on the import cell.

  today_str = datetime.datetime.now().isoformat()
  # Replace characters Cloud DataFlow hates
  today_str = today_str.replace(':', '-').replace('.', '-').replace('T', 't')
  # Job names are validated against [-a-z0-9]; lowercase first so that a name
  # such as 'My_Job' becomes 'my-job' instead of being rejected at launch.
  job_name = re.sub(r'[^-a-z0-9]', '-', job_name.lower())
  options = pipeline_options.PipelineOptions(flags=[
      # NOTE(review): temp_location is assigned again on GoogleCloudOptions
      # below; presumably that later assignment takes precedence over this
      # flag -- confirm, and drop one of the two if so.
      '--temp_location',
      'gs://%s/tmpfiles' % bucket,
      '--runner',
      'DataflowRunner',
      '--worker_machine_type',
      'm1-ultramem-40',
      '--max_num_workers',
      '24',
      '--disk_size_gb',
      '50',
      '--experiments',
      'use_runner_v2',
      # Must match the image tag pushed in the Dockerfile instructions.
      '--sdk_container_image',
      'gcr.io/%s/cell_img_imgs:v0.3' % project,
      '--sdk_location',
      'container',
      '--job_name',
      'n-%s-%s' % (job_name, today_str),
  ])
  options.view_as(pipeline_options.SetupOptions).save_main_session = False

  cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
  cloud_options.project = project
  cloud_options.region = region
  dataflow_gcs_location = 'gs://%s/dataflow' % bucket
  cloud_options.staging_location = '%s/staging' % dataflow_gcs_location
  cloud_options.temp_location = '%s/temp' % dataflow_gcs_location
  return options


def run_pipeline(image_csv_path, metadata_csv_path, output_dir,
                 whole_image_size, num_output_shards=10, options=None):
  """Runs the parasite embeddings pipeline and returns its result.

  Args:
    image_csv_path: Path to the csv listing the images to process.
    metadata_csv_path: Path to the csv with the per-well metadata.
    output_dir: Directory the pipeline writes its outputs to.
    whole_image_size: Size of each whole-site image, as a two-element list.
    num_output_shards: Number of shards for the output files.
    options: Optional pipeline options (e.g. from get_pipeline_options);
      passed through to run_embeddings_pipeline unchanged.

  Returns:
    The result object returned by
    make_embeddings_main_lib.run_embeddings_pipeline.
  """
  # For most of the options, we hardcode the correct values for our
  # pipeline. Most of these options shouldn't ever change.
  pipeline_result = make_embeddings_main_lib.run_embeddings_pipeline(
    image_csv=image_csv_path,
    metadata_csv=metadata_csv_path,
    raw_channel_order=['w1', 'w2', 'w3'],
    channel_order=['DAPI', 'PVM', 'HSP70'],
    whole_image_size=whole_image_size,
    log_brightness_min=[-7., -7., -7.],
    log_brightness_max=[0., 0., 0.],
    output_dir=output_dir,
    # The parasite-finding (object detection) model. The staging model is
    # passed separately below as staging_model_path.
    model_path=FINDING_MODEL_PATH,
    min_confidence=0.6,
    pixels_per_side=64,
    count_csv_dir='count_csvs',
    crop_size=32,
    stain_indices=[2, 1],
    do_rotate=True,
    do_center=True,
    batch_size=128,
    embedding_model_path='https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet1k_s/feature_vector/2',
    embedding_model_output_size=64,
    embedding_model_output_seed=2342343,
    staging_model_path=STAGING_MODEL_PATH,
    staging_channel_order=[0, 1, 2],
    artifact_model_path=ARTIFACT_MODEL_PATH,
    artifact_scaling_path=ARTIFACT_SCALING_PATH,
    num_output_shards=num_output_shards,
    options=options)

  if hasattr(pipeline_result, '_job'):
    # It was launched on cloud dataflow. Print the URL for the job.
    job = pipeline_result._job
    url = ('https://console.cloud.google.com/dataflow/jobs/%s/%s?project=%s' %
           (job.location, job.id, job.projectId))
    print(url)
  return pipeline_result

## Local test with local images

This runs a local instance on test data, images that are just noise.

Note that you can look at the files on your local colab kernel using the directory button on the left side of the screen.

In [None]:
from PIL import Image as PilImage

In [None]:
%%writefile example_image.csv
plate_uid,well,channel,site,image_path
1604,F09,w1,14,im1.tif
1604,F09,w2,14,im2.tif
1604,F09,w3,14,im3.tif

In [None]:
%%writefile example_metadata.csv
plate_uid,well,blinded_code,stain_map,batch
1604,F09,XE-50-HB11,0,Pc22-022

In [None]:
EXAMPLE_IMAGE_SIZE = [100, 200]

In [None]:
example_image_metadata_df = pd.read_csv('example_image.csv')

np.random.seed(12345) # For determinism.

def write_test_image(image_path):
  """Writes a random-noise uint16 TIFF to image_path and prints the path."""
  noise = np.random.randint(
      low=0, high=65535, size=EXAMPLE_IMAGE_SIZE, dtype='uint16')
  noise_image = PilImage.fromarray(noise)
  with open(image_path, 'wb') as out_file:
    noise_image.save(out_file, 'tiff')
  print('Wrote to %s' % image_path)

_ = example_image_metadata_df['image_path'].map(write_test_image)

In [None]:
run_pipeline('example_image.csv', 'example_metadata.csv', 'example_out_dir',
             num_output_shards=1, whole_image_size=EXAMPLE_IMAGE_SIZE)

In [None]:
pd.read_parquet('example_out_dir/patches.parquet-00000-of-00001')

## Local Test with remote images

Similar to the local test above, but using real images on the cloud bucket.

In [None]:
%%writefile real_example_image.csv
plate_uid,well,channel,site,image_path
1463,K05,w1,13,gs://path/to/your/data/image_data/1463/your_image_for_K05_s13_w1.tif
1463,K05,w2,13,gs://path/to/your/data/image_data/1463/your_image_for_K05_s13_w2.tif
1463,K05,w3,13,gs://path/to/your/data/image_data/1463/your_image_for_K05_s13_w3.tif

In [None]:
%%writefile real_example_metadata.csv
plate_uid,well,blinded_code,stain_map,batch
1463,K05,RA-17-GT96,0,Pc22-018

In [None]:
# The pipeline is very memory intensive on real files, do
# our best to clear up any unused memory before running.
import gc
gc.collect()

In [None]:
run_pipeline('real_example_image.csv', 'real_example_metadata.csv', 'real_example_out_dir',
             num_output_shards=1, whole_image_size=[2048,2048])

In [None]:
pd.read_parquet('real_example_out_dir/patches.parquet-00000-of-00001')

In [None]:
pd.read_parquet('real_example_out_dir/hypnozoite_patches.parquet-00000-of-00001')

## Run embedding creation job on Cloud Dataflow

Running on Google Cloud Dataflow requires connecting to your own Google Cloud instance. The code below will not run with the default setup that is connected to the Google Research Cloud Bucket, but we include it as an example because it is required to complete larger jobs efficiently.

Note on how this pipeline is set up: we're using a docker container to pre-install the python dependencies on cloud workers. This is how it's done - 

1. Go to cloud shell from Google Cloud Console

2. Make a file called "Dockerfile", with contents
```
FROM apache/beam_python3.9_sdk:2.46.0
RUN pip install gcsfs
RUN pip install keras==2.11.0
RUN git clone https://github.com/google/cell_img
RUN pip install -e cell_img
RUN pip3 install --upgrade tensorflow
RUN pip install --upgrade pip
RUN pip install --upgrade "jax[cpu]"
RUN pip install lightgbm
RUN pip install scikit-image
RUN pip install tensorflow-hub
RUN gcloud config set project YOUR_PROJECT_NAME
```
(One way this docker container may fail is if the Python version used by colab kernels gets updated to Python 3.10 or higher. The list of available Beam docker images is [here](https://hub.docker.com/search?q=apache%2Fbeam&type=image). Update the image name after FROM on the first line to the docker image with a matching Python version found in the link.)

3. Push the docker image by typing this in the shell; make sure to change TAG
```
export TAG={WRITE THE NEW TAG! IT WAS ORIGINALLY v0.3}
export PROJECT=YOUR_PROJECT_NAME
export REPO=cell_img_imgs
export IMAGE_URI=gcr.io/$PROJECT/$REPO:$TAG
docker build . --tag $IMAGE_URI
docker push $IMAGE_URI
```

4. In get_pipeline_options, update the ```--sdk_container_image``` flag with the new ```$IMAGE_URI```. If you don't remember what you used, type ```echo $IMAGE_URI``` into Cloud Shell again and copy-paste the output.

In [None]:
import apache_beam as beam
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.runners import DataflowRunner

from cell_img.malaria_liver.parasite_emb import make_embeddings_main_lib

In [None]:
# Launch the embeddings pipeline on Cloud Dataflow.
# NOTE: job_name, image_csv_path, metadata_csv_path, image_size_to_use and
# output_dir are not assigned in any cell visible in this notebook; define
# them for your own job before running this cell.
options = get_pipeline_options(GCS_PROJECT, GCS_BUCKET, GCS_REGION, job_name)
pipeline_result = run_pipeline(
      image_csv_path=image_csv_path,
      metadata_csv_path=metadata_csv_path,
      whole_image_size=image_size_to_use,
      output_dir=output_dir,
      num_output_shards=DEFAULT_NUM_OUTPUT_SHARDS,
      options=options,
)