In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Quick Start: Colabfold inference pipeline with Cloud Batch and Workflows

This notebook demonstrates how to submit inference pipeline runs.

You use the utility functions in the `workflow_executor` module to configure and submit the runs. The `workflow_executor` module contains two functions:
- `prepare_args_for_experiment` - This function formats the runtime parameters for the Google Workflows workflow that implements the pipeline. It also sets default values for a number of runtime parameters.
- `execute_workflow` - This function executes the Google Workflows workflow.

This is a complete list of required and optional parameters accepted by the functions:

```
    project_id: str
    region: str
    input_dir: str
    image_uri: str
    job_gcs_path: str
    labels: dict
    machine_type: str = 'n1-standard-4'
    cpu_milli: int = 8000
    memory_mib: int = 30000
    boot_disk_mib: int = 200000
    gpu_type: str = "nvidia-tesla-t4"
    gpu_count: int = 1
    job_gcsfuse_local_dir: str = '/mnt/disks/gcs/colabfold'
    parallelism: int = 8
    template_mode: str = "none"
    use_cpu: bool = False
    use_gpu_relax: bool = False
    use_amber: bool = False
    msa_mode: str = 'mmseqs2_uniref_env'
    model_type: str = 'auto'
    num_models: int = 5
    num_recycle: int = 3
    custom_template_path: str = None
    overwrite_existing_results: bool = False
    rank_by: str = 'auto'
    pair_mode: str = 'unpaired_paired'
    stop_at_score: int = 100
    zip_results: bool = False
```

### Install Python libraries

In [None]:
# Install packages
! pip install -U google-cloud-firestore google-cloud-workflows google-cloud-storage

In [None]:
# Reload (restart) the kernel before proceeding so the packages installed above are used.
# The two magics below then enable autoreload, so edits to imported modules are picked up
# without restarting the kernel again.
%load_ext autoreload
%autoreload 2

### Execute Workflow

In [None]:
from src import workflow_executor

Please set the following variables according to the setup of your environment.

In [None]:
# --- Google Cloud environment ---
# Project ID. Example: "my_project_id"
project_id = 'colabfold-batch'
# Region where resources will be created. Example: "us-central1"
region = 'us-central1'
# Check your quota to see the maximum value you can use here
parallelism = 64

# --- Input / output locations ---
# GCS path where you will upload FASTA files. Example: 'my_bucket/input_folder'
input_dir = 'brd4_design1/input'
# Image built to execute Colabfold
image_uri = 'gcr.io/colabfold-batch/colabfold-batch'
# Bucket name where the resulting artifacts will be created. Example: 'my_bucket'
job_gcs_path = 'brd4_design1'

# --- Labels to identify your job ---
labels = {
    'experiment': 'brd4_design1',
    'member': 'Katrina',
}

Copy local FASTA files to the GCS path.

In [None]:
# Source folder for the upload step: the copy cell below uploads every `*.fasta` file found directly in it.
local_input_dir = './sequences'   # Local directory where your FASTA files are located

In [None]:
# Copy local files to GCS: every `*.fasta` file in `local_input_dir` is sent to `gs://{input_dir}`.
# `{...}` is expanded by IPython from the notebook variables before the shell command runs.
# NOTE(review): if only one file matches and `input_dir` has no trailing slash, gsutil may
# write a single object named after `input_dir` instead of placing the file under it — confirm.
! gsutil -m cp {local_input_dir}/*.fasta gs://{input_dir}

Execute the following cells to prepare the runtime arguments and start the Colabfold execution.

In [None]:
# Build the runtime arguments for the workflow from the settings defined above.
# Only the required parameters and `parallelism` are passed explicitly; per the
# introduction, the function fills in defaults for the remaining runtime parameters.
args = workflow_executor.prepare_args_for_experiment(
    project_id=project_id,
    region=region,
    parallelism=parallelism,
    input_dir=input_dir,
    image_uri=image_uri,
    job_gcs_path=job_gcs_path,
    labels=labels,
)

In [None]:
# Split into 400 jobs at a time, which is the limit for one workflow.
# Each chunk gets its own shallow copy of `args`, so every other runtime parameter
# is shared and only the 'runners' entry differs between chunks.
MAX_RUNNERS_PER_WORKFLOW = 400

runners = args['runners']
split_args = []
for start in range(0, len(runners), MAX_RUNNERS_PER_WORKFLOW):
    chunk_args = args.copy()
    chunk_args['runners'] = runners[start : start + MAX_RUNNERS_PER_WORKFLOW]
    split_args.append(chunk_args)

# NOTE: nothing is submitted here. The workflows are executed in the next cell, one per
# chunk. Submitting the unsplit `args` from this cell as well would launch every runner
# twice and would exceed the per-workflow limit whenever there are more than 400 runners.

In [None]:
# Submit one workflow execution per chunk of runners prepared in `split_args`.
workflow_name = 'colabfold-workflow'
for chunk_args in split_args:
    workflow_executor.execute_workflow(
        workflow_name=workflow_name,
        args=chunk_args,
    )