Running AlphaFold2 (from ColabFold) using Azure Machine Learning
This notebook is similar to the one by Colby T. Ford, Ph.D. (https://github.com/colbyford/azureml-alphafold2)

In [2]:
import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset, Run

## Load the Azure ML workspace handle via Workspace.from_config();
## `ws` is reused by the compute, experiment and result-collection cells below
ws = Workspace.from_config()

# Compute target

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

## Name of the Azure ML compute cluster the experiment runs on
cluster_name = "alphafold2-ic"

try:
    ## Check for existing compute target
    training_cluster = ComputeTarget(workspace = ws, name = cluster_name)
    print('Found existing cluster.')
except ComputeTargetException:
    ## If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_NC6', max_nodes = 4)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        ## Report the provisioning error, then re-raise: without a cluster,
        ## `training_cluster` may be undefined and the cells below that
        ## reference it would only fail later with a less helpful NameError.
        print(ex)
        raise

Found existing cluster.


# Configure HyperDrive

In [5]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

## Execution environment: a named Azure ML Environment that runs on a custom
## Docker base image. user_managed_dependencies = True marks the Python
## dependencies as managed by the user rather than by Azure ML.
alphafold2_env = Environment("alphafold2")
alphafold2_env.docker.base_image = "cford38/alphafold2_aml:latest"
alphafold2_env.python.user_managed_dependencies = True

## Arguments that are identical for every child run of predict.py
fixed_arguments = [
    '--msa_mode', "MMseqs2 (UniRef+Environmental)",
    '--num_models', 1,
    '--num_recycles', 3,
    '--stop_at_score', 90,
]

## Script run configuration shared by all child runs
script_config = ScriptRunConfig(
    source_directory = ".",
    script = 'predict.py',
    arguments = fixed_arguments,
    environment = alphafold2_env,
    compute_target = training_cluster,
)

## One grid point (and therefore one child run) per sequence ID
sequence_ids = [
    'alpha_b117_6xc2',
    'beta_b1351_7vx1',
    'delta_b1617_7v70',
    'omicron_b11529_7t9j',
]
params = GridParameterSampling({ '--sequence_id': choice(*sequence_ids) })

## HyperDrive settings: no early-termination policy; 'complete' is the metric
## predict.py logs when it finishes; run every grid point, all concurrently
hyperdrive = HyperDriveConfig(
    run_config = script_config,
    hyperparameter_sampling = params,
    policy = None,
    primary_metric_name = 'complete',
    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
    max_total_runs = len(sequence_ids),
    max_concurrent_runs = len(sequence_ids),
)

Define Prediction Script

In [6]:
%%writefile predict.py
## Import libraries
import argparse, joblib, os, sys
from azureml.core import Dataset, Run
import pandas as pd
import numpy as np

from colabfold.batch import get_queries
from colabfold.batch import run as foldrun
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path
from Bio import SeqIO

## Results are written under ./outputs; the notebook later downloads the
## 'outputs/predicted_structures' prefix from each child run
os.makedirs('outputs', exist_ok=True)

## Set the input parameters
## --sequence_id is sampled per child run by the HyperDrive grid; the other
## flags are the fixed arguments passed through the notebook's ScriptRunConfig
parser = argparse.ArgumentParser()
parser.add_argument("--sequence_id", type=str, dest='sequence_id', help='Input Sequence ID')
parser.add_argument("--msa_mode", type=str, dest='msa_mode', help='msa mode')
parser.add_argument("--num_models", type=int, dest='num_models', help='number of structures to predict')
parser.add_argument("--num_recycles", type=int, dest='num_recycles', help='number of recycles')
## Help text fixed: the score is pLDDT (letter "L"), not "p1DDT"
parser.add_argument("--stop_at_score", type=int, dest='stop_at_score', help='early stop after reaching this pLDDT score.')

args = parser.parse_args()

## Get the experiment run context
run = Run.get_context()
## NOTE(review): `ws` is not used anywhere else in this script
ws = run.experiment.workspace

## Settings
## Example values for the command-line options (kept for reference):
# msa_mode = "MMseqs2 (UniRef+Environmental)" #["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
# num_models = 1
# num_recycles = 3
# stop_at_score = 90

sequence_id = args.sequence_id
msa_mode = args.msa_mode
num_models = args.num_models
num_recycles = args.num_recycles
stop_at_score = args.stop_at_score

## Fixed options (not exposed on the command line)
use_custom_msa = False
use_amber = False
use_templates = False 
do_not_overwrite_results = False
zip_results = False

## Log run options (every value is logged as a string)
run.log('sequence_id', str(sequence_id))
run.log('msa_mode', str(msa_mode))
run.log('num_models', str(num_models))
run.log('num_recycles', str(num_recycles))
run.log('stop_at_score', str(stop_at_score))

## load the sequence(s)
print("Loading sequences...")

## Copy the record whose FASTA header equals the requested ID into its own
## file for this node. If several records match, the last one wins.
sequence_found = False
for record in SeqIO.parse("sequences.fasta", "fasta"):
    if sequence_id == record.description:
        ## Write out the specific sequence fasta file for this node.
        ## Passing the path (not an open() handle) lets Biopython open and
        ## close the file itself, so no handle is left dangling.
        SeqIO.write(record, "run_sequence.fasta", "fasta")
        sequence_found = True

## Fail loudly here instead of letting the fold step run on a missing
## (or stale, previously written) run_sequence.fasta
if not sequence_found:
    raise ValueError(f"Sequence ID {sequence_id!r} was not found in sequences.fasta")

## Where predictions are written, and the single-sequence FASTA to fold
## (despite its name, `input_dir` holds a file path)
result_dir = 'outputs/predicted_structures/'
input_dir = 'run_sequence.fasta'

## Send logging to log.txt inside the result directory
setup_logging(Path(result_dir) / "log.txt")

## Build the query list from the FASTA file
queries, is_complex = get_queries(input_dir)

## Collect the fold options in one place, then run the prediction
## NOTE(review): model_order lists a single model; confirm this is intended
## when --num_models is greater than 1
fold_options = dict(
    queries=queries,
    result_dir=result_dir,
    use_templates=use_templates,
    use_amber=use_amber,
    msa_mode=msa_mode,
    model_type="auto",
    num_models=num_models,
    num_recycles=num_recycles,
    model_order=[1],
    is_complex=is_complex,
    data_dir=default_data_dir,
    keep_existing_results=do_not_overwrite_results,
    rank_by="auto",
    pair_mode="unpaired+paired",
    stop_at_score=stop_at_score,
    zip_results=zip_results,
)
foldrun(**fold_options)

## Log the 'complete' metric (the primary metric name in the notebook's
## HyperDriveConfig). A plain int is used because the `np.int` alias was
## removed from NumPy (1.24+) and raises AttributeError there.
run.log('complete', 1)

## Mark the run as finished
run.complete()

Overwriting predict.py


Submit the HyperDrive Experiment

In [7]:
## Submit the HyperDrive configuration under the 'alphafold2_hyperdrive' experiment
experiment = Experiment(ws, 'alphafold2_hyperdrive')
run = experiment.submit(hyperdrive)

## Show the status widget in the notebook, then wait until the run completes
status_widget = RunDetails(run)
status_widget.show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

Performing interactive authentication. Please follow the instructions on the terminal.


AuthenticationException: AuthenticationException:
	Message: Please ensure you have network connection. Error detail: HTTPSConnectionPool(host='login.microsoftonline.com', port=443): Max retries exceeded with url: /organizations/v2.0/.well-known/openid-configuration (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f6cc0774280>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "inner_error": {
            "code": "Authentication"
        },
        "message": "Please ensure you have network connection. Error detail: HTTPSConnectionPool(host='login.microsoftonline.com', port=443): Max retries exceeded with url: /organizations/v2.0/.well-known/openid-configuration (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f6cc0774280>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))"
    }
}

Collect all the Results

In [8]:
import os

os.makedirs('./results', exist_ok = True)


def _sequence_id_from_arguments(arguments):
    """Return the sequence ID from a child run's argument list.

    Looks for the value that follows the '--sequence_id' flag. If the flag
    is absent (or is the final element), falls back to the last argument,
    which is what this cell relied on previously.
    """
    arguments = list(arguments)
    if '--sequence_id' in arguments:
        value_position = arguments.index('--sequence_id') + 1
        if value_position < len(arguments):
            return arguments[value_position]
    return arguments[-1]


## Get all child runs
parent_runid = run.id
child_runs = Run.get(ws, parent_runid).get_children(recursive = True)

## For each child run, download the outputs
for child in child_runs:
    ## Fetch the details once and reuse them for both the run ID and the arguments
    child_details = child.get_details()
    child_run = Run.get(ws, child_details['runId'])

    sequence_id = _sequence_id_from_arguments(child_details['runDefinition']['arguments'])

    ## One local folder per sequence: ./results/<sequence_id>/
    child_run.download_files(prefix = 'outputs/predicted_structures', output_directory = f'./results/{sequence_id}', append_prefix = False)

Render PDBs

In [10]:
## Install py3Dmol
# import sys
# !{sys.executable} -m pip install py3Dmol

import py3Dmol

def find_pdb_files(root='./results'):
    """Map each '.pdb' file name found under ``root`` to its path.

    Walks ``root`` recursively with os.walk. Keys are bare file names and
    values are '<directory>/<file name>' strings. If the same file name
    occurs in more than one directory, the one visited last is kept.
    Returns an empty dict when ``root`` does not exist or holds no PDBs.
    """
    found = {}
    for path, _folders, files in os.walk(root):
        for name in files:
            if name.endswith('.pdb'):
                found[name] = f'{path}/{name}'
    return found


## Get dict of PDB files (this scan was previously pasted twice in the cell)
pdbs = find_pdb_files('./results')

###################
from ipywidgets import interact,fixed,IntSlider
import ipywidgets

## Make interactive PDB viewer
def view_pdb(path, style):
    """Build a py3Dmol view of the PDB file at ``path``.

    Parameters:
        path:  path of the PDB file to read.
        style: py3Dmol style name used as the key of the style dict
               (the notebook offers 'line', 'stick', 'sphere', 'cartoon').

    Returns:
        The py3Dmol view with the model added, styled and zoomed to fit.

    Raises:
        OSError if the file cannot be opened.
    """
    ## Read the structure with a context manager so the handle is closed promptly
    with open(path, 'r') as pdb_file:
        pdb_text = pdb_file.read()

    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=800)
    view.addModel(pdb_text, 'pdb')
    ## Colour by the 'b' property on a roygb gradient between 50 and 90
    ## (presumably the per-residue confidence stored in the B-factor column -- TODO confirm)
    view.setStyle({style: {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})
    view.zoomTo()
    return view

def pdb_viewer(structure, style):
    """Show the structure selected in the widget.

    ``structure`` is a key of the module-level ``pdbs`` dict (a PDB file
    name); ``style`` is passed through to ``view_pdb``. Returns whatever the
    view's ``show()`` returns.
    """
    viewer = view_pdb(pdbs[structure], style)
    return viewer.show()

## Dropdown listing every PDB file name collected in `pdbs`
structure_picker = ipywidgets.Dropdown(
    options=pdbs.keys(),
    description='Structure:',
)

## Dropdown for the rendering style; 'cartoon' is preselected
style_picker = ipywidgets.Dropdown(
    options = ['line', 'stick', 'sphere', 'cartoon'],
    value='cartoon',
    description='Style:',
)

## Wire both dropdowns to pdb_viewer
interact(pdb_viewer, structure=structure_picker, style=style_picker)

interactive(children=(Dropdown(description='Structure:', options=('delta_b1617_7v70_unrelaxed_rank_1_model_1.p…

<function __main__.pdb_viewer(structure, style)>

Thanks!