In [6]:
# Importing base libraries
import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset, Run

You need an Azure Machine Learning workspace and its `config.json` file in the same directory as this notebook.

In [7]:
from azureml.core import Workspace

# Connect to the workspace described by the local config.json
ws = Workspace.from_config()

# Echo the workspace identity, one field per line.
# NOTE: this prints the subscription id; clear the cell output before sharing.
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)

gromdimon
grom
eastus2
bb2e50f5-469f-46f5-874d-dfd7eb37c411


The `build.py` script below comes from Colby T. Ford's Jupyter notebook. 
Rather than reinvent the wheel, I decided to reuse that script as is. 

In [3]:
%%writefile build.py

# Import libraries
import argparse, joblib, os, sys
from azureml.core import Dataset, Run
import pandas as pd
import numpy as np

from colabfold.batch import get_queries
from colabfold.batch import run as foldrun
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path
from Bio import SeqIO

# Everything written under ./outputs is what the notebook later downloads from the run
os.makedirs('outputs', exist_ok=True)

# Set the input parameters
parser = argparse.ArgumentParser()
parser.add_argument("--sequence_id", type=str, dest='sequence_id', help='Input Sequence ID')
parser.add_argument("--msa_mode", type=str, dest='msa_mode', help='msa mode')
parser.add_argument("--num_models", type=int, dest='num_models', help='number of structures to predict')
parser.add_argument("--num_recycles", type=int, dest='num_recycles', help='number of recycles')
# Parsed as float so both "90" and fractional thresholds such as "87.5" are accepted
parser.add_argument("--stop_at_score", type=float, dest='stop_at_score', help='early stop after reaching this pLDDT score.')

args = parser.parse_args()

# Get the experiment run context (used only for logging metrics and completing the run)
run = Run.get_context()

# Settings
# Example values used by the notebook:
#   msa_mode      = "MMseqs2 (UniRef+Environmental)"
#                   one of ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)",
#                           "single_sequence", "custom"]
#   num_models    = 1
#   num_recycles  = 3
#   stop_at_score = 90
sequence_id = args.sequence_id
msa_mode = args.msa_mode
num_models = args.num_models
num_recycles = args.num_recycles
stop_at_score = args.stop_at_score

# Options that are fixed for every run of this script
use_custom_msa, use_amber, use_templates, do_not_overwrite_results, zip_results = False, False, False, False, False

# Log run options
run.log('sequence_id', str(sequence_id))
run.log('msa_mode', str(msa_mode))
run.log('num_models', str(num_models))
run.log('num_recycles', str(num_recycles))
run.log('stop_at_score', str(stop_at_score))

# Load sequences
print("Loading sequences...")
sequence_found = False
for record in SeqIO.parse("sequences.fasta", "fasta"):
    if sequence_id == record.description:
        # Write out the specific sequence fasta file for this node.
        # The context manager closes (and flushes) the file before it is read back below.
        with open("run_sequence.fasta", "w") as fasta_out:
            SeqIO.write(record, fasta_out, "fasta")
        sequence_found = True

# Fail early with a clear message instead of letting the fold step trip over a
# missing (or stale) run_sequence.fasta.
if not sequence_found:
    raise ValueError(f"Sequence ID {sequence_id!r} was not found in sequences.fasta")

# Output and Input directories
result_dir = 'outputs/predicted_structures/'
input_dir = 'run_sequence.fasta'

# Set up Logging
setup_logging(Path(result_dir).joinpath("log.txt"))
# Set up query tasks (1 per input sequence)
queries, is_complex = get_queries(input_dir)

# Run Fold Prediction
# NOTE(review): model_order is fixed to [1]; presumably only model 1 is run
# regardless of --num_models - confirm against the ColabFold version in use.
foldrun(
    queries=queries,
    result_dir=result_dir,
    use_templates=use_templates,
    use_amber=use_amber,
    msa_mode=msa_mode,
    model_type="auto",
    num_models=num_models,
    num_recycles=num_recycles,
    model_order=[1],
    is_complex=is_complex,
    data_dir=default_data_dir,
    keep_existing_results=do_not_overwrite_results,
    rank_by="auto",
    pair_mode="unpaired+paired",
    stop_at_score=stop_at_score,
    zip_results=zip_results,
)

# 'complete' is the primary metric HyperDrive maximizes; log a plain int
# (the np.int alias no longer exists in current NumPy releases).
run.log('complete', 1)

run.complete()

Overwriting build.py


Build the compute cluster for running the experiment

In [9]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster used for this work
cluster_name = "alphafold2"

try:
    # Reuse the cluster when it already exists in the workspace
    building_cluster = ComputeTarget(workspace = ws, name = cluster_name)
    print('This cluster exists.')
except ComputeTargetException:
    # Otherwise, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(max_nodes = 4)
        # To request a specific VM size, pass vm_size=... above. To list the available sizes run:
        # AmlCompute.supported_vmsizes(workspace = ws)
        building_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        building_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report what went wrong, then re-raise: without a cluster `building_cluster`
        # stays undefined and later cells would fail with an unrelated NameError.
        print(ex)
        raise

This cluster exists.


In [10]:
from azureml.core import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Environment in which build.py is executed on the cluster
alphafold2_env = Environment(name="alphafold2_env")

# The Python packages are already baked into the image, so Azure ML must not
# build or manage a conda environment on top of it.
alphafold2_env.python.user_managed_dependencies = True

# Package requirements come from an existing Docker base image published by cford38
alphafold2_env.docker.base_image = "cford38/alphafold2_aml:latest"

In [12]:
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice

# Command-line arguments shared by every build.py run
fixed_arguments = [
    '--msa_mode', "MMseqs2 (UniRef+Environmental)",
    '--num_models', 1,
    '--num_recycles', 3,
    '--stop_at_score', 90,
]

# Script configuration: what to run, where, and in which environment
script_config = ScriptRunConfig(
    source_directory=".",
    script='build.py',
    arguments=fixed_arguments,
    environment=alphafold2_env,
    compute_target=building_cluster,
)

# Grid over the sequence IDs, so each value gets its own child run
params = GridParameterSampling({
    '--sequence_id': choice('reference', 'alpha', 'delta', 'omicron'),
})

# HyperDrive settings: no early-termination policy; 'complete' is the metric
# that build.py logs when a run finishes.
hyperdrive = HyperDriveConfig(
    run_config=script_config,
    hyperparameter_sampling=params,
    policy=None,
    primary_metric_name='complete',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=3,
)

In [13]:
from azureml.core import Experiment

# Experiment in the workspace under which the runs are submitted
experiment_name = 'alphafold2_corona'
experiment = Experiment(ws, experiment_name)

In [None]:
from azureml.widgets import RunDetails

# Submit the HyperDrive configuration; `run` is the parent run that later cells query
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()

# Block until the parent run (and with it every child run) has finished
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [11]:
import os
from azureml.core import Run

os.makedirs('./results', exist_ok = True)

## Get all child runs of the submitted parent run
parent_runid = run.id
child_runs = run.get_children(recursive = True)

## For each child run, download the outputs into ./results/<sequence_id>
for child_run in child_runs:
    # One get_details() call per child is enough; it carries the run's arguments
    arguments = child_run.get_details()['runDefinition']['arguments']

    # Look the sequence id up by its flag rather than by position, so the folder
    # name stays correct even if the argument order changes. Presumably HyperDrive
    # appends "--sequence_id <value>" to the arguments - the positional fallback
    # keeps the previous behaviour if the flag is not found as a separate item.
    if '--sequence_id' in arguments[:-1]:
        sequence_id = arguments[arguments.index('--sequence_id') + 1]
    else:
        sequence_id = arguments[-1]

    child_run.download_files(prefix = 'outputs/predicted_structures', output_directory = f'./results/{sequence_id}', append_prefix = False)

In [12]:
# Collecting results for visualising
import py3Dmol

# Map every PDB file name found under ./results to its path.
# When the same file name occurs in several folders, the one visited last wins.
pdbs = {
    name: f'{path}/{name}'
    for path, _subdirs, files in os.walk('./results')
    for name in files
    if name.endswith('.pdb')
}

In [13]:
# Make interactive PDB viewer
from ipywidgets import interact,fixed,IntSlider
import ipywidgets

def view_pdb(path, style):
    """Build a py3Dmol view of a single PDB file.

    Args:
        path: Path of the PDB file to load.
        style: Name of the 3Dmol drawing style used as the key of the style
            spec (the notebook offers 'line', 'stick', 'sphere', 'cartoon').

    Returns:
        The configured ``py3Dmol.view``; the caller decides when to show it.
    """
    # Read through a context manager so the file handle is closed right away
    with open(path, 'r') as pdb_file:
        pdb_text = pdb_file.read()

    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=800)
    view.addModel(pdb_text, 'pdb')
    # Colour by the 'b' property on a roygb gradient clamped to 50-90
    # (presumably the B-factor column holds the per-residue pLDDT - confirm).
    view.setStyle({style: {'colorscheme': {'prop':'b','gradient': 'roygb','min':50,'max':90}}})
    view.zoomTo()
    return view

def pdb_viewer(structure, style):
    """Show the viewer for one collected structure.

    Args:
        structure: Key into the module-level ``pdbs`` mapping (a PDB file name).
        style: Drawing style passed through to ``view_pdb``.

    Returns:
        Whatever the view's ``show()`` call returns.
    """
    viewer = view_pdb(pdbs[structure], style)
    return viewer.show()


# Dropdown listing every collected PDB file by name
structure_dropdown = ipywidgets.Dropdown(
    options=pdbs.keys(),
    description='Structure:',
)

# Dropdown selecting the drawing style; 'cartoon' is preselected
style_dropdown = ipywidgets.Dropdown(
    options=['line', 'stick', 'sphere', 'cartoon'],
    value='cartoon',
    description='Style:',
)

# Re-render the viewer whenever either dropdown changes
interact(pdb_viewer, structure=structure_dropdown, style=style_dropdown)

interactive(children=(Dropdown(description='Structure:', options=(), value=None), Dropdown(description='Style:…

<function __main__.pdb_viewer(structure, style)>