# Install protein folding models
Runs on serverless compute.

In [0]:
%pip install pyyaml
%restart_python

In [0]:
# Make sure the catalog exists, then create one schema per model family.
spark.sql("CREATE CATALOG IF NOT EXISTS protein_folding")

model_schemas = ('alphafold', 'proteinmpnn', 'boltz', 'esmfold', 'rfdiffusion')
for schema_name in model_schemas:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS protein_folding.{schema_name}")

In [0]:
# Whether to also download the datasets required by AlphaFold2 (AF2).
# These are large, but are required for AF2.
# Even when set to True, the full BFD is not downloaded; bfd_small is used
# for that dataset instead, expecting only very minor performance degradation
# with much faster inference.
download_af2_datasets = False
# Value substituted for the `<email>` placeholder in the job YAML templates.
# NOTE(review): presumably the address used for job notifications - confirm
# against the YAML files.
email = ""

In [0]:
from pathlib import Path

# Workspace path of this notebook, and of the folder that contains it
# (everything before the final "/").
notebook_path = (
    dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
)
directory_path = notebook_path.rpartition('/')[0]

# Choose the job definition: the variant with the AF2 dataset downloads, or
# the plain install.
yaml_file_name = "install_incl_af2downloads.yaml" if download_af2_datasets else "install.yaml"
default_yaml_path = f"/Workspace{directory_path}/{yaml_file_name}"

# Absolute path of the sibling "tutorials" folder, with ".." resolved.
tutorials_dir = Path(f"/Workspace{directory_path}/../tutorials")
base_path = str(tutorials_dir.resolve())

print(default_yaml_path)
print(base_path)

In [0]:
import re

# Read the job template chosen above.
with open(default_yaml_path, 'r') as template_file:
    yaml_content = template_file.read()

# Fill in the template placeholders one after another.
updated_yaml_content = yaml_content
for pattern, replacement in ((r'<email>', email), (r'<root_path>', base_path)):
    updated_yaml_content = re.sub(pattern, replacement, updated_yaml_content)


### Note to self:
  - also account for Azure vs. AWS compute

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.jobs import JobSettings
from typing import Optional
import yaml
def create_job_from_yaml(yaml_path: Optional[str] = None, yaml_str: Optional[str] = None):
    """Create a Databricks job from a YAML job definition.

    The YAML must hold job definitions under ``resources.jobs``; only the
    first job listed there is created.

    Args:
        yaml_path: Path to a YAML file. Takes precedence over ``yaml_str``
            when both are given.
        yaml_str: The YAML document as a string.

    Returns:
        The id of the newly created job.

    Raises:
        ValueError: If neither ``yaml_path`` nor ``yaml_str`` is provided, or
            if the YAML contains no job under ``resources.jobs``.
    """
    if yaml_path is not None:
        with open(yaml_path) as f:
            config = yaml.safe_load(f)
    elif yaml_str is not None:
        config = yaml.safe_load(yaml_str)
    else:
        raise ValueError("Either yaml_path or yaml_str must be provided")

    # An empty document parses to None and sections may be missing; report
    # that clearly instead of failing with an opaque KeyError/IndexError.
    jobs = ((config or {}).get('resources') or {}).get('jobs') or {}
    if not jobs:
        raise ValueError("No job definition found under 'resources.jobs' in the YAML")

    # Full job settings deserialization of the first job in the mapping
    job_settings = JobSettings.from_dict(next(iter(jobs.values())))

    # instantiate the client
    w = WorkspaceClient()

    # create a job just with a placeholder name
    creation_info = w.jobs.create(name='new created job')
    try:
        # now use the job settings object to populate it
        w.jobs.reset(
            job_id=creation_info.job_id,
            new_settings=job_settings,
        )
    except Exception:
        # Do not leave the empty placeholder job behind in the workspace.
        w.jobs.delete(job_id=creation_info.job_id)
        raise
    return creation_info.job_id

# Create the install job. The failure is printed and then re-raised so the
# notebook stops here instead of moving on to the run step with a missing or
# stale job_id.
try:
    job_id = create_job_from_yaml(yaml_str=updated_yaml_content)
except Exception as e:
    print(f"Job creation failed: {e}")
    raise
else:
    print(f"Created job {job_id}")

#### Run the workflow we just made
 - This downloads the model weights, creates the registered models, and serves them.

In [0]:
import datetime

# Trigger the job created above and block until the run finishes.
# NOTE(review): .result() waits 20 minutes by default per the SDK docs
# (confirm for the installed version), so an explicit, generous timeout is
# passed here; adjust it to the expected install duration.
run_timeout = datetime.timedelta(hours=12)

w = WorkspaceClient()
run_by_id = w.jobs.run_now(job_id=job_id).result(timeout=run_timeout)

# clean up the no longer needed job once complete
# w.jobs.delete(job_id=job_id)

### Now make the AF2 workflow ready to be run as needed later

In [0]:

from pathlib import Path

# Locate the AlphaFold workflow assets relative to this notebook's folder.
# The "/" before ".." matters: directory_path has no trailing slash, so
# without it the folder name and ".." fuse into a path that does not exist.
af_workflow_dir = Path(
    "/Workspace" + directory_path + "/../tutorials/alphafold/workflow"
).resolve()
default_yaml_path = str(af_workflow_dir / "resources" / "example_workflow_setup.yaml")
af_notebooks_path = str(af_workflow_dir / "notebooks")

# For Azure
fold_compute = "Standard_NC4as_T4_v3"
featurize_compute = "Standard_F8"

with open(default_yaml_path, 'r') as file:
    yaml_content = file.read()

# Fill in the template placeholders. Plain string replacement inserts each
# value literally (no regex escape handling of the replacement text).
placeholders = {
    '<email>': email,
    '<notebooks_path>': af_notebooks_path,
    '<fold_compute>': fold_compute,
    '<featurize_compute>': featurize_compute,
}
updated_yaml_content = yaml_content
for placeholder, value in placeholders.items():
    updated_yaml_content = updated_yaml_content.replace(placeholder, value)

# Create the AF2 workflow job. The failure is printed and then re-raised so a
# failed creation does not leave job_id pointing at an earlier job.
try:
    job_id = create_job_from_yaml(yaml_str=updated_yaml_content)
except Exception as e:
    print(f"Job creation failed: {e}")
    raise
else:
    print(f"Created job {job_id}")

In [0]:
# NOTE(review): get_jobsettings is not defined anywhere in the visible part of
# this notebook, so this cell presumably raises NameError unless the helper is
# defined elsewhere - confirm, or remove this scratch cell.
js = get_jobsettings(yaml_str=updated_yaml_content)

In [0]:
# Display the task_key of the first entry in js.tasks.
js.tasks[0].task_key

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.jobs import JobSettings
from typing import Optional
import yaml

def create_job_from_yaml(yaml_path: Optional[str] = None, yaml_str: Optional[str] = None):
    """Create a Databricks job from a YAML job definition.

    The YAML must hold job definitions under ``resources.jobs``; only the
    first job listed there is created.

    Args:
        yaml_path: Path to a YAML file. Takes precedence over ``yaml_str``
            when both are given.
        yaml_str: The YAML document as a string.

    Returns:
        The id of the newly created job.

    Raises:
        ValueError: If neither ``yaml_path`` nor ``yaml_str`` is provided, or
            if the YAML contains no job under ``resources.jobs``.
    """
    if yaml_path is not None:
        with open(yaml_path) as f:
            config = yaml.safe_load(f)
    elif yaml_str is not None:
        config = yaml.safe_load(yaml_str)
    else:
        raise ValueError("Either yaml_path or yaml_str must be provided")

    # An empty document parses to None and sections may be missing; report
    # that clearly instead of failing with an opaque KeyError/IndexError.
    jobs = ((config or {}).get('resources') or {}).get('jobs') or {}
    if not jobs:
        raise ValueError("No job definition found under 'resources.jobs' in the YAML")

    # Full job settings deserialization of the first job in the mapping
    job_settings = JobSettings.from_dict(next(iter(jobs.values())))

    # instantiate the client
    w = WorkspaceClient()

    # create a job just with a placeholder name
    creation_info = w.jobs.create(name='new created job')
    try:
        # now use the job settings object to populate it
        w.jobs.reset(
            job_id=creation_info.job_id,
            new_settings=job_settings,
        )
    except Exception:
        # Do not leave the empty placeholder job behind in the workspace.
        w.jobs.delete(job_id=creation_info.job_id)
        raise
    return creation_info.job_id

# Create the job from the rendered YAML. The failure is printed and then
# re-raised so a failed creation is not mistaken for success and job_id is
# not left pointing at an earlier job.
try:
    job_id = create_job_from_yaml(yaml_str=updated_yaml_content)
except Exception as e:
    print(f"Job creation failed: {e}")
    raise
else:
    print(f"Created job {job_id}")