In [0]:
# ref: https://github.com/chaidiscovery/chai-lab/tree/main
# https://www.chaidiscovery.com/blog/introducing-chai-1
# https://www.biorxiv.org/content/10.1101/2024.10.10.615955v2.full.pdf
# https://en.wikipedia.org/wiki/Multiple_sequence_alignment

# Installation
#
# Released version on PyPI:
#   pip install chai_lab==0.6.1
#
# Newest available version (updated daily to test features that have not been released yet):
#   pip install git+https://github.com/chaidiscovery/chai-lab.git
#
# Requirements: this Python package requires Linux, Python 3.10 or later, and a GPU with
# CUDA and bfloat16 support. The recommended GPUs are A100 80GB, H100 80GB or L40S 48GB;
# A10 and A30 will work for smaller complexes. Users have also reported success with the
# consumer-grade RTX 4090.

In [0]:
# version on pypi:
# pip install chai_lab==0.6.1

# newest available version (updates daily to test features that weren't released yet):
!pip install git+https://github.com/chaidiscovery/chai-lab.git

## might want to pip install to Vols? it's pretty fast

In [0]:
!pip install --upgrade urllib3
# Successfully installed urllib3-2.3.0

In [0]:
# Restart the notebook's Python process after the installs above, so that the
# newly installed / upgraded packages are the ones that get imported below.
dbutils.library.restartPython()

In [0]:
%sh
# Print the Python version seen by the shell
# (the installation notes at the top of this notebook ask for Python 3.10 or later).
python --version

In [0]:
# Check that the `chai-lab` command-line entry point is available on PATH.
!which chai-lab

In [0]:
# FASTA sources that were tried first:
#   https://www.ncbi.nlm.nih.gov/protein/UXQ07652.1?report=fasta
#   https://www.rcsb.org/downloads/fasta
#   https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz
#
# NOTE: the right input format is not obvious. FASTA examples obtained from the
# links above kept being rejected with an "ENTITY TYPE not supported" complaint.
#
# The header format shown in https://github.com/seqeralabs/nf-chai seems to be
# acceptable. The chai-lab repo has examples in
# https://github.com/chaidiscovery/chai-lab/blob/main/tests/example_inputs.py
# but no obvious format description; the same example also appears in
# https://github.com/chaidiscovery/chai-lab/blob/main/examples/predict_structure.py

# Example input: each record header is "<entity type>|name=<label>", followed by
# the sequence (or, for the ligand record, a SMILES string) on the next line.
fasta_content = """>protein|name=example-of-long-protein
AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMFGCDLGPDGRLLRGYDQSAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPAEITLTWQWDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPEPLTLRWEP
>protein|name=example-of-short-protein
AIQRTPKIQVYSRHPAENGKSNFLNCYVSGFHPSDIEVDLLKNGERIEKVEHSDLSFSKDWSFYLLYYTEFTPTEKDEYACRVNHVTLSQPKIVKWDRDM
>protein|name=example-peptide
GAAL
>ligand|name=example-ligand-as-smiles
CCCCCCCCCCCCCC(=O)O
"""

# Save the example to the current working directory (overwrites any existing file).
fasta_path = 'test_protein.fasta'
with open(fasta_path, 'w') as fasta_handle:
    fasta_handle.write(fasta_content)

In [0]:
# List the current working directory (long format, hidden files included,
# human-readable sizes), e.g. to confirm that test_protein.fasta was written.
!ls -lah

In [0]:
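# Workspace location of the output directory (user-specific absolute path;
# presumably where ./test_output ends up - adjust for your own workspace):
# /Workspace/Users/may.merkletan@databricks.com/hls-proteinfolding/tutorials/chai1/test_output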

In [0]:
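# Disabled variant, kept for reference: passes a raw sequence string to
# `chai-lab fold` instead of the path to a FASTA file.
# %sh 
# chai-lab fold "AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVR" test_output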

In [0]:
%sh 
# Run the chai-lab command-line `fold` command on the example FASTA file,
# with ./test_output as the output directory.
# NOTE(review): the Python API cell further down notes that its output directory
# needs to be empty; presumably the same holds here when re-running - confirm.
chai-lab fold test_protein.fasta ./test_output

In [0]:
# Import the package and display the version that is actually installed.
import chai_lab 
chai_lab.__version__

In [0]:
## Open question: does the Python module/function accept the same options as the shell command?

# Signature of the Python entry point:
# chai_lab.main.run_inference(
#     fasta_file: Path, *, output_dir: Path,
#     use_esm_embeddings: bool=True,
#     use_msa_server: bool=False, msa_server_url: str="https://api.colabfold.com",
#     msa_directory: Path | None=None, constraint_path: Path | None=None,
#     use_templates_server: bool=False, template_hits_path: Path | None=None,
#     recycle_msa_subsample: int=0, num_trunk_recycles: int=3,
#     num_diffn_timesteps: int=200, num_diffn_samples: int=5, num_trunk_samples: int=1,
#     seed: int | None=None, device: str | None=None, low_memory: bool=True,
# ) -> StructureCandidates

# Reference example (found with some assistant help):
# https://github.com/chaidiscovery/chai-lab/blob/main/examples/predict_structure.py

In [0]:
from pathlib import Path

# Import the submodule explicitly. A bare `import chai_lab` is not guaranteed to
# make `chai_lab.chai1` available as an attribute, so without this line the cell
# only works if something else has already loaded the submodule.
import chai_lab.chai1

# Output directory for the Python API run. It needs to be empty before inference
# starts, so use a fresh directory name (or clear the old one) when re-running.
output_dir = Path('test_output2_pyfunc')

# Input written by the FASTA-creation cell earlier in this notebook.
fasta_file = Path('test_protein.fasta')

# Same prediction as the `chai-lab fold` shell command, driven from Python.
# A fixed seed is passed so that repeated runs are reproducible; all other
# options keep their defaults.
candidates = chai_lab.chai1.run_inference(
    fasta_file,
    output_dir=output_dir,
    seed=42,
)

In [0]:
# Display the object returned by run_inference.
candidates

In [0]:
# Structure file paths (CIF) reported on the inference result.
cif_paths = candidates.cif_paths

# Aggregate score of every ranking entry, each unwrapped with .item().
agg_scores = [
    ranking.aggregate_score.item()
    for ranking in candidates.ranking_data
]
agg_scores

In [0]:
import numpy as np

# Open the score archive that inference wrote for sample index 2
# (pTM, ipTM, pLDDTs and clash scores).
scores_path = output_dir / "scores.model_idx_2.npz"
scores = np.load(scores_path)
scores

In [0]:
# Names of all arrays stored in the loaded score archive.
list(scores.keys())

In [0]:
# Show the headline metrics from the loaded score archive as one tuple, in this
# order; a key missing from the archive shows up as None.
metric_names = (
    'aggregate_score',
    'ptm',
    'iptm',
    'per_chain_ptm',
    'per_chain_pair_iptm',
    'has_inter_chain_clashes',
    'chain_chain_clashes',
)
tuple(scores.get(metric_name) for metric_name in metric_names)

In [0]:
# TODO: review the other examples at https://github.com/chaidiscovery/chai-lab/tree/main/examples