# Uni-Fold Folding

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that on-disk edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

## Dataclass comparison

In [2]:
from dataclasses import asdict
from prtm.models.unifold.config import (
    Model2FT, MultimerAF2V3, Model1AF2, make_data_config_dataclass, make_data_config, model_config, SHAPE_SCHEMA
)





[2024-02-19 20:41:04,522] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
PyRosetta-4 2023 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python310.Release 2023.47+release.5fe66cd241adb376f3a0af661ea0dcd77ea0dbbe 2023-11-21T10:47:25] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [3]:
# Build the multimer AF2 v3 configuration two ways so they can be compared:
# via the name-based `model_config` lookup and via the `MultimerAF2V3` class.
old_config = model_config("multimer_af2_v3")
new_config = MultimerAF2V3()

In [4]:
# Display the `globals` section of the MultimerAF2V3 config object.
new_config.globals

GlobalsConfig(block_size=None, d_pair=128, d_msa=256, d_template=64, d_extra_msa=64, d_single=384, chunk_size=4, eps=1e-05, inf=30000.0, max_recycling_iters=3, alphafold_original_mode=True)

In [8]:
# Derive the data config (and feature-name list) from both the old and the new
# model config using identical settings, so the two results can be diffed.
num_res = 300
shared_data_kwargs = dict(num_res=num_res, is_multimer=False, use_templates=True)

old_data_cfg, old_feature_names = make_data_config(
    old_config.data, mode="predict", **shared_data_kwargs
)
new_data_cfg, new_feature_names = make_data_config_dataclass(
    new_config.data, **shared_data_kwargs
)

In [9]:
def _tuples_to_lists(value):
    """Return ``value`` with every tuple (at any depth) replaced by a list."""
    if isinstance(value, dict):
        return {key: _tuples_to_lists(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_tuples_to_lists(item) for item in value]
    return value


def compare_dicts(dict1, dict2, path="", strict_sequences=True):
    """Recursively diff two (possibly nested) dictionaries.

    Args:
        dict1: First dictionary (or leaf value when called recursively).
        dict2: Second dictionary (or leaf value when called recursively).
        path: Dotted key path of the values being compared; used only to
            build the messages. Leave empty for the top-level call.
        strict_sequences: When True (default), leaf values are compared with
            plain ``==``, so a list never equals a tuple. When False, tuples
            are converted to lists before comparing, so sequences that differ
            only in container type are treated as equal.

    Returns:
        A ``(are_equal, differences)`` tuple. ``differences`` is a list of
        messages: first one "Key ..." message per key present in only one
        dictionary, then one "Values at path ..." message per unequal leaf.
        Keys are visited in sorted order (by their string form), so the
        report is deterministic across runs.
    """
    # Leaf case: at least one side is not a dict, so compare the values directly.
    if not (isinstance(dict1, dict) and isinstance(dict2, dict)):
        left, right = dict1, dict2
        if not strict_sequences:
            left, right = _tuples_to_lists(dict1), _tuples_to_lists(dict2)
        equal = left == right
        return equal, [] if equal else [f"Values at path '{path}' are not equal: {dict1} != {dict2}"]

    def child_path(key):
        # Dotted path of `key` below the current position.
        return f"{path}.{key}" if path else str(key)

    keys1, keys2 = set(dict1), set(dict2)

    # Keys that exist on only one side. Sorting by str() keeps the order stable
    # even when keys are of mixed types.
    differences = [
        f"Key '{child_path(key)}' is not present in both dictionaries"
        for key in sorted(keys1 ^ keys2, key=str)
    ]

    # Recurse into the keys both sides share.
    for key in sorted(keys1 & keys2, key=str):
        are_equal, diffs = compare_dicts(
            dict1[key], dict2[key], child_path(key), strict_sequences
        )
        if not are_equal:
            differences.extend(diffs)

    return len(differences) == 0, differences

In [10]:
# Diff the full model configs after converting both to plain dictionaries.
old_config_dict = old_config.to_dict()
new_config_dict = asdict(new_config)
is_same, mismatch = compare_dicts(old_config_dict, new_config_dict)

In [11]:
# Report the keys that exist in only one of the two model configs.
missing_key_messages = [message for message in mismatch if message.startswith("Key")]
for message in missing_key_messages:
    print(message)

Key 'data.supervised' is not present in both dictionaries
Key 'data.train' is not present in both dictionaries
Key 'data.eval' is not present in both dictionaries
Key 'data.common.features' is not present in both dictionaries


In [12]:
# Report the leaf values that differ between the two model configs.
value_mismatch_messages = [message for message in mismatch if message.startswith("Value")]
for message in value_mismatch_messages:
    print(message)

Values at path 'data.common.recycling_features' are not equal: ['msa_chains', 'msa_mask', 'msa_row_mask', 'bert_mask', 'true_msa', 'msa_feat', 'extra_msa_deletion_value', 'extra_msa_has_deletion', 'extra_msa', 'extra_msa_mask', 'extra_msa_row_mask', 'is_distillation'] != ('msa_chains', 'msa_mask', 'msa_row_mask', 'bert_mask', 'true_msa', 'msa_feat', 'extra_msa_deletion_value', 'extra_msa_has_deletion', 'extra_msa', 'extra_msa_mask', 'extra_msa_row_mask', 'is_distillation')
Values at path 'data.common.multimer_features' are not equal: ['assembly_num_chains', 'asym_id', 'sym_id', 'num_sym', 'entity_id', 'asym_len', 'cluster_bias_mask'] != ('assembly_num_chains', 'asym_id', 'sym_id', 'num_sym', 'entity_id', 'asym_len', 'cluster_bias_mask')
Values at path 'data.common.template_features' are not equal: ['template_all_atom_positions', 'template_sum_probs', 'template_aatype', 'template_all_atom_mask'] != ('template_all_atom_positions', 'template_sum_probs', 'template_aatype', 'template_all_at

In [9]:
# Names that appear in only one of: the dataclass feature config, SHAPE_SCHEMA.
# An empty set means the two agree on their keys.
feature_config_keys = set(asdict(new_config.data.common.features))
shape_schema_keys = set(SHAPE_SCHEMA)
feature_config_keys ^ shape_schema_keys

set()

In [10]:
# Diff the derived data configs after converting both to plain dictionaries.
old_data_cfg_dict = old_data_cfg.to_dict()
new_data_cfg_dict = asdict(new_data_cfg)
is_same, mismatch = compare_dicts(old_data_cfg_dict, new_data_cfg_dict)

In [11]:
# Report the keys that exist in only one of the two data configs.
missing_key_messages = [message for message in mismatch if message.startswith("Key")]
for message in missing_key_messages:
    print(message)

Key 'eval' is not present in both dictionaries
Key 'supervised' is not present in both dictionaries
Key 'train' is not present in both dictionaries


## Folding

In [2]:
from prtm import protein
from prtm.models.unifold.modeling import UniFoldForFolding
from prtm.visual import view_superimposed_structures
from prtm.models.unifold.modules.alphafold import AlphaFold





[2024-02-19 20:43:40,336] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
PyRosetta-4 2023 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python310.Release 2023.47+release.5fe66cd241adb376f3a0af661ea0dcd77ea0dbbe 2023-11-21T10:47:25] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [3]:
from prtm.models.unifold import config

## Fold Monomer

In [4]:
# Single-chain input: the amino-acid sequence is assembled from fixed-width
# fragments and keyed by its chain ID.
monomer_sequence = "".join(
    [
        "LILNLRGGAFVSNTQITMADKQKKFINEIQEGDLVRSYSITDETFQQNAVTSIV",
        "KHEADQLCQINFGKQHVVCTVNHRFYDPESKLWKSVCPHPGSGISFLKKYDYLLS",
        "EEGEKLQITEIKTFTTKQPVFIYHIQVENNHNFFANGVLAHAMQVSI",
    ]
)
monomer_sequence_dict = dict(A=monomer_sequence)

In [5]:
# Instantiate a folder for the "multimer_1_af2_v3" model with templates enabled
# and a fixed random seed.
# NOTE(review): `uf_folder` is reassigned (to "model_2_ft") in a later cell before
# this instance is ever called — confirm whether this cell is still needed.
uf_folder = UniFoldForFolding(model_name="multimer_1_af2_v3", use_templates=True, random_seed=0)

In [6]:
# Construct a folder for every available model whose name does not contain
# "symm". Only construction is exercised here; the fold call below is
# currently disabled, so `results` stays empty.
results = []
for model_name in UniFoldForFolding.available_models:
    if "symm" in model_name:
        continue
    print("Folding", model_name)
    folder = UniFoldForFolding(model_name=model_name, use_templates=True, random_seed=0)
    # Disabled fold call:
    # results.append(folder(monomer_sequence_dict, max_recycling_iters=3, num_ensembles=2))

Folding model_2_ft
Folding multimer_ft
Folding model_1_af2
Folding model_2_af2
Folding model_3_af2
Chosen model is trained without templates, setting use_templates=False.
Folding model_4_af2
Chosen model is trained without templates, setting use_templates=False.
Folding model_5_af2
Chosen model is trained without templates, setting use_templates=False.
Folding multimer_1_af2_v3
Folding multimer_2_af2_v3
Folding multimer_3_af2_v3


Downloading: "https://huggingface.co/conradry/unifold-alphafold-weights/resolve/main/params_model_3_multimer_v3.pth" to /home/ubuntu/.cache/torch/hub/checkpoints/unifold_multimer_3_af2_v3.pth
100%|████████████████████████████████████| 357M/357M [00:08<00:00, 45.2MB/s]


Folding multimer_4_af2_v3
Folding multimer_5_af2_v3


Downloading: "https://huggingface.co/conradry/unifold-alphafold-weights/resolve/main/params_model_5_multimer_v3.pth" to /home/ubuntu/.cache/torch/hub/checkpoints/unifold_multimer_5_af2_v3.pth
100%|████████████████████████████████████| 357M/357M [00:09<00:00, 39.2MB/s]


In [7]:
# Folder for the "model_2_ft" model, with templates enabled and a fixed seed.
uf_folder = UniFoldForFolding(model_name="model_2_ft", use_templates=True, random_seed=0)
# Disabled: folder for the "model_1_af2" model with the same settings.
#af_folder = UniFoldForFolding(
#    model_name="model_1_af2", use_templates=True, random_seed=0
#)

In [9]:
# Fold the monomer with `uf_folder`; the call returns a (structure, aux output) pair.
uf_monomer_structure, uf_aux_output = uf_folder(monomer_sequence_dict, max_recycling_iters=3, num_ensembles=2)
# Disabled: the same fold with `af_folder`.
#af_monomer_structure, af_aux_output = af_folder(monomer_sequence_dict, max_recycling_iters=3, num_ensembles=2)

Loaded result from cache.
Loaded result from cache.




In [10]:
# Load the reference model_2_ft structure that the prediction is compared against.
# The repository root defaults to the original checkout location; set the
# PRTM_REPO_ROOT environment variable to run against a clone somewhere else.
import os
from pathlib import Path

prtm_repo_root = Path(os.environ.get("PRTM_REPO_ROOT", "/home/ubuntu/repos/prtm"))
reference_pdb_path = prtm_repo_root / "test" / "unifold" / "reference_model_2_ft.pdb"

with open(reference_pdb_path, mode="r") as f:
    s2 = protein.Protein37.from_pdb_string(f.read())

In [11]:
#uf_monomer_structure.show()

In [12]:
# Superimpose the Uni-Fold monomer prediction on `s2`, the structure parsed
# from the reference PDB file.
view_superimposed_structures(uf_monomer_structure, s2, color1="green")

<py3Dmol.view at 0x7fde4ab01ff0>

In [None]:
# Fold the same monomer with the "model_1_af2" weights and superimpose it on the
# Uni-Fold prediction. The baseline is built in this cell because nothing
# earlier in the notebook defines `af_monomer_structure`, so on a fresh kernel
# the comparison would otherwise fail with a NameError.
af_folder = UniFoldForFolding(
    model_name="model_1_af2", use_templates=True, random_seed=0
)
af_monomer_structure, af_aux_output = af_folder(
    monomer_sequence_dict, max_recycling_iters=3, num_ensembles=2
)
view_superimposed_structures(uf_monomer_structure, af_monomer_structure, color1="green")

## Fold Multimer

In [None]:
# Two-chain input: each chain's amino-acid sequence is assembled from
# fragments, then both are keyed by chain ID.
complex_sequence_a = "".join(
    [
        "TTPLVHVASVEKGRSYEDFQKVYNAIALKLREDDEYDNYIGYGPVLVRLAWHTSGTW",
        "DKHDNTGGSYGGTYRFKKEFNDPSNAGLQNGFKFLEPIHKEFPWISSGDLFSLGGVTA",
        "VQEMQGPKIPWRCGRVDTPEDTTPDNGRLPDADKDADYVRTFFQRLNMNDREVVALMGAH",
        "ALGKTHLKNSGYEGPWGAANNVFTNEFYLNLLNEDWKLEKNDANNEQWDSKSGYMMLPTDY",
        "SLIQDPKYLSIVKEYANDQDKFFKDFSKAFEKLLENGITFPKDAPSPFIFKTLEEQGL",
    ]
)
complex_sequence_b = "".join(
    [
        "TEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDA",
        "NIKKNVLWDENNMSEYLTNPKKYIPGTKMAIGGLKKEKDRNDLITYLKKACE",
    ]
)
complex_sequence_dict = dict(A=complex_sequence_a, B=complex_sequence_b)

In [None]:
# Two folders with identical settings that differ only in which weights they
# load: "multimer_ft" and "multimer_4_af2_v3".
multimer_folder_kwargs = {"use_templates": True, "random_seed": 0}
uf_folder = UniFoldForFolding(model_name="multimer_ft", **multimer_folder_kwargs)
af_folder = UniFoldForFolding(model_name="multimer_4_af2_v3", **multimer_folder_kwargs)

In [None]:
# Fold the two-chain complex with both folders using identical settings.
# Each call returns a (structure, aux output) pair.
uf_complex_structure, uf_comp_aux_output = uf_folder(
    complex_sequence_dict, max_recycling_iters=3, num_ensembles=2
)
# Named to match `uf_comp_aux_output` (previously misspelled with a double "p").
af_complex_structure, af_comp_aux_output = af_folder(
    complex_sequence_dict, max_recycling_iters=3, num_ensembles=2
)

In [None]:
# Superimpose chain A of the two complex predictions.
uf_chain_a = uf_complex_structure.get_chain("A")
af_chain_a = af_complex_structure.get_chain("A")
view_superimposed_structures(uf_chain_a, af_chain_a, color1="green")

In [None]:
# Superimpose chain B of the two complex predictions.
uf_chain_b = uf_complex_structure.get_chain("B")
af_chain_b = af_complex_structure.get_chain("B")
view_superimposed_structures(uf_chain_b, af_chain_b, color1="green")

## Fold Symmetric

In [None]:
# Single-chain input for the symmetric fold: the amino-acid sequence is
# assembled from fragments and keyed by its chain ID.
symmetric_sequence = "".join(
    [
        "PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKFQDGD",
        "LTLYQSNTILRHLGRTLGLYGKDQQEAALVDMVNDGVEDLRCKYISLIYTNYEAGKDDYV",
        "KALPGQLKPFETLLSQNQGGKTFIVGDQISFADYNLLDLLLIHEVLAPGCLDAFPLLSAY",
        "VGRLSARPKLKAFLASPEYVNLPINGNGKQ",
    ]
)
symmetric_sequence_dict = dict(A=symmetric_sequence)

In [None]:
# Folder for the "uf_symmetry" model, configured for the C2 symmetry group with
# templates enabled and a fixed random seed.
sym_folder = UniFoldForFolding(
    model_name="uf_symmetry", use_templates=True, random_seed=0, symmetry_group="C2"
)

In [None]:
# Fold the single-chain input with the symmetric folder, using the same
# recycling and ensemble settings as the other folds in this notebook.
sym_structure, sym_aux_output = sym_folder(symmetric_sequence_dict, max_recycling_iters=3, num_ensembles=2)

In [None]:
# Display the predicted symmetric structure (presumably an interactive 3D view —
# TODO confirm what `show()` renders).
sym_structure.show()