Integrated to Bohrium Apps (#130)

dptech-corp · Sep 18, 2023 · 595c0c3 · 595c0c3
1 parent b1c89a2
commit 595c0c3
Showing 1 changed file with 196 additions and 0 deletions.
diff --git a/launching.py b/launching.py
@@ -0,0 +1,196 @@
+# Copyright 2022 DP Technology
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+entrypoint for Uni-Fold @ Bohrium Apps. 
+temporary URL: https://labs.dp.tech/projects/unifold/
+"""
+
+import sys
+
+from dp.launching.typing import (
+    BaseModel,
+    Field,
+    Int,
+    String,
+    Boolean,
+    OutputDirectory,
+)
+from dp.launching.cli import to_runner, default_minimal_exception_handler
+
+import os
+import re
+import random
+import hashlib
+import numpy as np
+from pathlib import Path
+from unifold.colab.data import validate_input
+from unifold.msa.utils import divide_multi_chains
+
+import pickle
+import gzip
+from unifold.msa import parsers
+from unifold.msa import pipeline
+from unifold.data.utils import compress_features
+from unifold.data.protein import PDB_CHAIN_IDS
+from unifold.colab.mmseqs import get_msa_and_templates
+from unifold.colab.model import colab_inference
+
+
+MIN_SINGLE_SEQUENCE_LENGTH = 16 # to satisfy mmseqs
+MAX_SINGLE_SEQUENCE_LENGTH = 3000
+MAX_MULTIMER_LENGTH = 3000
+PARAM_DIR = "/root/params"
+
+
+class UnifoldOptions(BaseModel):
+    sequence: String = Field(
+        min_length=6, max_length=3000,
+        description="Input sequence(s). For multimeric targets, please separate different chains with `;`.",
+    )
+    name: String = Field(
+        default="unifold", min_length=0, max_length=31,
+        description="Name of the target. "
+    )
+    symmetry_group: String = Field(default="C1")
+    use_template: Boolean = Field(default=True)
+    use_msa: Boolean = Field(default=True)
+    num_recycling: Int = Field(default=4, ge=1, le=8)
+    num_ensembles: Int = Field(default=2, ge=1, le=5)
+    num_replica: Int = Field(default=1, ge=1, le=5,
+        description="Times of repeatedly running Uni-Fold (with different preprocessing seeds for MSA sampling, etc.)."
+    )
+    seed: Int = Field(default=0, ge=0)
+    output_dir: OutputDirectory = Field(
+        default="./output"
+    )  # default will be override after online
+
+def main(opts: UnifoldOptions) -> int:
+    input_sequences = opts.sequence.strip().split(";")
+    symmetry_group = opts.symmetry_group
+    output_dir_base = opts.output_dir.get_full_path()
+    jobname = re.sub(r'[^a-zA-Z0-9]', '_', opts.name).lower()
+    target_id = jobname
+    use_templates = opts.use_template
+    msa_mode = "MMseqs2" if opts.use_msa else "single_sequence"
+    times = opts.num_replica
+    num_ensembles = opts.num_ensembles
+    num_recycling = opts.num_recycling
+    manual_seed = opts.seed
+
+    os.makedirs(output_dir_base, exist_ok=True)
+
+    sequences, is_multimer, symmetry_group = validate_input(
+        input_sequences=input_sequences,
+        symmetry_group=symmetry_group,
+        min_length=MIN_SINGLE_SEQUENCE_LENGTH,
+        max_length=MAX_SINGLE_SEQUENCE_LENGTH,
+        max_multimer_length=MAX_MULTIMER_LENGTH
+    )
+    descriptions = ['> '+target_id+' seq'+str(ii) for ii in range(len(sequences))]
+
+    if is_multimer:
+        divide_multi_chains(target_id, output_dir_base, sequences, descriptions)
+
+    s = []
+    for des, seq in zip(descriptions, sequences):
+        s += [des, seq]
+
+    unique_sequences = []
+    [unique_sequences.append(x) for x in sequences if x not in unique_sequences]
+
+    if len(unique_sequences)==1:
+        homooligomers_num = len(sequences)
+    else:
+        homooligomers_num = 1
+
+    with open(os.path.join(output_dir_base, f"{jobname}.fasta"), "w") as f:
+        f.write("\n".join(s))
+
+    result_dir = Path(output_dir_base)
+    output_dir = os.path.join(output_dir_base, target_id)
+
+    (
+        unpaired_msa,
+        paired_msa,
+        template_results,
+    ) = get_msa_and_templates(
+        target_id,
+        unique_sequences,
+        result_dir=result_dir,
+        msa_mode=msa_mode,
+        use_templates=use_templates,
+        homooligomers_num = homooligomers_num
+    )
+
+    for idx, seq in enumerate(unique_sequences):
+        chain_id = PDB_CHAIN_IDS[idx]
+        sequence_features = pipeline.make_sequence_features(
+                sequence=seq, description=f'> {jobname} seq {chain_id}', num_res=len(seq)
+            )
+        monomer_msa = parsers.parse_a3m(unpaired_msa[idx])
+        msa_features = pipeline.make_msa_features([monomer_msa])
+        template_features = template_results[idx]
+        feature_dict = {**sequence_features, **msa_features, **template_features}
+        feature_dict = compress_features(feature_dict)
+        features_output_path = os.path.join(
+                output_dir, "{}.feature.pkl.gz".format(chain_id)
+            )
+        pickle.dump(
+            feature_dict, 
+            gzip.GzipFile(features_output_path, "wb"), 
+            protocol=4
+            )
+        if is_multimer:
+            multimer_msa = parsers.parse_a3m(paired_msa[idx])
+            pair_features = pipeline.make_msa_features([multimer_msa])
+            pair_feature_dict = compress_features(pair_features)
+            uniprot_output_path = os.path.join(
+                output_dir, "{}.uniprot.pkl.gz".format(chain_id)
+            )
+            pickle.dump(
+                pair_feature_dict,
+                gzip.GzipFile(uniprot_output_path, "wb"),
+                protocol=4,
+            )
+
+    best_result = colab_inference(
+        target_id=target_id,
+        data_dir=output_dir,
+        param_dir=PARAM_DIR,
+        output_dir=output_dir,
+        symmetry_group=symmetry_group,
+        is_multimer=is_multimer,
+        max_recycling_iters=num_recycling - 1,
+        num_ensembles=num_ensembles,
+        times=times,
+        manual_seed=manual_seed,
+        device="cuda:0",                # do not change this on colab.
+    )
+
+    return
+
+
+
+def to_parser():
+    return to_runner(
+        UnifoldOptions,
+        main,
+        version='0.1.0',
+        exception_handler=default_minimal_exception_handler,
+    )
+
+if __name__ == '__main__':
+    import sys
+    to_parser()(sys.argv[1:])