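## Summary

Builds a two-chain structure from PDB entry 6M0J (chain E renamed to A, chain A renamed to B), matches a downloaded mutation table against chain A, removes mutations that already have a `ddg` value in the ELASPIC database, and assembles the job payload for the remaining mutations. The POST request itself is currently commented out, and the cells after the payload are exploratory scratch work.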

---

## Imports

In [None]:
from dotenv import load_dotenv

# Load variables from ../.env; DATA_DIR, DB_* and API_TOKEN are read from os.environ further down.
load_dotenv("../.env", override=True)

In [None]:
import io
import os
import random
import tempfile
from pathlib import Path

import pandas as pd
import requests
import sqlalchemy as sa
from kmbio import PDB
from kmtools import structure_tools

In [None]:
# Use fully qualified option names: the short forms "max_columns" / "max_rows" are resolved by
# pattern matching and become ambiguous (OptionError) once pandas also registers
# "styler.render.max_columns" / "styler.render.max_rows".
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_rows", 1_000)

## Parameters

In [None]:
# Output directory for this notebook, resolved against the current working directory
# and created if it does not exist yet. Its name is also used as the "dataset" label
# in parse_mutation_data.
NOTEBOOK_DIR = Path("03_submit_mutations_affinity").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

In [None]:
# Job identifier: used for the job directory, as the protein_id filter in the database query,
# and as both job_id and protein_id in the job payload.
JOB_ID = "spike-sars2-in"

JOB_ID

In [None]:
# $DATA_DIR/user_input must already exist: resolve(strict=True) raises if it does not.
USER_INPUT_DIR = Path(os.environ["DATA_DIR"], "user_input").resolve(strict=True)
# Per-job subdirectory, created on first run.
JOB_DIR = USER_INPUT_DIR.joinpath(JOB_ID)
JOB_DIR.mkdir(exist_ok=True)

JOB_DIR

In [None]:
# Address of the ELASPIC REST API.
# NOTE: this name is reassigned twice in the "Submit jobs" section further down.
ELASPIC_REST_API_URL = "http://localhost:8055"

ELASPIC_REST_API_URL

In [None]:
# Database connection settings, all read from the environment (a missing variable raises KeyError).
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_HOST = os.environ["DB_HOST"]
DB_PORT = os.environ["DB_PORT"]

# NOTE(review): the credentials are interpolated into the URL unescaped; a password containing
# characters such as "@" or "/" would presumably need URL-quoting first — confirm.
engine = sa.create_engine(f"mysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/elaspic_webserver")

## Helper functions

In [None]:
def download_csv(url, timeout=60):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    The request is sent with browser-like headers (a Chrome User-Agent and
    ``X-Requested-With: XMLHttpRequest``).

    Args:
        url: Address of the CSV file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout`` seconds.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    response = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as CSV.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.text))

In [None]:
def structure_to_blob(structure):
    """Return the text that `PDB.save` writes for `structure`.

    The structure is saved to a temporary file with a ".pdb" suffix and the
    file contents are read back as a string.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp_file:
        pdb_path = tmp_file.name
        PDB.save(structure, pdb_path)
        # Read the file back before the context manager removes it.
        with open(pdb_path, "rt") as handle:
            blob = handle.read()
    return blob

In [None]:
def sequence_matches_structure(sequence, structure_blob):
    """Check whether `sequence` equals the sequence of chain "A" in `structure_blob`.

    `structure_blob` is written to a temporary ".pdb" file, loaded with
    `PDB.load`, and the sequence of chain "A" of the first model is compared
    with `sequence`.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdb") as scratch:
        with open(scratch.name, "wt") as handle:
            handle.write(structure_blob)
        # Load while the temporary file still exists.
        parsed = PDB.load(scratch.name)

    chain_a = parsed[0]["A"]
    observed = structure_tools.get_chain_sequence(
        chain_a, if_unknown="replace", unknown_residue_marker=""
    )
    return sequence == observed

In [None]:
def parse_mutation_data(structure, stability_df):
    """Match the rows of a mutation table against chain "A" of `structure`.

    Args:
        structure: Structure whose first model contains a chain "A".
        stability_df: DataFrame with the columns `site_SARS2`, `wildtype`,
            `mutant` and `expr_avg`.

    Returns:
        A one-element list holding a dict with the keys "dataset", "name",
        "protein_sequence", "mutation", "effect", "effect_type" and
        "protein_structure". "mutation" and "effect" are parallel lists; each
        mutation is written as ``<wildtype><1-based chain index><mutant>``.

    Rows are skipped when the site is absent from the chain or the wildtype
    residue disagrees with the chain sequence (counted and reported as
    mismatches), when either residue is not one of the 20 standard amino
    acids, when wildtype and mutant are identical, or when `expr_avg` is null.
    """
    amino_acids = "GVALICMFWPDESTYQNKRH"
    assert len(set(amino_acids)) == 20

    chain_sequence = structure_tools.get_chain_sequence(
        structure[0]["A"], if_unknown="replace", unknown_residue_marker=""
    )
    # Residue number -> position of the residue within the chain.
    # NOTE(review): with an empty unknown_residue_marker the sequence is presumably shorter than
    # the chain whenever the chain holds unrecognised residues, so positions and sequence
    # indices may not line up for such chains — confirm.
    resnum_to_idx_mapping = {r.id[1]: i for i, r in enumerate(structure[0]["A"])}

    mutations = []
    effects = []
    num_mismatches = 0
    for tup in stability_df.itertuples():
        idx = resnum_to_idx_mapping.get(tup.site_SARS2)
        # Positions beyond the end of the sequence are treated as mismatches
        # rather than raising IndexError.
        if idx is None or idx >= len(chain_sequence) or chain_sequence[idx] != tup.wildtype:
            num_mismatches += 1
            continue
        if tup.wildtype not in amino_acids or tup.mutant not in amino_acids:
            # "*" mutants are skipped silently; anything else unexpected is reported.
            if tup.mutant != "*":
                print(f"Skipping strange mutation {tup.wildtype} → {tup.mutant}.")
            continue
        elif tup.wildtype == tup.mutant:
            # Exclude "mutations" where the wildtype and mutant residues are the same
            continue
        elif pd.isnull(tup.expr_avg):
            # For some reason, the effect is missing in some rows
            continue
        mutations.append(f"{tup.wildtype}{idx + 1}{tup.mutant}")
        # NOTE(review): the notebook directory is named for affinity, but the effect is read
        # from `expr_avg` — confirm this is the intended column.
        effects.append(tup.expr_avg)
    print(f"Num mismatches: {num_mismatches}.")

    results = [
        {
            "dataset": NOTEBOOK_DIR.name,
            "name": "spike-unbound|spike-closed",
            "protein_sequence": chain_sequence,
            "mutation": mutations,
            "effect": effects,
            "effect_type": "Deep mutation scan",
            "protein_structure": structure_to_blob(structure),
        }
    ]
    return results

## Workspace

### Load protein structure

In [None]:
# Reference structure for entry 6M0J (presumably fetched from RCSB through the "rcsb://" scheme — confirm).
structure_ref = PDB.load("rcsb://6M0J.pdb")

In [None]:
# Visual check of the reference structure as loaded.
PDB.view_structure(structure_ref)

In [None]:
# Assemble a new single-model structure from copies of two chains of the reference,
# swapping their order: reference chain "E" becomes chain "A" and reference chain "A"
# becomes chain "B". Downstream cells read the mutated protein from chain "A".
structure = PDB.Structure("rbd-ace2")
model = PDB.Model(0)
structure.add(model)
chain2 = structure_ref[0]["A"].copy()
chain2.id = "B"
chain1 = structure_ref[0]["E"].copy()
chain1.id = "A"
# chain1 (the new "A") is added first so that it is the first chain in the model.
model.add(chain1)
model.add(chain2)

In [None]:
# Visual check of the re-assembled two-chain structure.
PDB.view_structure(structure)

### Load a list of mutations

In [None]:
# Download the mutation table (CSV); parse_mutation_data reads its site_SARS2, wildtype,
# mutant and expr_avg columns.
affinity_df = download_csv("https://ars.els-cdn.com/content/image/1-s2.0-S0092867420310035-mmc2.csv")

In [None]:
# Preview the last rows of the downloaded table.
affinity_df.tail()

### Parse mutation data

In [None]:
# parse_mutation_data returns a one-element list; keep its only entry.
protein_mutation_info = parse_mutation_data(structure, affinity_df)[0]

list(protein_mutation_info.keys())

In [None]:
# Show the first parsed mutation as a format check (<wildtype><position><mutant>).
protein_mutation_info["mutation"][0]

### Remove finished mutations

In [None]:
# Fetch the rows of elaspic_core_mutation_local for this job that already have a ddg value.
# JOB_ID is a constant defined in this notebook and is interpolated directly into the query.
sql = f"""\
select *
from elaspic_core_mutation_local
where protein_id = '{JOB_ID}' and ddg is not NULL;
"""

finished_df = pd.read_sql_query(sql, engine)
len(finished_df)

In [None]:
# Preview the first rows returned by the query.
finished_df.head()

In [None]:
# Keep only rows with a ddg value. The SQL query that produced finished_df already filters on
# `ddg is not NULL`, so this is expected to leave the row count unchanged.
finished_df = finished_df[finished_df["ddg"].notnull()]
len(finished_df)

In [None]:
# Total number of parsed mutations, for comparison with the finished and unfinished counts.
len(protein_mutation_info["mutation"])

In [None]:
# Mutations that already have a result in the database.
finished_mutations = set(finished_df["mutation"])
# Everything else still has to be submitted. Sort by position, then by mutation string, so the
# submission order is reproducible: plain set iteration order varies between kernel sessions.
unfinished_mutations = sorted(
    set(protein_mutation_info["mutation"]) - finished_mutations,
    key=lambda mutation: (int(mutation[1:-1]), mutation),
)
# random.shuffle(unfinished_mutations)

len(unfinished_mutations)

In [None]:
# Validate all unfinished mutations
# Each mutation string is "<wildtype letter><1-based position><mutant letter>". For every one,
# the residue name at that position in the first chain (chain_idx == 0) is compared against
# structure_tools.constants.A_DICT[<wildtype letter>].
df = structure.to_dataframe()
df = df[df["chain_idx"] == 0]

for mutation in unfinished_mutations:
    # `.values[0]` raises IndexError when no row has the requested residue_idx.
    assert df[(df["residue_idx"] == (int(mutation[1:-1]) - 1))]["residue_resname"].values[0] == structure_tools.constants.A_DICT[mutation[0]]

### Submit jobs

In [None]:
# Prefix every mutation with "1_" and join them into one comma-separated string for the payload.
# NOTE(review): "1_" presumably selects the first chain of the structure — confirm against the ELASPIC API.
mutations = unfinished_mutations
pdb_mutations = [f"1_{mutation}" for mutation in mutations]
pdb_mutation = ",".join(pdb_mutations)

# Preview only the first 1000 characters.
pdb_mutation[:1000]

In [None]:
# Request body for the ELASPIC REST API. The API token is read from the environment.
# NOTE(review): job_email is hard-coded, and "input.pdb" is not written by any cell visible in
# this notebook — confirm the file exists where the server expects it.
job_payload = {
    "api_token": os.environ["API_TOKEN"],
    "job_id": JOB_ID,
    "job_type": "local",
    "job_email": "alexey.strokach@kimlab.org",
    "mutations": [
        {
            "mutations": pdb_mutation,
            "protein_id": JOB_ID,
            "structure_file": "input.pdb",
        }
    ],
}

In [None]:
# Point the API URL at a hard-coded host; the commented-out line keeps an alternative host.
# NOTE(review): this overrides the value set in the Parameters section.
# ELASPIC_REST_API_URL = "http://192.168.6.18:8080/api/v1/"
ELASPIC_REST_API_URL = "http://192.168.6.241:8080/api/v1/"

ELASPIC_REST_API_URL

In [None]:
# r = requests.post(ELASPIC_REST_API_URL, json=job_payload)
# if not r.ok:
#     print(f"Bad response from ELASPIC REST server: {r}")
# else:
#     status = r.json().get("status", None)
#     print(f"status: {status}")

In [None]:
# Switch the API URL back to localhost (this time with a trailing slash).
ELASPIC_REST_API_URL = "http://localhost:8055/"

ELASPIC_REST_API_URL

In [None]:
!ls ../../elaspic2/notebooks/07_benchmarks

In [None]:
# Load a CSV from a sibling checkout (path relative to the working directory); its "mutation"
# and "el2_pred" columns are used below.
# NOTE: `df` was previously bound to the structure dataframe used for validation and is rebound here.
df = pd.read_csv("../../elaspic2/notebooks/07_benchmarks/el2_rbd_affinity.csv")

In [None]:
# Preview the first rows of the loaded CSV.
df.head()

In [None]:
# Attach el2_pred to finished_df by mutation. The guard keeps a re-run of this cell from
# merging a second time. how="left" keeps every finished_df row; mutations absent from df
# end up with a null el2_pred.
if "el2_pred" not in finished_df:
    finished_df = finished_df.merge(df[["mutation", "el2_pred"]], how="left", on=["mutation"])

In [None]:
# assert finished_df["el2_pred"].notnull().all()

In [None]:
# Show the finished rows that have no el2_pred value after the merge.
finished_df[finished_df["el2_pred"].isnull()]

In [None]:
# with engine.connect() as conn:
#     for tup in finished_df.itertuples():
#         if pd.isnull(tup.el2_pred):
#             continue
#         sql = f"""\
# update elaspic_interface_mutation_local
# set el2_score = {tup.el2_pred}
# where protein_id = '{tup.protein_id}' and mutation = '{tup.mutation}'
# """
#         conn.execute(sql)

In [None]:
# Example result URL, kept as a comment because a bare URL is not valid Python and would stop
# "Run All" at this cell with a SyntaxError:
# http://elaspic.ccbr.proteinsolver.org/result/spike-sars2-in/6zoy-rbd.pdb.F165T/?p=h11709

In [None]:
import re


def getPnM(p):
    """Split a ``PROT.MUT`` string into its upper-cased protein and mutation parts.

    The mutation part is the text after the last dot: one letter, one or more
    digits, one letter, then an optional underscore and optional digits.
    Returns ``(None, None)`` when `p` does not have that form.
    """
    parsed = re.match(r"(.+)\.([A-Za-z]{1}[0-9]+[A-Za-z]{1}_?[0-9]*)$", p)
    if parsed is None:
        return None, None
    protein, mutation = parsed.groups()
    return protein.upper(), mutation.upper()

In [None]:
# Scratch example: split a result path on "/". Because the path starts and ends with "/",
# the first and last elements of the resulting list are empty strings.
path = "/result/spike-sars2-in/6zoy-rbd.pdb.F165T/"

currentIDs = path.split("/")

currentIDs

In [None]:
# For the example path, element 3 is "6zoy-rbd.pdb.F165T".
getPnM(currentIDs[3])

In [None]:
# Scratch: example result URL kept as a string; not used by any other visible cell.
xxx ="http://elaspic.ccbr.proteinsolver.org/result/spike-sars2-in/6zoy-rbd.pdb.F165T/?p=h11709"

In [None]:
!ls -al

In [None]:
# FIXME: `os.makedirs()` was called without a path, which always raises TypeError (the `name`
# argument is required) and stops "Run All" here. Disabled until the intended directory is known.
# os.makedirs()

In [None]:
# `random` is already imported in the imports cell at the top of the notebook.
import random

# Scratch: random zero-padded 6-digit hexadecimal string; 16777215 == 0xFFFFFF.
"%06x" % random.randint(1, 16777215)

In [None]:
import uuid

# Scratch: first 12 hexadecimal characters of a random UUID4.
uuid.uuid4().hex[:12]

In [None]:
# Scratch: octal literal, evaluates to 2.
0o002