## Summary

---

## Imports

In [None]:
import concurrent.futures
import json
import os
import re
import socket
import subprocess
import sys
import tempfile
from pathlib import Path

import kmbio
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from ev2.plugins.proteinsolver import (
    ProteinSolver,
    ProteinSolverAnalyzeError,
    ProteinSolverBuildError,
)
from kmbio import PDB
from tqdm.notebook import tqdm

In [None]:
# Load the ProteinSolver model once, ahead of the build/analyze calls made further down.
# NOTE(review): presumably this initialises shared state on the class — confirm in
# ev2.plugins.proteinsolver.
ProteinSolver.load_model()

## Parameters

In [None]:
# Directory "02_run_elaspic", resolved to an absolute path relative to the current working
# directory and created if it does not exist yet (presumably named after this notebook).
NOTEBOOK_DIR = Path("02_run_elaspic").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Display the resolved path.
NOTEBOOK_DIR

In [None]:
# Root directory for outputs: "<DATAPKG_OUTPUT_DIR>/elaspic-v2" when that environment
# variable is set, otherwise the parent of NOTEBOOK_DIR. Only the final path component is
# created here; missing parents are not.
_datapkg_root = os.getenv("DATAPKG_OUTPUT_DIR")
OUTPUT_DIR = (
    NOTEBOOK_DIR.parent
    if _datapkg_root is None
    else (Path(_datapkg_root) / "elaspic-v2").resolve()
)
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# When SLURM provides a job-local scratch directory, make it the default for temporary files.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # tempfile caches its directory after the first lookup; clearing the cache forces the
    # next gettempdir() call to re-read TMPDIR even if something already queried it.
    tempfile.tempdir = None

# Show which directory temporary files will actually be created in.
print(tempfile.gettempdir())

In [None]:
# Hosts whose name contains "scinet" get a fixed count of 40; everywhere else the count is
# the number of CPUs this process is allowed to run on.
if "scinet" in socket.gethostname():
    CPU_COUNT = 40
else:
    CPU_COUNT = max(1, len(os.sched_getaffinity(0)))

# Use half of the CPUs, but never fewer than one: with a single CPU the plain floor
# division gives 0, and ProcessPoolExecutor rejects a worker count of 0.
CPU_COUNT = max(1, CPU_COUNT // 2)

CPU_COUNT

In [None]:
def _optional_int(raw):
    """Convert an environment string to int, passing None through unchanged."""
    return None if raw is None else int(raw)


# Job configuration comes from the environment; each value is None when its variable is unset.
# ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM_ARRAY_TASK_COUNT when it is non-empty.
DATASET_NAME = os.environ.get("DATASET_NAME")
TASK_ID = _optional_int(os.environ.get("SLURM_ARRAY_TASK_ID"))
TASK_COUNT = _optional_int(
    os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get("SLURM_ARRAY_TASK_COUNT")
)

DATASET_NAME, TASK_ID, TASK_COUNT

In [None]:
# Debug mode is on whenever no task id was obtained from the environment.
DEBUG = TASK_ID is None

if DEBUG:
    # Defaults for a manual run: one task covering the whole "starr-2020-core" dataset.
    DATASET_NAME = "starr-2020-core"
    TASK_ID = 1
    TASK_COUNT = 1
else:
    # Fail early, and say which setting is missing, instead of failing later on a None value.
    assert DATASET_NAME is not None, "DATASET_NAME must be set when running as an array task."
    assert TASK_ID is not None, "SLURM_ARRAY_TASK_ID must be set when running as an array task."
    assert TASK_COUNT is not None, (
        "ORIGINAL_ARRAY_TASK_COUNT or SLURM_ARRAY_TASK_COUNT must be set "
        "when running as an array task."
    )

DATASET_NAME, TASK_ID, TASK_COUNT

## Workspace

### Load data

In [None]:
# Parquet file for this dataset, located in the "01_load_data" directory under OUTPUT_DIR.
input_file = OUTPUT_DIR / "01_load_data" / f"{DATASET_NAME}.parquet"

input_file

In [None]:
# Open the Parquet file through the ParquetFile interface so that a single row group can be
# read further down.
pfile = pq.ParquetFile(input_file)

# Number of row groups in the file (compared against TASK_COUNT below).
pfile.num_row_groups

In [None]:
# Each task reads one row group (selected by TASK_ID), so the number of tasks has to equal
# the number of row groups in the input file.
assert TASK_COUNT == pfile.num_row_groups, (
    f"TASK_COUNT ({TASK_COUNT}) does not match the number of row groups "
    f"in the input file ({pfile.num_row_groups})."
)

In [None]:
# Read only this task's row group. TASK_ID is treated as 1-based, hence the "- 1" to get the
# 0-based row group index.
INPUT_DF = pfile.read_row_group(TASK_ID - 1).to_pandas(integer_object_nulls=True)

In [None]:
# Quick look at the loaded data: first two rows and the total row count.
# `display` is not imported here; it is expected to be provided by the notebook environment.
display(INPUT_DF.head(2))
print(len(INPUT_DF))

### Create tasks

In [None]:
# Results go into a directory under OUTPUT_DIR that carries the same name as NOTEBOOK_DIR.
output_dir = OUTPUT_DIR / NOTEBOOK_DIR.name
output_dir.mkdir(exist_ok=True)

output_dir

In [None]:
# Flatten INPUT_DF into one row per (protein, mutation) pair: each input row carries parallel
# `mutation` and `effect` sequences, and mutations without a measured effect are dropped.
input_columns = ["unique_id", "mutation", "effect", "effect_type"]

input_data = []
for row in INPUT_DF.itertuples():
    for mutation, effect in zip(row.mutation, row.effect):
        if pd.isnull(effect):
            print(f"Skipping mutation {mutation} because the effect is unknown ({effect}).")
            continue
        input_data.append({
            "unique_id": row.unique_id,
            "mutation": mutation,
            "effect": effect,
            "effect_type": row.effect_type,
        })

# Passing the columns explicitly keeps the schema intact even when every mutation was skipped;
# otherwise the frame would have no columns and the later merge on "mutation" would fail.
input_df = pd.DataFrame(input_data, columns=input_columns)
len(input_df)

In [None]:
# Location of the precomputed ELASPIC output for each supported dataset.
_elaspic_dirs = {
    "starr-2020-core": "/home/kimlab1/database_data/elaspic_v2/user_input/b3a357/.elaspic",
    "starr-2020-interface": "/home/kimlab1/database_data/elaspic_v2/user_input/7cc10a/.elaspic",
}

if DATASET_NAME not in _elaspic_dirs:
    # Name the offending dataset rather than raising a bare, message-less Exception.
    raise ValueError(
        f"No ELASPIC results directory is configured for dataset {DATASET_NAME!r}; "
        f"expected one of {sorted(_elaspic_dirs)}."
    )

# strict=True makes a missing directory fail here instead of later in os.listdir().
elaspic_path = Path(_elaspic_dirs[DATASET_NAME]).resolve(strict=True)

In [None]:
def _foldx_ddg(data, key_prefix):
    """Return the mutant-minus-wildtype FoldX value for one result record.

    The ``<key_prefix>_wt`` and ``<key_prefix>_mut`` entries are comma-separated strings;
    only the first field of each is used.
    """
    energy_wt = float(data[f"{key_prefix}_wt"].split(",")[0])
    energy_mut = float(data[f"{key_prefix}_mut"].split(",")[0])
    return energy_mut - energy_wt


def _score_record(mutation, data, key_prefix):
    """Build one result row; core and interface rows share the same layout."""
    return {
        "mutation": mutation,
        "elaspic_score": float(data["ddg"]),
        "provean_score": float(data["provean_score"]),
        "foldx_score": _foldx_ddg(data, key_prefix),
    }


results_core = []
results_interface = []

# Interface records are ignored entirely when processing a "-core" dataset.
_skip_interface = DATASET_NAME.endswith("-core")

# os.listdir() returns entries in arbitrary order; sorting makes the row order reproducible.
for file_name in sorted(os.listdir(elaspic_path)):
    if not file_name.startswith("mutation_"):
        continue

    # The mutation is the third "_"-separated field of the file name without its extension.
    mutation = file_name.split(".")[0].split("_")[2]
    with elaspic_path.joinpath(file_name).open("rt") as fin:
        data_list = json.load(fin)

    for data in data_list:
        # The file name and the file contents must describe the same mutation.
        assert mutation == data["mutation"], (
            f"File {file_name} is named for mutation {mutation} "
            f"but contains results for {data['mutation']}."
        )

        # Records carrying an "idxs" key are treated as interface results, all others as core.
        if "idxs" not in data:
            results_core.append(_score_record(mutation, data, "stability_energy"))
            continue

        if _skip_interface:
            continue
        if data["idxs"] != [0, 1]:
            print(f"Skipping interaction {data}.")
            continue
        results_interface.append(_score_record(mutation, data, "analyse_complex_energy"))

In [None]:
# Tabulate the core results: one row per record collected above.
results_core_df = pd.DataFrame(results_core)

# Number of core result rows.
len(results_core_df)

In [None]:
# Tabulate the interface results with the same columns as the core table, so the frame has
# the expected schema even when no interface records were collected.
results_interface_df = pd.DataFrame(
    results_interface, columns=results_core_df.columns
).assign(provean_score=lambda frame: frame["provean_score"].fillna(np.nan))

len(results_interface_df)

In [None]:
def _prefer_interface(frame, score_name):
    """Return, row by row, the interface value of a score, falling back to the core value."""
    pairs = frame[[f"{score_name}_core", f"{score_name}_interface"]].values
    return [
        interface_value if pd.notnull(interface_value) else core_value
        for core_value, interface_value in pairs
    ]


# Every core row is kept; interface columns are null where a mutation has no interface result.
results_df = results_core_df.merge(
    results_interface_df,
    on=["mutation"],
    how="left",
    suffixes=("_core", "_interface"),
)

results_df["elaspic_score"] = _prefer_interface(results_df, "elaspic_score")

# Only the core PROVEAN score is used. The check below, comparing it with the interface value,
# is disabled.
# if results_df["provean_score_interface"].notnull().any():
#     assert np.allclose(results_df["provean_score_core"].values, results_df["provean_score_interface"].values, equal_nan=True)
results_df["provean_score"] = results_df[["provean_score_core"]].mean(axis=1)

results_df["foldx_score"] = _prefer_interface(results_df, "foldx_score")

In [None]:
# Number of measured mutations before joining with the results.
len(input_df)

In [None]:
# Attach the ELASPIC / PROVEAN / FoldX scores to the measured effects, matching on the mutation
# string. No `how` is given, so pandas performs an inner join and only mutations present in
# both tables remain.
# NOTE(review): the join key is "mutation" alone, not ("unique_id", "mutation") — this is only
# safe if the dataset covers a single protein; confirm.
input_wresults_df = input_df.merge(results_df, on=["mutation"])
len(input_wresults_df)  # Core: 749, Interface: 2891

In [None]:
from scipy import stats

# Spearman rank correlation between the measured effect and the negated ELASPIC score.
# With the sign flipped, a positive correlation means that lower scores go with higher effects.
# The trailing number is presumably the value observed in an earlier run.
stats.spearmanr(input_wresults_df["effect"], -input_wresults_df["elaspic_score"])  # 0.53153

In [None]:
from scipy import stats

# Spearman rank correlation between the measured effect and the negated FoldX score.
# With the sign flipped, a positive correlation means that lower scores go with higher effects.
# The trailing number is presumably the value observed in an earlier run.
stats.spearmanr(input_wresults_df["effect"], -input_wresults_df["foldx_score"])  # 0.4854

In [None]:
from scipy import stats

# Spearman rank correlation between the measured effect and the PROVEAN score. Unlike the
# ELASPIC and FoldX comparisons, the score is not negated here.
# The trailing number is presumably the value observed in an earlier run.
stats.spearmanr(input_wresults_df["effect"], input_wresults_df["provean_score"])  # 0.4741

In [None]:
# Reference values keyed by (dataset name, score column, "core").
# NOTE(review): presumably Spearman correlations recorded from earlier runs — confirm.
# NOTE(review): the third key element is "core" for the interface dataset as well, and nothing
# in the visible code reads this dict — confirm whether "interface" was intended.
test_stats = {
    # Core
    ("starr-2020-core", "elaspic_score", "core"): -0.081719600581758,
    ("starr-2020-core", "foldx_score", "core"): 0.48478755976010524,
    ("starr-2020-core", "provean_score", "core"): 0.4261149009423275,
    # Interface
    ("starr-2020-interface", "elaspic_score", "core"): 0.5140238363135885,
    ("starr-2020-interface", "foldx_score", "core"): 0.5294542669183915,
    ("starr-2020-interface", "provean_score", "core"): 0.4133588948415616,
}

In [None]:
# Always raises, which stops a top-to-bottom run of the notebook at this point.
# NOTE(review): presumably a deliberate stop so the ProteinSolver cells below only run when
# executed by hand — confirm.
raise Exception("Done!")

In [None]:
# Residue letters accepted on either side of the position in a mutation such as "A123G".
aa = "GVALICMFWPDESTYQNKRH"
# Compiled once here rather than inside the loop: <residue><position without leading zero><residue>.
mutation_pattern = re.compile(f"^[{aa}][1-9]+[0-9]*[{aa}]$")

# One task per valid mutation: (core data, interface data or None, mutation record).
tasks = []
for row in tqdm(INPUT_DF.itertuples(), total=len(INPUT_DF)):

    # ProteinSolver.build is given a file path, so the structure text is written to a temporary
    # .pdb file. It is written through the file's own handle and flushed, so the content is on
    # disk before build() reads it and the file is never opened a second time by name.
    with tempfile.NamedTemporaryFile("wt", suffix=".pdb") as tmp_file:
        tmp_file.write(row.protein_structure)
        tmp_file.flush()
        try:
            data_core = ProteinSolver.build(tmp_file.name, row.protein_sequence, None)
            if row.ligand_sequence is not None:
                data_interface = ProteinSolver.build(
                    tmp_file.name, row.protein_sequence, row.ligand_sequence
                )
            else:
                data_interface = None
        except ProteinSolverBuildError as e:
            # Skip the whole protein when its structure cannot be built.
            print(e)
            continue

    # Mutations already queued for this row; duplicates are reported and skipped.
    _seen = set()
    for idx, mutation in enumerate(row.mutation):
        if mutation in _seen:
            print(
                f"Already added mutation '{mutation}' for protein ({row.unique_id}, {row.dataset}, {row.name})."
            )
            continue
        _seen.add(mutation)

        if mutation_pattern.search(mutation) is None:
            print(f"Skipping mutation {mutation} because it appears to be malformed.")
            continue

        if mutation[0] == mutation[-1]:
            print(f"Skipping mutation {mutation} because the wildtype and mutant residues are the same.")
            continue

        data_mut = {
            "unique_id": row.unique_id,
            "mutation": mutation,
            "effect": row.effect[idx],
        }

        tasks.append((data_core, data_interface, data_mut))

len(tasks)

In [None]:
# Inspect the first task tuple: (data_core, data_interface, data_mut).
tasks[0]

### Evaluate mutations

In [None]:
def worker(input):
    """Score one mutation with ProteinSolver and return a flat result record.

    Args:
        input: A ``(data_core, data_interface, data_mut)`` tuple. ``data_interface`` may be
            ``None``, in which case no interface analysis is run. ``data_mut`` is a dict that
            contains at least a ``"mutation"`` entry.

    Returns:
        A dict holding every entry of ``data_mut`` followed by the core results under
        ``proteinsolver_core_*`` keys and the interface results under
        ``proteinsolver_interface_*`` keys, or ``None`` when ProteinSolver raises
        ``ProteinSolverAnalyzeError`` (the error is printed).
    """
    core_data, interface_data, mutation_record = input
    # The mutation is passed to ProteinSolver with an "A_" prefix.
    prefixed_mutation = f"A_{mutation_record['mutation']}"

    try:
        core_scores = ProteinSolver.analyze_mutation(prefixed_mutation, core_data)
        interface_scores = (
            ProteinSolver.analyze_mutation(prefixed_mutation, interface_data)
            if interface_data is not None
            else {}
        )
    except ProteinSolverAnalyzeError as error:
        print(error)
        return None

    combined = dict(mutation_record)
    combined.update(
        (f"proteinsolver_core_{name}", score) for name, score in core_scores.items()
    )
    combined.update(
        (f"proteinsolver_interface_{name}", score) for name, score in interface_scores.items()
    )
    return combined

In [None]:
# Run the worker on a single task in this process before fanning out to the process pool.
worker(tasks[0])

In [None]:
# Evaluate every task in a pool of CPU_COUNT worker processes; map() yields results in task
# order and tqdm reports progress as they are consumed.
with concurrent.futures.ProcessPoolExecutor(CPU_COUNT) as executor:
    progress = tqdm(executor.map(worker, tasks), total=len(tasks))
    results = list(progress)

# Tasks for which the worker returned None (analysis failed) are left out of the table.
successful_results = [result for result in results if result is not None]
results_df = pd.DataFrame(successful_results)

In [None]:
# Output file inside output_dir, named "<dataset>-<task id>-<task count>.parquet".
output_file = output_dir.joinpath(f"{DATASET_NAME}-{TASK_ID}-{TASK_COUNT}.parquet")

output_file

In [None]:
# Convert the results to an Arrow table (dropping the pandas index) and write it as Parquet.
results_table = pa.Table.from_pandas(results_df, preserve_index=False)
pq.write_table(results_table, output_file)