## Summary

---

## Imports

In [None]:
import concurrent.futures
import json
import os
import re
import socket
import subprocess
import sys
import tempfile
from pathlib import Path

import kmbio
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from elaspic2.plugins.proteinsolver import (
    ProteinSolver,
    ProteinSolverAnalyzeError,
    ProteinSolverBuildError,
)
from kmbio import PDB
from scipy import stats
from tqdm.notebook import tqdm

In [None]:
# Call ProteinSolver's model loader up front (presumably loads the pretrained
# weights — confirm against elaspic2).
# NOTE(review): ProteinSolver is not referenced again in the visible cells —
# confirm this call is still needed.
ProteinSolver.load_model()

## Parameters

In [None]:
# Directory named after this notebook, located under the current working
# directory, resolved to an absolute path and created if it does not exist.
NOTEBOOK_DIR = Path("02_run_elaspic").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Bare expression so the notebook shows the resolved path as the cell output.
NOTEBOOK_DIR

In [None]:
# Root folder for all outputs: "$DATAPKG_OUTPUT_DIR/elaspic2" when that
# environment variable is set, otherwise the parent of the notebook directory.
datapkg_output_dir = os.environ.get("DATAPKG_OUTPUT_DIR")
if datapkg_output_dir is None:
    OUTPUT_DIR = NOTEBOOK_DIR.parent
else:
    OUTPUT_DIR = (Path(datapkg_output_dir) / "elaspic2").resolve()

# Only the last path component is created; parent folders must already exist.
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# When SLURM provides a job-local scratch directory, point TMPDIR at it so
# that the tempfile module picks it up.
slurm_tmpdir = os.getenv("SLURM_TMPDIR")
if slurm_tmpdir is not None:
    os.environ["TMPDIR"] = slurm_tmpdir

# Show which temporary directory is actually in effect.
print(tempfile.gettempdir())

In [None]:
# Number of worker CPUs to use: half of what is available, but never fewer
# than one.
if "scinet" in socket.gethostname():
    # Hard-coded count for hosts whose name contains "scinet".
    CPU_COUNT = 40
elif hasattr(os, "sched_getaffinity"):
    # CPUs this process is actually allowed to run on (not available on every
    # platform, hence the hasattr guard).
    CPU_COUNT = len(os.sched_getaffinity(0))
else:
    CPU_COUNT = os.cpu_count() or 1

# Clamp *after* halving: a plain `// 2` turns a single available CPU into 0.
CPU_COUNT = max(1, CPU_COUNT // 2)

CPU_COUNT

In [None]:
def _int_or_none(raw):
    """Convert an environment string to int, passing None through unchanged."""
    return None if raw is None else int(raw)


# Job configuration comes from the environment; anything unset stays None.
DATASET_NAME = os.environ.get("DATASET_NAME")
TASK_ID = _int_or_none(os.environ.get("SLURM_ARRAY_TASK_ID"))
# ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM's own task count
# whenever it is set to a non-empty value.
TASK_COUNT = _int_or_none(
    os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get("SLURM_ARRAY_TASK_COUNT")
)

DATASET_NAME, TASK_ID, TASK_COUNT

In [None]:
# No task id means the notebook is not running as an array task: treat it as
# an interactive/debug run with a fixed dataset and a single task.
DEBUG = TASK_ID is None

if DEBUG:
    DATASET_NAME = "starr-2020-interface"
    TASK_ID = 1
    TASK_COUNT = 1
else:
    # Array-task run: every setting must have been provided. The messages say
    # which one is missing instead of failing with a bare AssertionError.
    assert DATASET_NAME is not None, "DATASET_NAME must be set when running as an array task."
    assert TASK_ID is not None, "TASK_ID must be set when running as an array task."
    assert TASK_COUNT is not None, (
        "TASK_COUNT is unknown: set ORIGINAL_ARRAY_TASK_COUNT or SLURM_ARRAY_TASK_COUNT."
    )

DATASET_NAME, TASK_ID, TASK_COUNT

## Workspace

### Load data

In [None]:
# Parquet file for the selected dataset inside the "01_load_data" output folder.
input_file = OUTPUT_DIR / "01_load_data" / f"{DATASET_NAME}.parquet"

input_file

In [None]:
# Open the Parquet file without loading it; a single row group is read from
# it further down.
pfile = pq.ParquetFile(input_file)

# Shown as cell output: how many row groups the file contains.
pfile.num_row_groups

In [None]:
# Each task reads exactly one row group (row group TASK_ID - 1), so the number
# of tasks has to equal the number of row groups in the file.
assert TASK_COUNT == pfile.num_row_groups

In [None]:
# TASK_ID counts from 1 while row groups are indexed from 0.
# integer_object_nulls=True is the pyarrow option that casts integer columns
# containing nulls to Python objects.
INPUT_DF = pfile.read_row_group(TASK_ID - 1).to_pandas(integer_object_nulls=True)

In [None]:
# Sanity check: preview the first two rows and print the number of rows.
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm before running this outside a notebook.
display(INPUT_DF.head(2))
print(len(INPUT_DF))

### Create tasks

In [None]:
# Per-notebook output folder: same name as the notebook directory, placed
# under OUTPUT_DIR and created on first use.
output_dir = OUTPUT_DIR / NOTEBOOK_DIR.name
output_dir.mkdir(exist_ok=True)

output_dir

In [None]:
# Flatten INPUT_DF into one record per (row, mutation) pair. Each row carries
# parallel `mutation` and `effect` sequences; pairs whose effect is null are
# reported and left out.
input_data = []
for row in INPUT_DF.itertuples():
    for mutation, effect in zip(row.mutation, row.effect):
        if pd.notnull(effect):
            input_data.append(
                dict(
                    unique_id=row.unique_id,
                    mutation=mutation,
                    effect=effect,
                    effect_type=row.effect_type,
                )
            )
        else:
            print(f"Skipping mutation {mutation} because the effect is unknown ({effect}).")

input_df = pd.DataFrame(input_data)
len(input_df)

In [None]:
# ELASPIC output folder for each dataset this notebook knows how to process.
_ELASPIC_PATHS = {
    "starr-2020-core": "/home/kimlab1/database_data/elaspic_v2/user_input/spike-sars2-co/.elaspic",
    "starr-2020-interface": "/home/kimlab1/database_data/elaspic_v2/user_input/spike-sars2-in/.elaspic",
}

if DATASET_NAME not in _ELASPIC_PATHS:
    # Name the offending value and the accepted ones instead of raising a
    # bare, message-less Exception.
    raise ValueError(
        f"Unsupported DATASET_NAME {DATASET_NAME!r}; expected one of {sorted(_ELASPIC_PATHS)}."
    )

# strict=True makes resolve() raise FileNotFoundError if the folder is missing.
elaspic_path = Path(_ELASPIC_PATHS[DATASET_NAME]).resolve(strict=True)

In [None]:
# Collect per-mutation scores from every "mutation_*" JSON file in the ELASPIC
# folder. Entries that carry an "idxs" key are stored as interface results,
# all other entries as core results.
results_core = []
results_interface = []

for file in os.listdir(elaspic_path):
    if not file.startswith("mutation_"):
        continue

    # The mutation is the third "_"-separated token of the name before the first ".".
    mutation = file.split(".")[0].split("_")[2]
    with elaspic_path.joinpath(file).open("rt") as fin:
        data_list = json.load(fin)

    for data in data_list:
        assert mutation == data["mutation"]

        if "idxs" in data:
            # Interface entries are ignored entirely for "-core" datasets.
            if DATASET_NAME.endswith("-core"):
                continue
            # Only the interaction with idxs [0, 1] is kept.
            if data["idxs"] != [0, 1]:
                print(f"Skipping interaction {data}.")
                continue
            energy_key = "analyse_complex_energy"
            destination = results_interface
        else:
            energy_key = "stability_energy"
            destination = results_core

        # The FoldX score is the first comma-separated field of the mutant
        # energy string minus that of the wild type.
        foldx_score_wt = float(data[f"{energy_key}_wt"].split(",")[0])
        foldx_score_mut = float(data[f"{energy_key}_mut"].split(",")[0])
        destination.append(
            {
                "mutation": mutation,
                "elaspic_score": float(data["ddg"]),
                "provean_score": float(data["provean_score"]),
                "foldx_score": foldx_score_mut - foldx_score_wt,
            }
        )

In [None]:
# Tabulate the records gathered in results_core (entries without an "idxs" key).
results_core_df = pd.DataFrame(results_core)

len(results_core_df)

In [None]:
# Reuse the core table's columns so the interface table has the same columns
# even when results_interface is empty (interface entries are skipped for
# "-core" datasets).
results_interface_df = pd.DataFrame(results_interface, columns=results_core_df.columns)
# Replace missing provean scores with NaN.
results_interface_df["provean_score"] = results_interface_df["provean_score"].fillna(np.nan)

len(results_interface_df)

In [None]:
# Left join on mutation: every core row is kept, and interface columns are
# filled in where a matching interface row exists.
results_df = results_core_df.merge(
    results_interface_df,
    on=["mutation"],
    how="left",
    suffixes=("_core", "_interface"),
)

# ELASPIC score: use the interface value when there is one, else the core value.
results_df["elaspic_score"] = [
    core_value if pd.isnull(interface_value) else interface_value
    for core_value, interface_value in zip(
        results_df["elaspic_score_core"], results_df["elaspic_score_interface"]
    )
]

# Provean score: only the core column is used (a row-wise mean over that single
# column). A consistency check against the interface column was left disabled
# in the original cell.
results_df["provean_score"] = results_df[["provean_score_core"]].mean(axis=1)

# FoldX score: same rule as for the ELASPIC score.
results_df["foldx_score"] = [
    core_value if pd.isnull(interface_value) else interface_value
    for core_value, interface_value in zip(
        results_df["foldx_score_core"], results_df["foldx_score_interface"]
    )
]

In [None]:
# Number of input mutations before they are joined with the ELASPIC results.
len(input_df)

In [None]:
# Attach the scores to the measured effects. merge() defaults to an inner
# join, so mutations missing from either table are dropped.
input_wresults_df = input_df.merge(results_df, on=["mutation"])
len(input_wresults_df)  # Core: 749, Interface: 2891

In [None]:
# Spearman rank correlation between the measured effect and the negated ELASPIC score.
# NOTE(review): the sign flip presumably aligns the score's direction with the effect's — confirm.
# NOTE(review): the two numbers in the trailing comment are presumably the core / interface correlations — confirm.
stats.spearmanr(input_wresults_df["effect"], -input_wresults_df["elaspic_score"])  # 0.5014374415058359 / 0.5573847932834505

In [None]:
# Spearman rank correlation between the measured effect and the negated FoldX score.
# NOTE(review): the sign flip presumably aligns the score's direction with the effect's — confirm.
# NOTE(review): the two numbers in the trailing comment are presumably the core / interface correlations — confirm.
stats.spearmanr(input_wresults_df["effect"], -input_wresults_df["foldx_score"])  # 0.4621773590138444 / 0.5118665337096965

In [None]:
# Spearman rank correlation between the measured effect and the Provean score
# (used as-is, without a sign flip).
# NOTE(review): the two numbers in the trailing comment are presumably the core / interface correlations — confirm.
stats.spearmanr(input_wresults_df["effect"], input_wresults_df["provean_score"])  # 0.4433472260225827 / 0.4561308022195954

In [None]:
def _shift_mutation_resnum(mutation, offset=320):
    """Return `mutation` with its residue number increased by `offset`.

    A mutation is one leading character, an integer residue number and one
    trailing character; only the number changes.
    """
    return f"{mutation[0]}{int(mutation[1:-1]) + offset}{mutation[-1]}"


# Work on a copy so that input_wresults_df keeps its original numbering.
df = input_wresults_df.copy()

# NOTE(review): the +320 offset presumably converts to the numbering used by
# the benchmark files — confirm.
df["mutation"] = df["mutation"].apply(_shift_mutation_resnum)
df = df[["mutation", "elaspic_score", "provean_score", "foldx_score"]]

output_file = f"07_benchmarks/elaspic-{DATASET_NAME}.csv"
# to_csv() does not create missing folders; create the target folder first,
# as is done for the other output directories in this notebook.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file, index=False)

output_file