# Summary

Run ELASPIC2 (EL2) to calculate the effect of interface mutations on binding affinity.

### Executing

```bash
export NOTEBOOK_PATH="$(realpath 20_el2_affinity.ipynb)"
export DATASET_NAME="elaspic-interface-mutation-local"
export ORIGINAL_ARRAY_TASK_COUNT=9
sbatch --export=DATASET_NAME,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --array=1-9 ../scripts/run_notebook_cpu.sh

export NOTEBOOK_PATH="$(realpath 20_el2_affinity.ipynb)"
export DATASET_NAME="uniprot-domain-pair-mutation"
export ORIGINAL_ARRAY_TASK_COUNT=1358
sbatch --export=DATASET_NAME,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --array=1000-1358 ../scripts/run_notebook_cpu.sh

# On Cedar, add the following option to the sbatch commands above:
#   --ntasks-per-node=48
```

---

## Imports

In [1]:
import os
import socket
import tempfile
from pathlib import Path

import elaspic2 as el2
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from kmbio import PDB
from kmtools import structure_tools
from tqdm.notebook import tqdm

## Parameters

In [2]:
# Working directory for this notebook, named after the notebook itself and
# resolved against the current working directory.
NOTEBOOK_DIR = Path("20_el2_affinity").resolve()
# Create it on first use; an already existing directory is fine.
NOTEBOOK_DIR.mkdir(parents=False, exist_ok=True)

NOTEBOOK_DIR

PosixPath('/project/6008029/strokach/workspace/elaspic2/notebooks/20_el2_affinity')

In [3]:
# Results go under $DATAPKG_OUTPUT_DIR/elaspic2 when that variable is set;
# otherwise they are written next to the notebook directory.
OUTPUT_DIR = (
    Path(os.getenv("DATAPKG_OUTPUT_DIR"), "elaspic2").resolve()
    if "DATAPKG_OUTPUT_DIR" in os.environ
    else NOTEBOOK_DIR.parent
)
# Only the final path component is created; the parent must already exist.
OUTPUT_DIR.mkdir(parents=False, exist_ok=True)

OUTPUT_DIR

PosixPath('/scratch/strokach/datapkg_output_dir/elaspic2')

In [4]:
# When running under Slurm, direct temporary files to the job-specific
# directory given by SLURM_TMPDIR.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # `tempfile` caches the directory it resolved on first use. Clear the cache
    # so the new TMPDIR is honoured even if a lookup already happened (for
    # example during one of the imports above, or when this cell is re-run).
    tempfile.tempdir = None

# Show the directory that temporary files will actually be created in.
print(tempfile.gettempdir())

/localscratch/strokach.42267810.0


In [5]:
# Hosts whose name contains "scinet" use a fixed count of 40; everywhere else
# the count is the number of CPUs this process is allowed to run on.
CPU_COUNT = 40 if "scinet" in socket.gethostname() else max(1, len(os.sched_getaffinity(0)))

# Use half of the detected CPUs, but never fewer than one.
CPU_COUNT = max(1, CPU_COUNT // 2)

CPU_COUNT

16

In [6]:
# Job parameters are read from environment variables; any of them may be unset.
DATASET_NAME = os.environ.get("DATASET_NAME")
TASK_ID = os.environ.get("SLURM_ARRAY_TASK_ID")
# ORIGINAL_ARRAY_TASK_COUNT, when set and non-empty, takes precedence over
# SLURM_ARRAY_TASK_COUNT.
TASK_COUNT = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get(
    "SLURM_ARRAY_TASK_COUNT"
)

# Environment values are strings; convert the ones that are present to integers.
if TASK_ID is not None:
    TASK_ID = int(TASK_ID)
if TASK_COUNT is not None:
    TASK_COUNT = int(TASK_COUNT)

DATASET_NAME, TASK_ID, TASK_COUNT

(None, None, None)

In [7]:
# No array task id means the notebook is not running as part of a job array:
# switch to debug mode and use a small built-in configuration. Otherwise,
# every job parameter must have been supplied through the environment.
DEBUG = TASK_ID is None

if not DEBUG:
    assert DATASET_NAME is not None
    assert TASK_ID is not None
    assert TASK_COUNT is not None
else:
    DATASET_NAME, TASK_ID, TASK_COUNT = "elaspic-interface-mutation-local", 1, 9

DATASET_NAME, TASK_ID, TASK_COUNT

('elaspic-interface-mutation-local', 1, 9)

In [8]:
# List the contents of the directory that holds the input datasets.
!ls {OUTPUT_DIR}/../elaspic-data/12_el2_to_recalculate

elaspic-core-mutation-local.parquet	  uniprot-domain-mutation.parquet
elaspic-interface-mutation-local.parquet  uniprot-domain-pair-mutation.parquet


## Workspace

### Load data

In [9]:
# Input parquet file for the selected dataset. `strict=True` makes this raise
# immediately if the file does not exist.
input_file = (
    OUTPUT_DIR / ".." / "elaspic-data" / "12_el2_to_recalculate" / f"{DATASET_NAME}.parquet"
).resolve(strict=True)

input_file

PosixPath('/scratch/strokach/datapkg_output_dir/elaspic-data/12_el2_to_recalculate/elaspic-interface-mutation-local.parquet')

In [10]:
# Open the input as a ParquetFile so that individual row groups can be read.
pfile = pq.ParquetFile(input_file)

# Number of row groups in the input file.
pfile.num_row_groups

9

In [11]:
# The declared task count must equal the number of row groups in the input file.
assert TASK_COUNT == pfile.num_row_groups, (TASK_COUNT, pfile.num_row_groups)
# Task ids start at 1 and each one selects a row group, so an id outside
# 1..num_row_groups has nothing to process. Fail here with a clear message
# rather than inside the parquet reader.
assert 1 <= TASK_ID <= pfile.num_row_groups, (TASK_ID, pfile.num_row_groups)

In [12]:
# Each task processes a single row group; `TASK_ID - 1` maps the task ids
# (which start at 1) to the row group index.
INPUT_DF = pfile.read_row_group(TASK_ID - 1).to_pandas(integer_object_nulls=True)

In [13]:
# Preview the first two rows and report how many rows this task will process.
display(INPUT_DF.head(2))
print(len(INPUT_DF))

Unnamed: 0,interface_id,mutation,model_filename_wt,chain_modeller,mutation_modeller,structure
0,4,P37A,/home/kimlab1/database_data/elaspic_v2/user_in...,B,P37A,FoldX generated pdb file\n\nOutput generated b...
1,7,E38A,/home/kimlab1/database_data/elaspic_v2/user_in...,A,E38A,FoldX generated pdb file\n\nOutput generated b...


500


### Create tasks

In [14]:
# ELASPIC2 model instance, used below to build structure features, analyze
# mutations and predict mutation effects.
model = el2.ELASPIC2()



In [15]:
# For every input row: write the structure to a temporary PDB file, extract the
# sequence of the mutated chain and of one partner chain, build EL2 features with
# and without the partner, and record the resulting scores.
results = []
for tup in tqdm(INPUT_DF.itertuples(), total=len(INPUT_DF)):
    # Rows without a structure (null or blank text) cannot be analyzed.
    if not isinstance(tup.structure, str) or not tup.structure.strip():
        continue

    # The temporary file is deleted when the `with` block exits, so everything
    # that reads the structure from disk must happen inside it.
    with tempfile.NamedTemporaryFile(mode="wt", suffix=".pdb") as structure_file_obj:
        structure_file_obj.write(tup.structure)
        # Flush so that readers opening the file by name see the full contents.
        structure_file_obj.flush()

        structure = PDB.load(structure_file_obj.name)
        protein_sequence = structure_tools.get_chain_sequence(
            structure[0][tup.chain_modeller], if_unknown="replace", unknown_residue_marker=""
        )

        # Use the first chain, other than the mutated one, that yields a
        # non-empty sequence as the ligand.
        ligand_sequence = ""
        for chain in structure[0].chains:
            if chain.id == tup.chain_modeller:
                continue
            ligand_sequence = structure_tools.get_chain_sequence(
                structure[0][chain.id], if_unknown="replace", unknown_residue_marker=""
            )
            if ligand_sequence:
                break
        if not ligand_sequence:
            print(f"Skipping row with no ligand sequence: {tup._replace(structure='')}")
            continue

        # Features for the protein alone (no ligand) ...
        protein_stability_features = model.build(
            structure_file=structure_file_obj.name,
            protein_sequence=protein_sequence,
            ligand_sequence=None,
            remove_hetatms=True,
        )
        # ... and for the protein together with its ligand.
        protein_affinity_features = model.build(
            structure_file=structure_file_obj.name,
            protein_sequence=protein_sequence,
            ligand_sequence=ligand_sequence,
            remove_hetatms=True,
        )

    mutation_stability_features = model.analyze_mutation(
        tup.mutation_modeller, protein_stability_features
    )
    mutation_affinity_features = model.analyze_mutation(
        tup.mutation_modeller, protein_affinity_features
    )

    # Get final predictions
    row = tup._asdict()
    # Drop the index and the bulky / path columns from the output row.
    del row["Index"], row["model_filename_wt"], row["structure"]

    row["protbert_score"] = (
        mutation_affinity_features["protbert_interface_score_wt"]
        - mutation_affinity_features["protbert_interface_score_mut"]
    )
    # NOTE(review): unlike `protbert_score`, this stores only the wild-type value
    # rather than a wt - mut difference. Confirm whether that is intentional
    # before changing it, since downstream consumers may depend on it.
    row["proteinsolver_score"] = mutation_affinity_features["proteinsolver_interface_score_wt"]
    row["el2_score"] = model.predict_mutation_effect(
        [mutation_stability_features], [mutation_affinity_features]
    ).item()

    results.append(row)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [16]:
# Combine the per-mutation result rows (a list of dicts) into one DataFrame.
results_df = pd.DataFrame(results)

results_df.head()

Unnamed: 0,interface_id,mutation,chain_modeller,mutation_modeller,protbert_score,proteinsolver_score,el2_score
0,4,P37A,B,P37A,0.994034,0.578799,0.936279
1,7,E38A,A,E38A,0.97716,0.033567,-0.253692
2,8,E38A,A,E38A,0.97716,0.020043,-1.3094
3,10,P37A,A,P37A,0.994034,0.359032,0.751546
4,15,N70W,G,N70W,0.905906,0.537607,1.375707


In [17]:
# Output path: <OUTPUT_DIR>/<notebook name>/<dataset>/<dataset>-<task id>-<task count>.parquet,
# with the task id and task count zero-padded to four digits.
output_file = (
    OUTPUT_DIR
    / NOTEBOOK_DIR.name
    / DATASET_NAME
    / f"{DATASET_NAME}-{TASK_ID:04d}-{TASK_COUNT:04d}.parquet"
)
# Make sure the per-dataset output directory exists.
output_file.parent.mkdir(parents=True, exist_ok=True)

output_file

PosixPath('/scratch/strokach/datapkg_output_dir/elaspic2/20_el2_affinity/elaspic-interface-mutation-local/elaspic-interface-mutation-local-0001-0009.parquet')

In [18]:
# Write the results to the output parquet file, leaving out the DataFrame index.
pq.write_table(pa.Table.from_pandas(results_df, preserve_index=False), output_file)

In [19]:
# Create (or truncate) an empty ".SUCCESS" file next to the output. It is
# written only after the parquet file above, so it marks a completed task.
output_file.with_suffix(".SUCCESS").write_text("")