## Summary

Calculate features for every mutation using [Rosetta's `cartesian_ddg` protocol](https://www.rosettacommons.org/docs/latest/cartesian-ddG), in both the wild-type → mutant and the mutant → wild-type direction.

### Executing

```bash
DATASET_NAME="elaspic-training-set-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-162 ../scripts/run_notebook_cpu.sh

DATASET_NAME="protherm-dagger-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-2 ../scripts/run_notebook_cpu.sh

DATASET_NAME="rocklin-2017-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-1 ../scripts/run_notebook_cpu.sh

```


---

## Imports

In [None]:
import concurrent.futures
import os
import re
import socket
import sys
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from ev2.plugins.modeller import Modeller
from ev2.plugins.rosetta_ddg import RosettaDDG
from kmbio import PDB
from kmtools.structure_tools import DomainTarget
from tqdm.notebook import tqdm

## Parameters

In [None]:
# Working directory named after this notebook, resolved against the current
# working directory and created if it does not exist yet.
NOTEBOOK_DIR = Path("02_run_rosetta_ddg").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Show the resolved path in the cell output.
NOTEBOOK_DIR

In [None]:
# Results go to `$DATAPKG_OUTPUT_DIR/elaspic-v2` when that environment variable
# is set; otherwise they are written to the parent of the notebook directory.
OUTPUT_DIR = (
    Path(os.environ["DATAPKG_OUTPUT_DIR"]).joinpath("elaspic-v2").resolve()
    if "DATAPKG_OUTPUT_DIR" in os.environ
    else NOTEBOOK_DIR.parent
)
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# When running under SLURM, direct temporary files to the job's own
# temporary directory.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # `tempfile` caches its directory after the first lookup. Clear the cache
    # so that the new TMPDIR is honoured even if something (e.g. an imported
    # library) already called `tempfile.gettempdir()`.
    tempfile.tempdir = None

print(tempfile.gettempdir())

In [None]:
# Hosts whose name contains "scinet" get a fixed worker count of 40; everywhere
# else use the number of CPUs this process may run on (but never less than one).
CPU_COUNT = 40 if "scinet" in socket.gethostname() else max(1, len(os.sched_getaffinity(0)))

CPU_COUNT

In [None]:
# Job configuration is passed in through environment variables.
DATASET_NAME = os.environ.get("DATASET_NAME")
TASK_ID = os.environ.get("SLURM_ARRAY_TASK_ID")
# ORIGINAL_ARRAY_TASK_COUNT, when set and non-empty, takes precedence over the
# count reported by SLURM.
TASK_COUNT = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get(
    "SLURM_ARRAY_TASK_COUNT"
)

# Environment values are strings; convert the ones that are present to integers.
if TASK_ID is not None:
    TASK_ID = int(TASK_ID)
if TASK_COUNT is not None:
    TASK_COUNT = int(TASK_COUNT)

DATASET_NAME, TASK_ID, TASK_COUNT

In [None]:
# Debug mode is enabled whenever no SLURM array task id was provided.
DEBUG = TASK_ID is None

if not DEBUG:
    # A real array job must supply the full configuration.
    assert DATASET_NAME is not None
    assert TASK_ID is not None
    assert TASK_COUNT is not None
else:
    # Fixed example configuration used for debugging runs.
    DATASET_NAME, TASK_ID, TASK_COUNT = "elaspic-training-set-core", 43, 162

DATASET_NAME, TASK_ID, TASK_COUNT

## Workspace

### Load data

In [None]:
# Parquet file for the selected dataset inside the `01_load_data` directory.
input_file = OUTPUT_DIR / "01_load_data" / f"{DATASET_NAME}.parquet"

input_file

In [None]:
# Open the input as a `ParquetFile` so that row groups can be read individually.
pfile = pq.ParquetFile(input_file)

# Show how many row groups the file contains.
pfile.num_row_groups

In [None]:
# Each array task processes exactly one row group, so the number of tasks must
# equal the number of row groups in the input file.
assert TASK_COUNT == pfile.num_row_groups

In [None]:
# Task ids start at 1 (see the `--array=1-N` submissions), row groups at 0.
INPUT_DF = pfile.read_row_group(TASK_ID - 1).to_pandas(integer_object_nulls=True)

In [None]:
# Preview the first two rows and report how many rows this task will process.
display(INPUT_DF.head(2))
print(len(INPUT_DF))

### Create tasks

In [None]:
# Results are written to a sub-directory of OUTPUT_DIR named after the notebook.
output_dir = OUTPUT_DIR / NOTEBOOK_DIR.name
output_dir.mkdir(exist_ok=True)

output_dir

In [None]:
# Build one task per unique, well-formed mutation of every row in INPUT_DF.
# Each task is a tuple: (RosettaDDG data, per-mutation metadata, protein sequence).
aa = "GVALICMFWPDESTYQNKRH"
# Expected mutation format: <residue><position not starting with 0><residue>,
# e.g. "A123G". Compiled once instead of on every iteration.
mutation_re = re.compile(f"^[{aa}][1-9]+[0-9]*[{aa}]$")

tasks = []
for row in tqdm(INPUT_DF.itertuples(), total=len(INPUT_DF)):

    # `RosettaDDG.build` is given a file path, so the structure text is first
    # written to a temporary PDB file.
    with tempfile.NamedTemporaryFile(suffix=".pdb") as tmp_file:
        with open(tmp_file.name, "wt") as fout:
            fout.write(row.protein_structure)
        data = RosettaDDG.build(
            tmp_file.name,
            protocol="cartesian_ddg",
            energy_function="beta_nov16_cart",
            interface=0,
        )

    # Mutations already queued for this row; duplicates are reported and skipped.
    _seen = set()
    for idx, mutation in enumerate(row.mutation):
        if mutation in _seen:
            print(
                f"Already added mutation '{mutation}' for protein ({row.unique_id}, {row.dataset}, {row.name})."
            )
            continue
        _seen.add(mutation)

        if mutation_re.search(mutation) is None:
            print(f"Skipping mutation {mutation} because it appears to be malformed.")
            # Actually skip the mutation, as the message above announces.
            continue

        data_mut = {"unique_id": row.unique_id, "effect_type": row.effect_type}
        # These columns hold one value per mutation; copy the value that
        # belongs to the current mutation, for the columns that are present.
        for column in ["mutation", "effect", "provean_score", "foldx_score", "elaspic_score"]:
            if column in row._fields:
                data_mut[column] = getattr(row, column)[idx]

        tasks.append((data, data_mut, row.protein_sequence))

len(tasks)

In [None]:
# Inspect the first task as a sanity check.
tasks[0]

### Wildtype to mutant

In [None]:
def worker_wt2mut(input):
    """Analyse a single mutation with Rosetta and merge the scores into its metadata.

    Args:
        input: A ``(data, data_mut, protein_sequence)`` task tuple. The third
            element is not used by this function.

    Returns:
        A new dict containing every entry of ``data_mut`` followed by every
        Rosetta result, with the result keys prefixed by ``rosetta_``.
    """
    data, data_mut, _ = input
    # The mutation is prefixed with "A_" (presumably the chain id; confirm
    # against the RosettaDDG plugin).
    rosetta_results = RosettaDDG.analyze_mutation(f"A_{data_mut['mutation']}", data)
    output = dict(data_mut)
    for key, value in rosetta_results.items():
        output[f"rosetta_{key}"] = value
    return output

In [None]:
# Distribute the tasks over CPU_COUNT worker processes. `pool.map` yields the
# results in task order, and tqdm reports progress as they arrive.
with concurrent.futures.ProcessPoolExecutor(max_workers=CPU_COUNT) as pool:
    results_iter = pool.map(worker_wt2mut, tasks)
    results = list(tqdm(results_iter, total=len(tasks)))

# One row per task.
results_wt2mut_df = pd.DataFrame(results)

In [None]:
# The file name encodes the dataset, the direction and the position of this
# task within the array job.
output_file_wt2mut = output_dir / f"{DATASET_NAME}-wt2mut-{TASK_ID}-{TASK_COUNT}.parquet"

output_file_wt2mut

In [None]:
# Save the wild-type → mutant results, leaving out the DataFrame index.
pq.write_table(pa.Table.from_pandas(results_wt2mut_df, preserve_index=False), output_file_wt2mut)

### Mutant to wildtype

In [None]:
def mutate_sequence(protein_sequence, mutation):
    """Return ``protein_sequence`` with the substitution ``mutation`` applied.

    Args:
        protein_sequence: Sequence to mutate, one character per residue.
        mutation: Substitution formatted as ``<wild-type><position><mutant>``
            (for example ``"A12G"``), where the position is 1-based.

    Returns:
        A new string in which the residue at the given position is replaced
        by the last character of ``mutation``. The wild-type character is not
        compared against the sequence.

    Raises:
        ValueError: If the position part of ``mutation`` is not an integer.
        IndexError: If the position lies outside of the sequence.
    """
    position = int(mutation[1:-1])
    # Reject positions below 1 explicitly: they would otherwise turn into a
    # negative list index and silently mutate a residue counted from the end.
    if not 1 <= position <= len(protein_sequence):
        raise IndexError(
            f"Position of mutation '{mutation}' is outside of the sequence "
            f"(length {len(protein_sequence)})."
        )
    amino_acids = list(protein_sequence)
    amino_acids[position - 1] = mutation[-1]
    return "".join(amino_acids)


def create_model(structure_file, protein_sequence, mutation):
    """Model the mutant structure and save it next to the input structure.

    Args:
        structure_file: ``Path`` of the structure to start from.
        protein_sequence: Sequence before the mutation is applied.
        mutation: Substitution such as ``"A12G"``, passed to ``mutate_sequence``.

    Returns:
        ``Path`` of the saved model, named ``<stem>-<mutation>.pdb`` and located
        in the same directory as ``structure_file``.
    """
    modeller_data = Modeller.build(structure_file.as_posix())
    target = DomainTarget(
        0,
        "A",
        protein_sequence,
        None,
        None,
        mutate_sequence(protein_sequence, mutation),
    )
    # Only the modelled structure is needed; the second return value is ignored.
    model, _ = Modeller.create_model([target], modeller_data)
    model_file = structure_file.parent / f"{structure_file.stem}-{mutation}.pdb"
    PDB.save(model, model_file)
    return model_file


def worker_mut2wt(input):
    """Analyse the reverse (mutant → wild-type) substitution with Rosetta.

    A model of the mutant is created first and used as the starting structure.
    The reversed mutation is then analysed by ``worker_wt2mut``.

    Args:
        input: A ``(data, data_mut, protein_sequence)`` task tuple. Neither
            ``data`` nor ``data_mut`` is modified.

    Returns:
        The dict returned by ``worker_wt2mut`` for the reversed mutation, in
        which ``"mutation"`` is reversed and every int/float metadata value
        has its sign flipped.
    """
    data, data_mut, protein_sequence = input
    mutation = data_mut["mutation"]
    mutation_rev = mutation[-1] + mutation[1:-1] + mutation[0]
    # Mutate model
    structure_file_mut = create_model(Path(data.structure_file), protein_sequence, mutation)
    data = data._replace(structure_file=structure_file_mut.as_posix())
    # Build the reversed metadata in a new dict instead of editing `data_mut`
    # in place, so the caller's task stays intact when this function is called
    # directly (i.e. without a process pool pickling the arguments).
    # NOTE(review): every int/float value is negated, which includes
    # `unique_id` if it is numeric; confirm that this is intended.
    data_mut_rev = {
        key: -value if isinstance(value, (int, float)) else value
        for key, value in data_mut.items()
    }
    data_mut_rev["mutation"] = mutation_rev
    return worker_wt2mut((data, data_mut_rev, protein_sequence))

In [None]:
# Distribute the reverse-direction tasks over CPU_COUNT worker processes.
# `pool.map` yields the results in task order.
with concurrent.futures.ProcessPoolExecutor(max_workers=CPU_COUNT) as pool:
    results_iter = pool.map(worker_mut2wt, tasks)
    results = list(tqdm(results_iter, total=len(tasks)))

# One row per task.
results_mut2wt_df = pd.DataFrame(results)

In [None]:
# The file name encodes the dataset, the direction and the position of this
# task within the array job.
output_file_mut2wt = output_dir / f"{DATASET_NAME}-mut2wt-{TASK_ID}-{TASK_COUNT}.parquet"

output_file_mut2wt

In [None]:
# Save the mutant → wild-type results, leaving out the DataFrame index.
pq.write_table(pa.Table.from_pandas(results_mut2wt_df, preserve_index=False), output_file_mut2wt)