## Summary

Calculate features using [Rosetta's `cartesian_ddg` protocol](https://www.rosettacommons.org/docs/latest/cartesian-ddG).

### Executing

```bash
# Submit the Rosetta ddG notebook as a SLURM array job, one submission per dataset.
# DATASET_NAME and NOTEBOOK_PATH are set inline in the environment of each sbatch call.
# NOTE(review): each --array range presumably matches the number of row groups
# in the dataset's input parquet file — confirm before resubmitting.
DATASET_NAME="elaspic-training-set-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-162 ../scripts/run_notebook_cpu.sh

DATASET_NAME="protherm-dagger-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-2 ../scripts/run_notebook_cpu.sh

DATASET_NAME="rocklin-2017-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-1 ../scripts/run_notebook_cpu.sh

DATASET_NAME="elaspic-training-set-interface" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-26 ../scripts/run_notebook_cpu.sh


# Submit only array index 6 for dunham-2020-core; the full array size (12) is
# exported separately as ORIGINAL_ARRAY_TASK_COUNT.
# NOTE(review): how run_notebook_cpu.sh consumes ORIGINAL_ARRAY_TASK_COUNT is not
# visible here — verify against the script.
export NOTEBOOK_PATH="$(realpath 02_run_proteinsolver.ipynb)"
export DATASET_NAME="dunham-2020-core"
export ORIGINAL_ARRAY_TASK_COUNT=12
sbatch --array=6 --time=24:00:00 --ntasks-per-node=40 --export=NOTEBOOK_PATH,DATASET_NAME,ORIGINAL_ARRAY_TASK_COUNT ../scripts/run_notebook_cpu.sh


# Same pattern for the ProtBert notebook: submit only array indices 2 and 5,
# with the full array size (8) exported as ORIGINAL_ARRAY_TASK_COUNT.
export NOTEBOOK_PATH="$(realpath 02_run_protbert.ipynb)"
export DATASET_NAME="dunham-2020-interface"
export ORIGINAL_ARRAY_TASK_COUNT=8
sbatch --array=2,5 --time=24:00:00 --ntasks-per-node=40 --export=NOTEBOOK_PATH,DATASET_NAME,ORIGINAL_ARRAY_TASK_COUNT ../scripts/run_notebook_cpu.sh






# Full submissions of the ProtBert notebook: NOTEBOOK_PATH is exported once, then
# DATASET_NAME is re-exported before each sbatch call so that
# --export=NOTEBOOK_PATH,DATASET_NAME forwards the current values to the job.
export NOTEBOOK_PATH="$(realpath 02_run_protbert.ipynb)"

export DATASET_NAME="starr-2020-interface"
sbatch --array=1-1 --time=72:00:00 --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh


export DATASET_NAME="dunham-2020-interface"
sbatch --array=1-8 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

# NOTE(review): skempi-v2-interface is submitted twice below with different
# array ranges (1-3 and 1-26) — confirm which range is the intended one.
export DATASET_NAME="skempi-v2-interface"
sbatch --array=1-3 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="skempi-v2-interface"
sbatch --array=1-26 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="huang-2020-core"
sbatch --array=1-1 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="cagi5-frataxin-core"
sbatch --array=1-1 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="starr-2020-core"
sbatch --array=1-1 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="dunham-2020-core"
sbatch --array=1-12 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="rocklin-2017-core"
sbatch --array=1-1 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="protherm-dagger-core"
sbatch --array=1-1 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

export DATASET_NAME="elaspic-training-set-core"
sbatch --array=1-162 --time=72:00:00 --ntasks-per-node=40 --mem=90G --export=NOTEBOOK_PATH,DATASET_NAME ../scripts/run_notebook_cpu.sh

```

---

## Imports

In [None]:
import os
from pathlib import Path
import tempfile
import socket
import string
import pyarrow as pa
import pyarrow.parquet as pq

## Parameters

In [None]:
# Absolute path named after the notebook whose outputs are being checked.
# Its name selects the per-notebook output folder, and its stem selects the
# output-file naming scheme used when looking for missing results.
# To check a different notebook, activate the matching line below instead.
NOTEBOOK_DIR = Path("02_run_rosetta_ddg").resolve()
# NOTEBOOK_DIR = Path("02_run_proteinsolver").resolve()
# NOTEBOOK_DIR = Path("02_run_protbert").resolve()

# Bare expression: shows the resolved path as the cell output.
NOTEBOOK_DIR

In [None]:
# Root folder for all outputs: "<DATAPKG_OUTPUT_DIR>/elaspic2" when that
# environment variable is set, otherwise the folder containing NOTEBOOK_DIR.
OUTPUT_DIR = (
    Path(os.environ["DATAPKG_OUTPUT_DIR"], "elaspic2").resolve()
    if "DATAPKG_OUTPUT_DIR" in os.environ
    else NOTEBOOK_DIR.parent
)
# Only the last path component is created; missing parents still raise.
OUTPUT_DIR.mkdir(exist_ok=True)

# Bare expression: shows the chosen folder as the cell output.
OUTPUT_DIR

In [None]:
# Names of the datasets whose results are checked further down.
# Each name is used to locate the input file "01_load_data/<name>.parquet"
# and to build the expected output file names.
datasets = [
    # === Core ===
    "elaspic-training-set-core",
    "protherm-dagger-core",
    "rocklin-2017-core",
    "dunham-2020-core",
    "starr-2020-core",
    "cagi5-frataxin-core",
    "huang-2020-core",
    # === Interface ===
    "elaspic-training-set-interface",
    "skempi-v2-interface",
    # Currently excluded from the check:
    # "intact-mutations-interface",
    "dunham-2020-interface",
    "starr-2020-interface",
]

## Workspace

In [None]:
# Per-notebook results folder: <OUTPUT_DIR>/<name of NOTEBOOK_DIR>.
output_dir = OUTPUT_DIR / NOTEBOOK_DIR.name
output_dir.mkdir(exist_ok=True)

# Bare expression: shows the folder as the cell output.
output_dir

In [None]:
def check_is_calculated(output_files):
    """Check whether every expected output file exists.

    Args:
        output_files: Iterable of path-like objects exposing ``is_file()``.

    Returns:
        ``(True, None)`` when all files exist (including when the iterable
        is empty); otherwise ``(False, idxs)`` where ``idxs`` lists the
        zero-based positions of the files that are missing.
    """
    absent_idxs = [
        position for position, path in enumerate(output_files) if not path.is_file()
    ]
    if not absent_idxs:
        return True, None
    return False, absent_idxs

In [None]:
# Number of lettered sub-task files ("a", "b", ...) a single task may be split into.
SUBTASK_COUNT = 20

# For every dataset, report which array tasks (and which of their sub-tasks)
# do not yet have output files in `output_dir`.
for dataset_name in datasets:
    input_file = OUTPUT_DIR.joinpath("01_load_data", f"{dataset_name}.parquet")
    pfile = pq.ParquetFile(input_file)
    # Task ids run from 1 to the number of row groups of the input file.
    task_count = pfile.num_row_groups

    # Output files of the Rosetta notebook carry an extra "-wt2mut" infix;
    # every other notebook uses the bare dataset name.
    if NOTEBOOK_DIR.stem in ["02_run_rosetta_ddg"]:
        file_prefix = f"{dataset_name}-wt2mut"
    else:
        file_prefix = dataset_name

    missing = []
    for task_id in range(1, task_count + 1):
        # A task is complete when its single combined output file exists ...
        # (check_is_calculated expects a list, so the lone path is wrapped).
        output_files = [output_dir.joinpath(f"{file_prefix}-{task_id}-{task_count}.parquet")]
        is_calculated, _ = check_is_calculated(output_files)
        if is_calculated:
            continue

        # ... or when every one of its lettered sub-task files exists.
        output_files = [
            output_dir.joinpath(
                f"{file_prefix}-{task_id}{string.ascii_letters[subtask_idx]}-{task_count}.parquet"
            )
            for subtask_idx in range(SUBTASK_COUNT)
        ]
        is_calculated, missing_subtask_idxs = check_is_calculated(output_files)
        if not is_calculated:
            missing.append((task_id, missing_subtask_idxs))

        # Disabled check of the reverse ("mut2wt") Rosetta outputs, kept for reference:
        #     output_file_mut2wt = output_dir.joinpath(f"{dataset_name}-mut2wt-{task_id}-{task_count}.parquet")
        #     if not output_file_mut2wt.is_file():
        #         missing.append(task_id)

    if missing:
        # Header line: dataset name and its total task count, followed by one
        # "task_id: i,j,k" line per incomplete task (zero-based sub-task indices).
        print(dataset_name, f"({task_count})")
        for task_id, missing_subtask_idxs in missing:
            print(f"{task_id}:", ",".join(str(i) for i in missing_subtask_idxs))
        print()

### graham

```bash
for subtask_id in {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19} ; do
    echo $subtask_id ;
    sbatch --array=1-1 --time=72:00:00 --job-name=skempi-v2-interface --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="skempi-v2-interface",ORIGINAL_ARRAY_TASK_COUNT=3,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done

for subtask_id in {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19} ; do
    echo $subtask_id ;
    sbatch --array=2-2 --time=72:00:00 --job-name=skempi-v2-interface --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="skempi-v2-interface",ORIGINAL_ARRAY_TASK_COUNT=3,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done
```


### cedar

#### dunham-2020-interface

```bash
# Each loop submits one single-index array job per listed subtask id, passing
# SUBTASK_ID / SUBTASK_COUNT=20 and the full array size (8) through --export.
# NOTE(review): the subtask ids presumably are the zero-based indices reported
# as missing by the check in this notebook — confirm before resubmitting.

# Array task 2 of 8.
for subtask_id in {1,3,4,5,6,9,10,11,12,15,16,17,18} ; do
    echo $subtask_id ;
    sbatch --array=2-2 --time=72:00:00 --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="dunham-2020-interface",ORIGINAL_ARRAY_TASK_COUNT=8,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done

# Array task 3 of 8.
for subtask_id in {15,16,17,18,19} ; do
    echo $subtask_id ;
    sbatch --array=3-3 --time=72:00:00 --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="dunham-2020-interface",ORIGINAL_ARRAY_TASK_COUNT=8,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done

# Array task 5 of 8.
for subtask_id in {0,1,3,4,5,6,7,8,9,10,11,12,13,15,16,17} ; do
    echo $subtask_id ;
    sbatch --array=5-5 --time=72:00:00 --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="dunham-2020-interface",ORIGINAL_ARRAY_TASK_COUNT=8,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done

# Array task 7 of 8.
for subtask_id in {17,18,19} ; do
    echo $subtask_id ;
    sbatch --array=7-7 --time=72:00:00 --export=NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)",DATASET_NAME="dunham-2020-interface",ORIGINAL_ARRAY_TASK_COUNT=8,SUBTASK_ID=${subtask_id},SUBTASK_COUNT=20 ../scripts/run_notebook_cpu.sh
done

```
