## Summary

Calculate features using [Rosetta's `cartesian_ddg` protocol](https://www.rosettacommons.org/docs/latest/cartesian-ddG).

### Executing

```bash
DATASET_NAME="elaspic-training-set-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-162 ../scripts/run_notebook_cpu.sh

DATASET_NAME="protherm-dagger-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-2 ../scripts/run_notebook_cpu.sh

DATASET_NAME="rocklin-2017-core" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-1 ../scripts/run_notebook_cpu.sh

DATASET_NAME="elaspic-training-set-interface" NOTEBOOK_PATH="$(realpath 02_run_rosetta_ddg.ipynb)" sbatch --array=1-26 ../scripts/run_notebook_cpu.sh
```


---

## Imports

In [None]:
import os
from pathlib import Path
import tempfile
import socket
import pyarrow as pa
import pyarrow.parquet as pq

## Parameters

In [None]:
# Directory of the step being inspected. `strict=True` makes `resolve()` raise
# FileNotFoundError right here if the directory does not exist.
NOTEBOOK_DIR = Path("02_run_proteinsolver").resolve(strict=True)

# Alternative step directories, kept commented out so that one can be swapped in:
# NOTEBOOK_DIR = Path("02_run_rosetta_ddg").resolve(strict=True)
# NOTEBOOK_DIR = Path("02_run_protbert").resolve(strict=True)

NOTEBOOK_DIR

In [None]:
# Root output directory: "<DATAPKG_OUTPUT_DIR>/elaspic-v2" when that environment
# variable is set, otherwise the directory that contains NOTEBOOK_DIR.
OUTPUT_DIR = (
    Path(os.environ["DATAPKG_OUTPUT_DIR"], "elaspic-v2").resolve()
    if "DATAPKG_OUTPUT_DIR" in os.environ
    else NOTEBOOK_DIR.parent
)
# Only the last path component is created; a missing parent still raises.
OUTPUT_DIR.mkdir(exist_ok=True)

OUTPUT_DIR

In [None]:
# Maps each dataset name to its number of tasks. The check loop later in this
# notebook asserts that this count equals the number of row groups in the
# dataset's "01_load_data/<name>.parquet" file, and the counts mirror the
# `--array=1-N` ranges of the sbatch commands listed in the summary.
datasets = {
    "elaspic-training-set-core": 162,
    "protherm-dagger-core": 2,
    "rocklin-2017-core": 1,
    "elaspic-training-set-interface": 26,
}

## Workspace

In [None]:
# Per-step output folder: a subdirectory of OUTPUT_DIR named after the
# selected notebook directory. Created if it does not exist yet.
output_dir = OUTPUT_DIR / NOTEBOOK_DIR.name
output_dir.mkdir(exist_ok=True)

output_dir

In [None]:

# For every dataset, print the 1-based task IDs whose output file(s) are missing
# from `output_dir`, as a comma-separated list (presumably so that only those
# array tasks can be resubmitted -- confirm against the sbatch workflow).
#
# Steps listed here write two files per task ("wt2mut" and "mut2wt"); every
# other step writes a single file per task. The directory name must match the
# spelling used for NOTEBOOK_DIR ("02_run_rosetta_ddg"); the previous value
# "02_rosetta_ddg" could never match, so the two-file check was unreachable.
two_file_steps = {"02_run_rosetta_ddg"}
if NOTEBOOK_DIR.name in two_file_steps:
    name_infixes = ("-wt2mut", "-mut2wt")
else:
    name_infixes = ("",)

for dataset_name, task_count in datasets.items():
    input_file = OUTPUT_DIR.joinpath("01_load_data", f"{dataset_name}.parquet")
    pfile = pq.ParquetFile(input_file)
    # Sanity check: one task per row group of the input file.
    assert task_count == pfile.num_row_groups, (task_count, pfile.num_row_groups)

    # A task counts as missing as soon as any one of its expected files is
    # absent; `all` short-circuits, so later files are not probed needlessly.
    missing = [
        task_id
        for task_id in range(1, task_count + 1)
        if not all(
            output_dir.joinpath(
                f"{dataset_name}{infix}-{task_id}-{task_count}.parquet"
            ).is_file()
            for infix in name_infixes
        )
    ]
    print(f'{dataset_name}: {",".join(str(i) for i in missing)}.')