## Summary

---

## Imports

In [1]:
import concurrent.futures
import itertools
import os
import socket
import tempfile
import urllib.request
from datetime import datetime
from pathlib import Path

import dotenv
import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
import synapseclient
import synapseutils
from elaspic2.plugins.rosetta_ddg import RosettaDDG
from kmbio import PDB
from kmtools import structure_tools
from tqdm.auto import tqdm



## Parameters

In [2]:
# Output directory for this notebook, resolved to an absolute path relative to
# the current working directory.
NOTEBOOK_DIR = Path("31_cagi6_hmbs_rosetta").resolve()
# exist_ok=True makes the call a no-op when the directory is already present,
# so re-running the cell does not fail.
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/home/kimlab5/strokach/workspace/elaspic/elaspic2-cagi6/notebooks/31_cagi6_hmbs_rosetta')

In [3]:
# UniProt accession used below to locate the "<UNIPROT_ID>.fasta" and
# "<UNIPROT_ID>.pdb" input files.
UNIPROT_ID = "P08397"

UNIPROT_ID

'P08397'

In [4]:
# Number of worker threads used for the Rosetta runs further down.
if "scinet" in socket.gethostname():
    # Hosts whose name contains "scinet" use a fixed worker count.
    CPU_COUNT = 40
elif hasattr(os, "sched_getaffinity"):
    # Count only the CPUs this process is allowed to run on, which respects
    # affinity masks set by a scheduler or container.
    CPU_COUNT = max(1, len(os.sched_getaffinity(0)))
else:
    # os.sched_getaffinity is not available on every platform (e.g. macOS,
    # Windows); fall back to the machine-wide count. os.cpu_count() may
    # return None, hence the `or 1`.
    CPU_COUNT = max(1, os.cpu_count() or 1)

# CPU_COUNT = max(1, CPU_COUNT // 2)

CPU_COUNT

32

In [5]:
# Raw (string or missing) values of the job-array environment variables.
# ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM_ARRAY_TASK_COUNT when
# it is set to a non-empty value.
_task_id_raw = os.environ.get("SLURM_ARRAY_TASK_ID")
_task_count_raw = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get(
    "SLURM_ARRAY_TASK_COUNT"
)

# Convert to integers, keeping None when the variable is absent.
TASK_ID = None if _task_id_raw is None else int(_task_id_raw)
TASK_COUNT = None if _task_count_raw is None else int(_task_count_raw)

TASK_ID, TASK_COUNT

(None, None)

In [6]:
# Without a job-array task id the notebook is treated as a manual (debug) run.
DEBUG = TASK_ID is None

if DEBUG:
    # Manual run: behave like task 1 of a 20-task array.
    TASK_ID = 1
    TASK_COUNT = 20
else:
    assert TASK_ID is not None
    assert TASK_COUNT is not None, (
        "SLURM_ARRAY_TASK_ID is set but neither ORIGINAL_ARRAY_TASK_COUNT nor "
        "SLURM_ARRAY_TASK_COUNT is defined"
    )

# The chunk index is later derived as TASK_ID - 1, so task ids must be 1-based.
# An id of 0 (or one above TASK_COUNT) would otherwise select an empty slice
# and silently produce an empty results file.
assert 1 <= TASK_ID <= TASK_COUNT, (
    f"TASK_ID must be in 1..{TASK_COUNT} (1-based job array), got {TASK_ID}"
)

TASK_ID, TASK_COUNT

(1, 20)

## Load data

In [7]:
def load_sequence(sequence_file):
    """Read a FASTA-style file and return its sequence as one string.

    Lines starting with ">" are skipped; every other line is stripped of
    surrounding whitespace and the pieces are concatenated in file order.
    All non-header lines are joined, so a file with several records yields
    one combined string.

    Args:
        sequence_file: Object exposing ``open("rt")`` (e.g. ``pathlib.Path``).

    Returns:
        The concatenated sequence string.
    """
    with sequence_file.open("rt") as handle:
        residue_lines = [
            row.strip() for row in handle.read().split("\n") if not row.startswith(">")
        ]
    return "".join(residue_lines)

In [8]:
# The FASTA file lives in the sibling "30_cagi6_hmbs" notebook directory.
sequence_file = NOTEBOOK_DIR.parent / "30_cagi6_hmbs" / f"{UNIPROT_ID}.fasta"

# Fail early if the input file is missing.
assert sequence_file.is_file()

In [9]:
# Load the protein sequence as a single string.
sequence = load_sequence(sequence_file)

# Preview the first five characters as a quick sanity check.
sequence[:5]

'MSGNG'

In [10]:
# The PDB structure lives in the sibling "30_cagi6_hmbs" notebook directory.
structure_file = NOTEBOOK_DIR.parent / "30_cagi6_hmbs" / f"{UNIPROT_ID}.pdb"

# Fail early if the input file is missing.
assert structure_file.is_file()

In [11]:
# Table of substitutions to score, read from the sibling "30_cagi6_hmbs"
# notebook directory.
results_to_fill_file = NOTEBOOK_DIR.parent / "30_cagi6_hmbs" / "results-to-fill.parquet"

# Read with pyarrow, then convert to a pandas DataFrame.
results_to_fill_table = pq.read_table(results_to_fill_file)
results_to_fill_df = results_to_fill_table.to_pandas()

In [12]:
# Show the first two rows and the total row count.
display(results_to_fill_df.iloc[:2])
print(results_to_fill_df.shape[0])

Unnamed: 0,aa_substitution,score,sd,comments,mut
0,p.Ala112Arg,*,*,*,A112R
1,p.Ala112Asn,*,*,*,A112N


6239


## Run Rosetta

### Construct chunk

In [13]:
# Convert the 1-based TASK_ID into a 0-based chunk index.
chunk_idx = TASK_ID - 1

chunk_idx

0

In [14]:
# Rows per task: ceiling division, so TASK_COUNT chunks always cover every row
# and the work is spread as evenly as possible. The previous `// TASK_COUNT + 1`
# form overshot by one when the row count was an exact multiple of TASK_COUNT,
# leaving the last tasks with empty chunks. -(-a // b) is integer ceil(a / b).
chunk_size = -(-len(results_to_fill_df) // TASK_COUNT)

chunk_size

312

In [15]:
# Half-open row range [start, stop) handled by this task.
start = chunk_size * chunk_idx
stop = chunk_size + start

# Positional slice; a stop beyond the last row is clipped by iloc.
result_df = results_to_fill_df.iloc[slice(start, stop)]

In [16]:
# Show the first two rows of this task's chunk and its row count.
display(result_df.iloc[:2])
print(result_df.shape[0])

Unnamed: 0,aa_substitution,score,sd,comments,mut
0,p.Ala112Arg,*,*,*,A112R
1,p.Ala112Asn,*,*,*,A112N


312


### Run Rosetta

In [17]:
# Options passed to RosettaDDG.build for the structure.
# NOTE(review): interface=0 presumably selects the non-interface (single
# chain) mode; confirm against the elaspic2 RosettaDDG plugin.
rosetta_build_options = {
    "protocol": "cartesian_ddg",
    "energy_function": "beta_nov16_cart",
    "interface": 0,
}

# Built once; the same object is passed to every mutation analysis below.
rosetta_ddg_data = RosettaDDG.build(structure_file, **rosetta_build_options)

In [18]:
def rosetta_ddg_worker(mut, data):
    """Analyze one mutation with RosettaDDG and return a flat result record.

    Args:
        mut: Mutation string (e.g. "A112R"). It is passed to
            ``RosettaDDG.analyze_mutation`` with an "A_" prefix; presumably
            that prefix is the chain id, TODO confirm.
        data: Object returned by ``RosettaDDG.build``.

    Returns:
        A dict holding "mut" first, followed by every key of the analysis
        result prefixed with "rosetta_".
    """
    raw_scores = RosettaDDG.analyze_mutation(f"A_{mut}", data)
    record = {"mut": mut}
    record.update((f"rosetta_{name}", score) for name, score in raw_scores.items())
    return record

In [19]:
# Per-task cache file: an existing file means this chunk was already computed.
rosetta_results_file = NOTEBOOK_DIR.joinpath(f"results-rosetta-{TASK_ID}-of-{TASK_COUNT}.parquet")

if rosetta_results_file.is_file():
    rosetta_results_df = pq.read_table(rosetta_results_file).to_pandas()
else:
    # Run one worker call per mutation; pool.map yields results in input order.
    with concurrent.futures.ThreadPoolExecutor(CPU_COUNT) as pool:
        rosetta_results = list(
            tqdm(
                pool.map(
                    rosetta_ddg_worker,
                    result_df["mut"].values.tolist(),
                    itertools.repeat(rosetta_ddg_data),
                ),
                total=len(result_df),
            )
        )
    rosetta_results_df = pd.DataFrame(rosetta_results)

    # Write to a temporary sibling file and rename it into place. The cache
    # check above trusts any existing file, so a write interrupted halfway
    # (kill, time limit, full disk) must never leave a truncated parquet under
    # the final name.
    partial_results_file = rosetta_results_file.with_name(rosetta_results_file.name + ".tmp")
    try:
        pq.write_table(
            pa.Table.from_pandas(rosetta_results_df, preserve_index=False), partial_results_file
        )
        # Same-directory rename, so the final file appears all at once.
        partial_results_file.replace(rosetta_results_file)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete temporary file.
        partial_results_file.unlink(missing_ok=True)

  0%|          | 0/312 [00:00<?, ?it/s]

KeyboardInterrupt: 