# Summary

Calculate CCMpred mutation scores.

## Job submission

### Humsavar

```bash
export NOTEBOOK_PATH="$(realpath 31_run_ccmpred.ipynb)"
export DATASET_NAME="humsavar"
export DATASET_PATH="30_humsavar/humsavar-gby-protein-waln.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=12557

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --array=1-1 --time 24:00:00 --ntasks-per-node=4 --mem=12G ../scripts/run_notebook_cpu.sh
```

### CAGI6 sherloc

```bash
export NOTEBOOK_PATH="$(realpath 31_run_ccmpred.ipynb)"
export DATASET_NAME="cagi6-sherloc"
export DATASET_PATH="30_cagi6_sherloc/input-data-gby-protein.parquet"
export ORIGINAL_ARRAY_TASK_COUNT=4182

sbatch --export DATASET_NAME,DATASET_PATH,NOTEBOOK_PATH,ORIGINAL_ARRAY_TASK_COUNT --array=1-1 --time 24:00:00 --ntasks-per-node=4 --mem=12G ../scripts/run_notebook_cpu.sh
```


In [1]:
import concurrent.futures
import itertools
import math
import os
import re
import socket
import subprocess
import sys
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm.notebook import tqdm

#### Parameters

In [2]:
# Directory holding this notebook's artifacts, resolved against the current
# working directory. `run_ccmpred` also looks for the CCMpred binary below it.
NOTEBOOK_DIR = Path("31_run_ccmpred").resolve()
# Create the directory on first use; an existing directory is left untouched.
NOTEBOOK_DIR.mkdir(exist_ok=True)

# Display the resolved path.
NOTEBOOK_DIR

PosixPath('/lustre07/scratch/strokach/workspace/elaspic2-cagi6/notebooks/31_run_ccmpred')

In [3]:
# Route temporary files to the job-local directory provided by SLURM, if any.
if (slurm_tmpdir := os.getenv("SLURM_TMPDIR")) is not None:
    os.environ["TMPDIR"] = slurm_tmpdir
    # `tempfile` caches its directory after the first `gettempdir()` call, so a
    # library imported earlier may already have pinned it. Clearing the cache
    # forces the next lookup to honour the TMPDIR value set above.
    tempfile.tempdir = None

print(tempfile.gettempdir())

/tmp


In [4]:
# Hosts whose name contains "scinet" are treated as having 40 CPUs; everywhere
# else, count the CPUs this process is allowed to run on.
CPU_COUNT = (
    40 if "scinet" in socket.gethostname() else max(1, len(os.sched_getaffinity(0)))
)

# Use half of them, but never fewer than one.
# NOTE(review): presumably halved to target physical cores - confirm.
CPU_COUNT = max(1, CPU_COUNT // 2)

CPU_COUNT

2

In [5]:
# Job parameters arrive through environment variables (see "Job submission").
DATASET_NAME = os.environ.get("DATASET_NAME")
DATASET_PATH = os.environ.get("DATASET_PATH")
TASK_ID = os.environ.get("SLURM_ARRAY_TASK_ID")
# A non-empty ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM's own count.
TASK_COUNT = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get(
    "SLURM_ARRAY_TASK_COUNT"
)

# Environment values are strings; convert the ones that are set to integers.
if TASK_ID is not None:
    TASK_ID = int(TASK_ID)
if TASK_COUNT is not None:
    TASK_COUNT = int(TASK_COUNT)

DATASET_NAME, DATASET_PATH, TASK_ID, TASK_COUNT

(None, None, None, None)

In [6]:
# Without a SLURM array task id the notebook runs interactively on a fixed example.
DEBUG = TASK_ID is None

if not DEBUG:
    # Batch job: every parameter must have been supplied through the environment.
    assert DATASET_NAME is not None
    assert DATASET_PATH is not None
    DATASET_PATH = Path(DATASET_PATH).expanduser().resolve()
    assert TASK_COUNT is not None
else:
    # Interactive defaults: first task of the humsavar dataset.
    DATASET_NAME = "humsavar"
    DATASET_PATH = str(
        NOTEBOOK_DIR.parent / "30_humsavar" / "humsavar-gby-protein-waln.parquet"
    )
    TASK_ID = 1
    TASK_COUNT = 12557  # 4182

DATASET_NAME, DATASET_PATH, TASK_ID, TASK_COUNT

('humsavar',
 '/lustre07/scratch/strokach/workspace/elaspic2-cagi6/notebooks/30_humsavar/humsavar-gby-protein-waln.parquet',
 1,
 12557)

In [7]:
# One result file per array task:
#   <NOTEBOOK_DIR>/<DATASET_NAME>/result-<TASK_ID>-of-<TASK_COUNT>.parquet
output_file = NOTEBOOK_DIR / DATASET_NAME / f"result-{TASK_ID}-of-{TASK_COUNT}.parquet"
# Make sure the per-dataset folder exists (a no-op when it is already there).
output_file.parent.mkdir(exist_ok=True)

output_file

PosixPath('/lustre07/scratch/strokach/workspace/elaspic2-cagi6/notebooks/31_run_ccmpred/humsavar/result-1-of-12557.parquet')

In [8]:
# Stop the run when this task's result file already exists.
# NOTE(review): presumably so re-submitted array jobs skip finished tasks - confirm.
if output_file.is_file():
    raise Exception("Already finished!")

#### Load Data

In [9]:
# Open the input dataset; row groups are read from it further below.
pfile = pq.ParquetFile(DATASET_PATH)

# Number of row groups available to be split across array tasks.
pfile.num_row_groups

12557

In [10]:
# Number of row groups assigned to each task, rounded up so that TASK_COUNT
# tasks together cover every row group in the file.
rows_per_chunk = np.ceil(pfile.num_row_groups / TASK_COUNT).astype(int)

rows_per_chunk

1

In [11]:
# Half-open range [start, stop) of row-group indices handled by this task
# (TASK_ID is 1-based, row-group indices are 0-based).
start = (TASK_ID - 1) * rows_per_chunk
# Valid indices run from 0 to num_row_groups - 1, so the exclusive upper bound
# is capped at num_row_groups (not num_row_groups + 1).
stop = min(pfile.num_row_groups, TASK_ID * rows_per_chunk)

start, stop

(0, 1)

In [12]:
# Read every row group assigned to this task, i.e. all indices in [start, stop),
# not only the first one. The upper bound is clamped to the number of row groups
# so an over-sized `stop` can never address a row group that does not exist.
row_group_indices = list(range(start, min(stop, pfile.num_row_groups)))
input_df = pfile.read_row_groups(row_group_indices).to_pandas()

In [13]:
# Preview only the first rows; the structure and alignment columns are large.
input_df.head()

Unnamed: 0,protein_id,mutation,effect,sequence,structure,alignment
0,A0A0C5B5G6,[K14Q],[US],MRWQEMGYIFYPRKLR,HEADER ...,"[>101\n, MRWQEMGYIFYPRKLR\n, >UniRef100_A0A0C5..."


#### Calculate CCMpred scores

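When run with the `-r` option, CCMpred writes a raw output file that contains one $L \times 20$ matrix (one row per alignment column, one value per amino acid), followed by $\binom{L}{2}$ matrices of shape $21 \times 21$ (20 amino acids plus the gap character), one for every pair of alignment columns. Each $21 \times 21$ matrix holds the values for all pairs of amino acids at the two columns.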

CCMpred learns a generative Markov Random Field model using vertices with single-residue emission potentials $\varepsilon_i(a)$ and edges with pairwise emission potentials $\varepsilon_{i,j}(a,b)$

arr1 = $\varepsilon_i(a)$ where $i$ = MSA column index and $a$ = amino acid index <br>
arr2 = $\varepsilon_{i,j}(a,b)$ where $i,j$ = MSA column indices and $a,b$ = amino acid indices 

##### Find amino acid index positions of CCMpred

In [14]:
# # CCMpred run to see which indices correspond to which amino acid
# testFile = str(Path(aln_path).parent.joinpath('AAtest.aln'))
# outFile = str(Path(aln_path).parent.joinpath('AAtest.mat'))
# rawFile = str(Path(aln_path).parent.joinpath('AAtest.raw'))

# aa_list = 'ARNDCEQGHILKMFPSTWYV'

# with open(testFile,'w') as f:
#     f.write(aa_list)

# bashCommand = [ccmpred_path,'-r',rawFile,testFile,outFile]
# process = subprocess.run(bashCommand,capture_output=True)

# arr1, _, _ = parse_raw_prob(rawFile)

# aa_true_idx = [np.argmax(i) for i in arr1]
# aa_true_list = ''.join([aa_list[i] for i in aa_true_idx])

# # add gap character to front
# aa_true_list = '-' + aa_true_list

# aa_true_list

##### Average log probabilities of all pair-wise mutations

In [15]:
# Find the column that identifies the protein. When both candidates are present,
# the one listed last ("uniprot_id") is used.
matching_columns = [name for name in ["protein_id", "uniprot_id"] if name in input_df]
protein_id_column = matching_columns[-1] if matching_columns else None

assert protein_id_column is not None
protein_id_column

'protein_id'

In [16]:
# Use the first row to work out which columns hold one entry per mutation.
tup = next(input_df.itertuples(index=False))

# Columns describing the protein as a whole. They must never be treated as
# per-mutation columns, even when their length happens to equal the number of
# mutations (e.g. a two-line alignment for a protein with two mutations).
per_protein_fields = {protein_id_column, "sequence", "structure", "alignment"}

iterable_fields = []
for field in tup._fields:
    if field in per_protein_fields:
        continue
    value = getattr(tup, field)
    # A string has a length but is a single value, not a list of entries.
    if isinstance(value, (str, bytes)):
        continue
    try:
        if len(value) == len(tup.mutation):
            iterable_fields.append(field)
    except TypeError:
        # Values without a length are per-protein scalars.
        pass

iterable_fields

['mutation', 'effect']

In [17]:
# Run CCMpred on alignment
# Writes the alignment, output, and raw files to the temp folder.
# Warning: the file names depend only on DATASET_NAME, TASK_ID and TASK_COUNT, so
# every call made by the same task reuses (and overwrites) the same three files.
# Outputs left behind by an earlier call are deleted before CCMpred is started.


def run_ccmpred(alignment, NOTEBOOK_DIR, DATASET_NAME, TASK_ID, TASK_COUNT):
    """Run the CCMpred binary on an alignment and return the path of its raw output.

    Args:
        alignment: Iterable of alignment lines. Empty lines and header lines
            starting with ">" are skipped; lower-case letters (insertions in
            .a3m format) are removed from the remaining lines.
        NOTEBOOK_DIR: Path below which the binary is expected at
            ``ccmpred/bin/ccmpred``.
        DATASET_NAME: Used as the prefix of the temporary file names.
        TASK_ID: Used in the temporary file names.
        TASK_COUNT: Used in the temporary file names.

    Returns:
        Path (str) of the raw file requested from CCMpred with ``-r``. The file
        exists only if this call produced it, so callers should check for it.
    """

    def write_to_aln(alignment, outPath, outFile, TASK_ID, TASK_COUNT):
        """Write the aligned sequences, one per line, and return the file path."""
        outDir = Path(outPath).resolve()
        outDir.mkdir(exist_ok=True)
        outFile = "{}/{}_ccmpred_task_{}_of_{}.aln".format(
            outDir, outFile, TASK_ID, TASK_COUNT
        )
        with open(outFile, "w") as fout:
            for line in alignment:
                if line == "" or line[0] == ">":
                    continue
                # remove insertions (lower-case letters in .a3m format)
                seq = "".join(x for x in line if not x.islower())
                # Terminate every sequence with exactly one newline so that lines
                # lacking a line ending are not glued to the next sequence.
                fout.write(seq.rstrip("\r\n") + "\n")
        return outFile

    ccmpred_path = str(NOTEBOOK_DIR.joinpath("ccmpred/bin/ccmpred"))
    aln_path = write_to_aln(
        alignment, tempfile.gettempdir(), DATASET_NAME, TASK_ID, TASK_COUNT
    )
    outFile = str(Path(aln_path).parent.joinpath(Path(aln_path).stem + ".mat"))
    rawFile = str(Path(aln_path).parent.joinpath(Path(aln_path).stem + ".raw"))

    # Delete results of an earlier call. Otherwise a failed run would leave the
    # previous alignment's files in place and they would be parsed as if they
    # belonged to this alignment.
    for stale_path in (outFile, rawFile):
        Path(stale_path).unlink(missing_ok=True)

    # With raw probability matrix
    bashCommand = [ccmpred_path, "-r", rawFile, aln_path, outFile]
    process = subprocess.run(bashCommand, capture_output=True)
    if process.returncode != 0:
        # Surface the failure instead of discarding it; the caller still decides
        # what to do based on whether the raw file exists.
        stderr_tail = process.stderr.decode(errors="replace").strip()[-2000:]
        print(f"ccmpred exited with code {process.returncode}: {stderr_tail}")

    return rawFile

In [18]:
# Parse the CCMpred raw output file into per-column and per-column-pair matrices
# Output: (d, s) where d[str(i)][str(j)] is the 21x21 matrix for MSA columns i, j
# and s is the L x 20 matrix of single-column values


def get_aa_prob_matrix(rawFile):
    """Load a CCMpred raw file.

    Args:
        rawFile: Path of the raw file written by CCMpred.

    Returns:
        A tuple ``(d, single_prob_matrix)``:

        * ``d`` is a nested dict keyed by column indices as strings.
          ``d[i][j]`` and ``d[j][i]`` both hold the same (untransposed) matrix
          read from the block headed by columns ``i`` and ``j``.
        * ``single_prob_matrix`` is a float array built from the 20-field lines,
          one row per line.
    """

    def parse_raw_prob(rawFile):
        """Split the file into single-column rows, pair blocks and pair headers."""
        single_rows = []
        pair_blocks = []
        pair_headers = []
        current_block = []

        with open(rawFile, "r") as fin:
            for line in fin:
                fields = line.strip().split("\t")
                n_fields = len(fields)
                if n_fields == 1:
                    # Header line ("# i j"): it closes the previous block, if any.
                    if current_block:
                        pair_blocks.append(current_block)
                    pair_headers.append(fields)
                    current_block = []
                elif n_fields == 20:
                    single_rows.append(fields)
                elif n_fields == 21:
                    current_block.append(fields)

        # add last block
        pair_blocks.append(current_block)

        # `np.float_` was removed in NumPy 2.0; an explicit dtype works on every version.
        return (
            np.asarray(single_rows, dtype=float),
            np.asarray(pair_blocks, dtype=float),
            pair_headers,
        )

    single_prob_matrix, pairwise_prob_matrix, aa_pair_id = parse_raw_prob(rawFile)

    # "# i j" -> ["i", "j"]
    aa_pair = [header[0].split(" ")[1:] for header in aa_pair_id]

    # One inner dict per alignment column; the keys assume that the column
    # indices in the headers are the contiguous integers 0..L-1.
    d = {str(i): {} for i in range(len(np.unique(aa_pair)))}

    for idx, pair in enumerate(aa_pair):
        d[pair[0]][pair[1]] = pairwise_prob_matrix[idx]
        d[pair[1]][pair[0]] = pairwise_prob_matrix[idx]

    return d, single_prob_matrix

In [19]:
def get_single_aa_logproba(i, aa, s):
    """Look up the single-column value of amino acid `aa` at alignment column `i`.

    Args:
        i: Zero-based column index into `s`.
        aa: One-letter amino acid code; must be one of the 20 letters below.
        s: Matrix indexed as ``s[column][amino_acid_index]``.

    Returns:
        A dict with the single key ``"logproba_i"``.

    Raises:
        ValueError: If `aa` is not part of the alphabet.
    """
    # Amino acid order of the columns of `s`; obtained by running CCMpred on a
    # sample sequence containing all amino acids.
    aa_column = "ARNDCQEGHILKMFPSTWYV".index(aa)
    value = s[i][aa_column]
    return {"logproba_i": value}

In [20]:
def get_pairwise_aa_logprobas(sequence, i, aa, d):
    """Collect the pairwise values between `aa` at column `i` and every other residue.

    For each position ``j != i`` of `sequence`, the 21x21 matrices ``d[i][j]`` and
    ``d[j][i]`` are read at both index orders (``[aa, other]`` and ``[other, aa]``).

    Args:
        sequence: Residues of the protein, one character per alignment column.
        i: Zero-based index of the column of interest.
        aa: Residue placed at column `i` (wild-type or mutant).
        d: Nested dict of 21x21 arrays keyed by column indices as strings.

    Returns:
        A dict with the keys ``logproba_ij_ab``, ``logproba_ji_ab``,
        ``logproba_ij_ba`` and ``logproba_ji_ba``, each a list with one value per
        position ``j != i``.

    Raises:
        ValueError: If `aa` or a residue of `sequence` is not in the alphabet.
    """
    # Obtained by running CCMpred on a sample sequence of all amino acids + gap.
    # NOTE(review): the gap is assumed to be the FIRST state here; upstream CCMpred
    # appears to place the gap last (index 20) - verify against the CCMpred source
    # before relying on these values.
    alphabet = "-ARNDCQEGHILKMFPSTWYV"

    target_idx = alphabet.index(aa)

    ij_ab, ji_ab, ij_ba, ji_ba = [], [], [], []
    for j, partner_aa in enumerate(sequence):
        if j == i:
            continue
        partner_idx = alphabet.index(partner_aa)
        d_ij = d[str(i)][str(j)]
        d_ji = d[str(j)][str(i)]
        assert d_ij.shape == (21, 21)
        assert d_ji.shape == (21, 21)
        ij_ab.append(d_ij[target_idx, partner_idx])
        ji_ab.append(d_ji[target_idx, partner_idx])
        ij_ba.append(d_ij[partner_idx, target_idx])
        ji_ba.append(d_ji[partner_idx, target_idx])

    return {
        "logproba_ij_ab": ij_ab,
        "logproba_ji_ab": ji_ab,
        "logproba_ij_ba": ij_ba,
        "logproba_ji_ba": ji_ba,
    }

In [21]:
def get_mutation_scores(sequence, mutation, s, d):
    """Compute the CCMpred-based scores for one mutation.

    Args:
        sequence: Protein sequence, passed on to the pairwise lookup.
        mutation: Mutation string: wild-type residue, 1-based position, mutant
            residue (e.g. ``"K14Q"``).
        s: Single-column matrix, passed on to `get_single_aa_logproba`.
        d: Pairwise matrices, passed on to `get_pairwise_aa_logprobas`.

    Returns:
        A dict mapping ``<key>_wt`` / ``<key>_mut`` to the mean of the
        corresponding values, with the single-column entries first, followed by
        the pairwise wild-type entries and then the pairwise mutant entries.
    """
    wt_aa, mut_aa = mutation[0], mutation[-1]
    # Positions in the mutation string are 1-based.
    pos = int(mutation[1:-1]) - 1

    single_wt = get_single_aa_logproba(pos, wt_aa, s)
    single_mut = get_single_aa_logproba(pos, mut_aa, s)

    pairwise_wt = get_pairwise_aa_logprobas(sequence, pos, wt_aa, d)
    pairwise_mut = get_pairwise_aa_logprobas(sequence, pos, mut_aa, d)

    # Insertion order determines the column order of the final table.
    results = {}
    for values, suffix in (
        (single_wt, "wt"),
        (single_mut, "mut"),
        (pairwise_wt, "wt"),
        (pairwise_mut, "mut"),
    ):
        for key, value in values.items():
            results[f"{key}_{suffix}"] = np.mean(value)

    return results

In [22]:
def validate_mutation(mutation):
    """Return True if `mutation` is a well-formed, non-synonymous substitution.

    A valid mutation consists of a wild-type residue, a position without leading
    zeros, and a mutant residue (e.g. ``"K14Q"``), with nothing before or after.
    A message is printed for every rejected mutation.

    Args:
        mutation: Mutation string to check. Non-string values are rejected.

    Returns:
        True if the mutation should be scored, False if it should be skipped.
    """
    aa = "GVALICMFWPDESTYQNKRH"
    # `fullmatch` is used instead of `^...$` because `$` also matches just before
    # a trailing newline, which would let values such as "K14Q\n" through.
    if (
        not isinstance(mutation, str)
        or re.fullmatch(f"[{aa}][1-9][0-9]*[{aa}]", mutation) is None
    ):
        print(f"Skipping mutation {mutation} because it appears to be malformed.")
        return False

    if mutation[0] == mutation[-1]:
        print(
            f"Skipping mutation {mutation} because the wildtype and mutant residues are the same."
        )
        return False

    return True

In [23]:
# Score every mutation of every protein in the input table.
results = []
for tup in input_df.itertuples(index=False):

    # Every per-mutation column must hold exactly one entry per mutation.
    assert all(
        len(getattr(tup, field)) == len(tup.mutation) for field in iterable_fields
    )

    rawFile = run_ccmpred(
        tup.alignment, NOTEBOOK_DIR, DATASET_NAME, TASK_ID, TASK_COUNT
    )

    if not Path(rawFile).exists():
        # CCMpred produced no raw output; processing stops here, so any remaining
        # rows of the table are not scored.
        print("ccmpred file not found - check memory issues")
        break

    d, s = get_aa_prob_matrix(rawFile)

    # Read the identifier from whichever column was detected earlier
    # ("protein_id" or "uniprot_id"); the output column is always "protein_id".
    protein_id = getattr(tup, protein_id_column)

    for mutation_idx, mutation in enumerate(tup.mutation):
        if not validate_mutation(mutation):
            continue

        scores = get_mutation_scores(tup.sequence, mutation, s, d)

        results.append(
            {
                "protein_id": protein_id,
                "mutation": mutation,
            }
            | {field: getattr(tup, field)[mutation_idx] for field in iterable_fields}
            | {f"ccmpred_{key}": value for key, value in scores.items()}
        )

results_df = pd.DataFrame(results)

In [24]:
# Preview the first two scored mutations and report how many rows were produced.
display(results_df.head(2))
print(len(results_df))

Unnamed: 0,protein_id,mutation,effect,ccmpred_logproba_i_wt,ccmpred_logproba_i_mut,ccmpred_logproba_ij_ab_wt,ccmpred_logproba_ji_ab_wt,ccmpred_logproba_ij_ba_wt,ccmpred_logproba_ji_ba_wt,ccmpred_logproba_ij_ab_mut,ccmpred_logproba_ji_ab_mut,ccmpred_logproba_ij_ba_mut,ccmpred_logproba_ji_ba_mut
0,A0A0C5B5G6,K14Q,US,3.94491,-0.140137,-0.000168,-0.000168,-8.4e-05,-8.4e-05,-8.4e-05,-8.4e-05,-8.4e-05,-8.4e-05


1


In [25]:
# Persist the results of real (non-debug) runs that produced at least one row.
if not DEBUG and not results_df.empty:
    # Write to a temporary name first and rename afterwards. The existence of
    # `output_file` is what marks a task as finished, so a job killed mid-write
    # must not leave a truncated file under the final name.
    partial_output_file = output_file.with_name(output_file.name + ".tmp")
    pq.write_table(
        pa.Table.from_pandas(results_df, preserve_index=False), partial_output_file
    )
    partial_output_file.replace(output_file)