## Summary

---

## Imports

In [1]:
import functools
from pathlib import Path

import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-1922apv4 because the default path (/home/p/pmkim/strokach/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
pd.set_option("max_columns", 1000)
pd.set_option("max_rows", 1000)

## Parameters

In [3]:
NOTEBOOK_DIR = Path("37_humsavar_combine_results").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/37_humsavar_combine_results')

## Load results

In [4]:
DATASET_NAME = "humsavar"
DATASET_PATH = str(
    NOTEBOOK_DIR.parent.joinpath("30_humsavar", "humsavar-gby-protein.parquet")
)
DATASET_ALN_PATH = str(
    NOTEBOOK_DIR.parent.joinpath("30_humsavar", "humsavar-gby-protein-waln.parquet")
)
TASK_COUNT = 612
TASK_COUNT_ALN = 12557

DATASET_NAME, DATASET_PATH, TASK_COUNT, TASK_COUNT_ALN

('humsavar',
 '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/30_humsavar/humsavar-gby-protein.parquet',
 612,
 12557)

In [5]:
pfile = pq.ParquetFile(DATASET_PATH)

assert TASK_COUNT == pfile.num_row_groups

In [6]:
total_num_mutations = 0
for row_group in tqdm(range(pfile.num_row_groups)):
    num_mutations = (
        pfile.read_row_group(row_group, columns=["mutation"])
        .to_pandas()["mutation"]
        .str.len()
        .sum()
    )

    total_num_mutations += num_mutations

total_num_mutations

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




61179

In [7]:
pfile_aln = pq.ParquetFile(DATASET_ALN_PATH)

assert TASK_COUNT_ALN == pfile_aln.num_row_groups

In [8]:
total_num_aln_mutations = 0
for row_group in tqdm(range(pfile_aln.num_row_groups)):
    num_mutations = (
        pfile_aln.read_row_group(row_group, columns=["mutation"])
        .to_pandas()["mutation"]
        .str.len()
        .sum()
    )

    total_num_aln_mutations += num_mutations

total_num_aln_mutations

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




61174

In [9]:
def get_result_files(result_dir, task_count=TASK_COUNT):
    if "msa_analysis" in str(result_dir):
        prefix = "result"
    else:
        prefix = "shard"

    present_files = []
    missing_files = []
    for i in tqdm(range(1, task_count + 1)):
        path = result_dir.joinpath(f"{prefix}-{i}-of-{task_count}.parquet")
        if path.is_file():
            present_files.append(path)
        else:
            missing_files.append(path)
    return present_files, missing_files

In [28]:
def read_files(files, columns=None):
    dfs = []
    for file in tqdm(files):
        try:
            df = pq.read_table(file, columns=columns).to_pandas(integer_object_nulls=True)
        except pa.ArrowInvalid as error:
            print(error)
            continue
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [11]:
def read_rosetta_files(pfile):
    result_dfs = []
    for task_id in tqdm(range(1, pfile.num_row_groups + 1)):
        row = (
            pfile.read_row_group(task_id - 1, columns=["protein_id", "mutation"])
            .to_pandas()
            .iloc[0]
        )

        input_df = pd.DataFrame(
            {
                "protein_id": [row["protein_id"]] * len(row["mutation"]),
                "mutation": row["mutation"],
            }
        )

        path = NOTEBOOK_DIR.parent.joinpath(
            "31_run_rosetta_ddg",
            DATASET_NAME,
            f"shard-{task_id}-of-{TASK_COUNT}.parquet",
        )
        try:
            rosetta_df = pq.read_table(path).to_pandas()
        except pa.ArrowInvalid:
            print(f"Unreadable file for {path}")
            continue

        if rosetta_df.empty:
            print(f"Empty file for {path}")
            continue

        del rosetta_df["protein_id"]

        result_df = input_df.merge(rosetta_df, on=["mutation"])
        result_dfs.append(result_df)
    return pd.concat(result_dfs, ignore_index=True)

### ProteinSolver

In [None]:
ps_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_proteinsolver", DATASET_NAME)

In [None]:
present_files, missing_files = get_result_files(ps_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

In [None]:
result_ps_df = read_files(present_files)

In [None]:
display(result_ps_df.head(2))
print(len(result_ps_df))

assert len(result_ps_df) == total_num_mutations, len(result_ps_df)

### ProtBert

In [None]:
pb_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_protbert", DATASET_NAME)

In [None]:
present_files, missing_files = get_result_files(pb_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

In [None]:
result_pb_df = read_files(present_files)

In [None]:
display(result_pb_df.head(2))
print(len(result_pb_df))

assert len(result_pb_df) == total_num_mutations

In [None]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# protbert_keys = set(result_pb_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# protbert_missing_keys = proteinsolver_keys - protbert_keys

In [None]:
# missing_task_ids = []
# for row_group in tqdm(range(pfile.num_row_groups)):
#     df = pfile.read_row_group(
#         row_group, columns=["uniprot_id", "mutation"]
#     ).to_pandas()[["uniprot_id", "mutation"]]

#     row_keys = set()
#     for tup in df.itertuples():
#         for mutation in tup.mutation:
#             row_keys.add((tup.uniprot_id, mutation))

#     if protbert_missing_keys & row_keys:
#         missing_task_ids.append(row_group + 1)

# missing_task_ids

### Rosetta

In [None]:
ra_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_rosetta_ddg", DATASET_NAME)

In [None]:
present_files, missing_files = get_result_files(ra_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

In [None]:
result_ra_df = read_files(present_files)

In [None]:
display(result_ra_df.head(2))
print(len(result_ra_df))

assert len(result_ra_df) == total_num_mutations

In [None]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# rosetta_keys = set(result_ra_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# rosetta_missing_keys = proteinsolver_keys - rosetta_keys

# assert not rosetta_missing_keys

### MSA

In [None]:
msa_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_msa_analysis", DATASET_NAME)

In [None]:
present_files, missing_files = get_result_files(msa_result_dir, TASK_COUNT_ALN)

assert len(missing_files) == 0
len(present_files), len(missing_files)

In [None]:
result_msa_df = read_files(present_files)

In [None]:
display(result_msa_df.head(2))
print(len(result_msa_df))

assert len(result_msa_df) == total_num_aln_mutations

In [None]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# msa_keys = set(result_pb_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# msa_missing_keys = proteinsolver_keys - msa_keys

In [None]:
# missing_task_ids = []
# for row_group in tqdm(range(pfile_aln.num_row_groups)):
#     df = pfile_aln.read_row_group(row_group, columns=["protein_id", "mutation"]).to_pandas()[
#         ["protein_id", "mutation"]
#     ]

#     row_keys = set()
#     for tup in df.itertuples():
#         for mutation in tup.mutation:
#             row_keys.add((tup.protein_id, mutation))

#     if protbert_missing_keys & row_keys:
#         missing_task_ids.append(row_group + 1)

# missing_task_ids

### AlphaFold WT

In [21]:
afwt_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_alphafold_wt", DATASET_NAME)

In [22]:
present_files, missing_files = get_result_files(afwt_result_dir, TASK_COUNT_ALN)

len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




(12002, 555)

In [25]:
protein_mutations_df = pq.read_table(DATASET_ALN_PATH, columns=["protein_id", "mutation"])

In [29]:
columns = [
    "protein_id",
    "experimentally_resolved",
    "predicted_lddt",
    "msa_first_row",
    "single",
    "structure_module",
    "plddt",
    "max_predicted_aligned_error",
    "ptm",
]
result_afwt_df = read_files(present_files, columns=columns).set_index("protein_id")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12002.0), HTML(value='')))

Could not open Parquet input source '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_alphafold_wt/humsavar/shard-10366-of-12557.parquet': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
Could not open Parquet input source '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_alphafold_wt/humsavar/shard-10408-of-12557.parquet': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
Could not open Parquet input source '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_alphafold_wt/humsavar/shard-10452-of-12557.parquet': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.
Could not open Parquet input source '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_alphafold_wt/humsavar/shard-10486-of-12557.parquet': Parquet magic bytes not found

In [30]:
display(result_afwt_df.head(2))
print(len(result_afwt_df))

Unnamed: 0_level_0,experimentally_resolved,predicted_lddt,msa_first_row,single,structure_module,plddt,max_predicted_aligned_error,ptm
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A0A0C5B5G6,"[[0.23085688054561615, 0.2812912166118622, 0.4...","[[-4.428761959075928, -4.958076477050781, -4.2...","[[-9.192195892333984, -7.8160529136657715, -3....","[[1.0471205711364746, 50.2182731628418, 9.5015...","[[0.006925947964191437, 0.015513338148593903, ...","[57.36422541091451, 58.55118690727977, 63.5633...",31.75,0.027069
P0CJ72,"[[0.44773733615875244, 0.44976410269737244, 0....","[[-4.362471580505371, -4.686018943786621, -3.9...","[[-5.28439474105835, -4.713302135467529, -8.87...","[[-20.71187400817871, 50.35539245605469, 9.119...","[[0.005178235471248627, 0.02119813859462738, -...","[50.6758737960481, 53.870554884328136, 57.2565...",31.75,0.114342


11983


In [31]:
def get_mutation_embeddings(idx, predictions):
    assert idx >= 0

    def as_residue(x):
        return x[idx].astype(np.float32)

    def as_protein(x):
        return x.mean(axis=0).astype(np.float32)

    embeddings = {
        "experimentally_resolved": predictions["experimentally_resolved"],
        "predicted_lddt": predictions["predicted_lddt"],
        "msa_first_row": predictions["msa_first_row"],
        "single": predictions["single"],
        "structure_module": predictions["structure_module"],
    }

    output = {
        "scores_residue_plddt": predictions["plddt"][idx],
        "scores_protein_plddt": np.mean(predictions["plddt"]),
        "scores_protein_max_predicted_aligned_error": predictions["max_predicted_aligned_error"],
        "scores_proten_ptm": predictions["ptm"],
        **{f"features_residue_{key}": as_residue(value) for key, value in embeddings.items()},
        **{f"features_protein_{key}": as_protein(value) for key, value in embeddings.items()},
    }

    return output


# get_mutation_embeddings(0, result_afwt_df.loc["Q8N5M1"])

In [32]:
protein_mutation_lookup = protein_mutations_df.set_index("protein_id")["mutation"].asdict()

AttributeError: 'pyarrow.lib.Table' object has no attribute 'set_index'

In [None]:
results = []
for tup in tqdm(result_afwt_df.itertuples(), total=len(result_afwt_df)):
    mutations = protein_mutation_lookup[tup.protein_id]
    for mutation in mutations:
        features = {
            f"alphafold_core_{key}_wt": value
            for key, value in get_mutation_embeddings(
                int(mutation[1:-1]) - 1, predictions
            ).items()
        }
        results.append(
            {
                "protein_id": protein_id,
                "mutation": mutation,
            }
            | features
        )

result_af_df = pd.DataFrame(results)
len(result_af_df)

## Combine results

In [None]:
len(result_ps_df) == len(result_ps_df["mutation_id"].unique())

In [None]:
result_df = (
    result_ps_df.merge(
        result_pb_df, on=["protein_id", "mutation", "mutation_id", "effect"], how="left"
    )
    .merge(result_msa_df, on=["protein_id", "mutation", "mutation_id"], how="left")
    .merge(result_ra_df, on=["protein_id", "mutation", "mutation_id", "effect"], how="left")
    .merge(
        pd.concat([result_af_df, result_af2_df], ignore_index=True),
        on=["protein_id", "mutation", "mutation_id", "effect"],
        how="left",
    )
)

assert len(result_df) == len(result_df["mutation_id"].unique())
assert not (
    set(result_pb_df["mutation_id"])
    | set(result_ra_df["mutation_id"])
    | set(result_af_df["mutation_id"])
) - set(result_df["mutation_id"])

In [None]:
display(result_df.head(2))
print(len(result_df))

## Calculate EL2 score

In [None]:
model = el2.ELASPIC2()

In [None]:
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_core_score")]
protbert_columns = [c for c in result_df if c.startswith("protbert_core_")]

el2_missing = result_df[proteinsolver_columnms + protbert_columns].isnull().any(axis=1)
result_df["el2_score"] = np.nan
result_df.loc[~el2_missing, "el2_score"] = model.predict_mutation_effect(
    [
        t._asdict()
        for t in result_df.loc[~el2_missing, proteinsolver_columnms + protbert_columns].itertuples(
            index=False
        )
    ]
)

## Calculate deltas

In [None]:
for column in list(result_df):
    if not column.endswith("_mut"):
        continue

    column_wt = column.removesuffix("_mut") + "_wt"
    if column_wt not in result_df:
        print(column_wt)
        continue

    column_change = column.removesuffix("_mut") + "_change"
    result_df[column_change] = result_df[column_wt] - result_df[column]
    del result_df[column]

## Encode mutation

In [None]:
amino_acids = list("ARNDCEQGHILKMFPSTWYV")

# TODO: AGGGH, this is stupid and dangerous! Refactor!!!
result_df["aa_wt_onehot"] = pd.get_dummies(result_df["mutation"].str[0]).apply(list, axis=1)
result_df["aa_mut_onehot"] = pd.get_dummies(result_df["mutation"].str[-1]).apply(list, axis=1)

## Save results

In [None]:
output_file = NOTEBOOK_DIR.joinpath("combined-results.parquet")

output_file

In [None]:
pq.write_table(
    pa.Table.from_pandas(result_df, preserve_index=False),
    output_file,
    row_group_size=10_000,
)

## Exploratory data analysis

In [None]:
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [None]:
result_df["effect"].value_counts()

In [None]:
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -2,
    "Likely pathogenic": 1,
    "Pathogenic": 2,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
        "alphafold_core_scores_residue_plddt_wt",
#         "alphafold_core_scores_protein_plddt_wt",
#         "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
#         "alphafold_core_scores_proten_ptm_wt",
]

df = result_df.dropna(subset=score_columns + ["effect_score"])
df = df[df["effect_score"] != 0].reset_index(drop=True)

for col in score_columns:
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc)

In [None]:
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
    "alphafold_core_scores_residue_plddt_wt",
    "alphafold_core_scores_residue_plddt_change",
#         "alphafold_core_scores_protein_plddt_wt",
#         "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
#         "alphafold_core_scores_proten_ptm_wt",
]

for column in score_columns:
    print(f"{column} {result_df[column].isnull().sum()}")