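## Summary

Combine the per-tool results for the `humsavar` dataset (ProteinSolver, ProtBert, Rosetta, MSA analysis, AlphaFold and CCMpred) into a single table keyed by `(protein_id, mutation)`, add the ELASPIC2 score, wild-type/mutant deltas and residue encodings, save the table to `combined-results.parquet`, and report how well each score separates `LP/P` from `LB/B` variants.
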
---

## Imports

In [1]:
import functools
from pathlib import Path

import elaspic2 as el2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from elaspic2.plugins.msa_analysis import analyze_msa
from scipy import stats
from sklearn import metrics, model_selection
from tqdm.auto import tqdm

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-05krxtv0 because the default path (/home/p/pmkim/strokach/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Show wide / long frames in full. Use the fully qualified option names:
# pandas resolves option names by pattern, and a bare "max_columns" /
# "max_rows" can match more than one registered option (e.g. the
# "styler.render.*" ones in newer pandas), which raises an OptionError.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [3]:
# Directory for this notebook's outputs, anchored at the current working
# directory; created on first run.
NOTEBOOK_DIR = Path.cwd().joinpath("37_humsavar_combine_results").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/37_humsavar_combine_results')

## Load results

In [4]:
DATASET_NAME = "humsavar"

# Input tables written by the sibling "30_humsavar" notebook directory.
DATASET_PATH = str(NOTEBOOK_DIR.parent / "30_humsavar" / "humsavar-gby-protein.parquet")
DATASET_ALN_PATH = str(NOTEBOOK_DIR.parent / "30_humsavar" / "humsavar-gby-protein-waln.parquet")

# Expected number of row groups in DATASET_PATH / DATASET_ALN_PATH
# (both are asserted against the files further down).
TASK_COUNT = 612
TASK_COUNT_ALN = 12557

DATASET_NAME, DATASET_PATH, TASK_COUNT, TASK_COUNT_ALN

('humsavar',
 '/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/30_humsavar/humsavar-gby-protein.parquet',
 612,
 12557)

In [5]:
# Open the per-protein dataset and confirm its row-group count matches the
# number of tasks the result shards are expected to be split into.
pfile = pq.ParquetFile(DATASET_PATH)

assert pfile.num_row_groups == TASK_COUNT

In [6]:
# Count the mutations in the dataset. Each "mutation" cell holds a sequence of
# mutations, so the per-row element count summed over all row groups gives the
# total.
total_num_mutations = 0
for row_group in tqdm(range(pfile.num_row_groups)):
    mutation_lists = pfile.read_row_group(row_group, columns=["mutation"]).to_pandas()["mutation"]
    num_mutations = mutation_lists.str.len().sum()
    total_num_mutations = total_num_mutations + num_mutations

total_num_mutations

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




61179

In [7]:
# Open the alignment-level dataset and confirm its row-group count matches the
# number of tasks used by the alignment-based result shards.
pfile_aln = pq.ParquetFile(DATASET_ALN_PATH)

assert pfile_aln.num_row_groups == TASK_COUNT_ALN

In [8]:
# Same count as for the per-protein dataset, but over the alignment-level file:
# sum the number of entries in every "mutation" cell across all row groups.
total_num_aln_mutations = 0
for row_group in tqdm(range(pfile_aln.num_row_groups)):
    mutation_lists = pfile_aln.read_row_group(row_group, columns=["mutation"]).to_pandas()[
        "mutation"
    ]
    num_mutations = mutation_lists.str.len().sum()
    total_num_aln_mutations = total_num_aln_mutations + num_mutations

total_num_aln_mutations

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




61174

In [9]:
def get_result_files(result_dir, task_count=TASK_COUNT):
    """Split the expected per-task result files of a run into present and missing.

    One file named ``{prefix}-{i}-of-{task_count}.parquet`` is expected for
    every ``i`` in ``1..task_count``. The prefix is picked from the directory
    path: ``result`` for msa_analysis / ccmpred runs, ``results`` for the
    AlphaFold run and ``shard`` for everything else.

    Args:
        result_dir: Directory holding the result files (a ``Path``-like object
            providing ``joinpath``).
        task_count: Number of tasks the run was split into.

    Returns:
        ``(present_files, missing_files)``: two lists of paths in task order.
    """
    dir_name = str(result_dir)
    if "msa_analysis" in dir_name or "ccmpred" in dir_name:
        prefix = "result"
    elif "31_run_alphafold/" in dir_name:
        prefix = "results"
    else:
        prefix = "shard"

    found, absent = [], []
    for task_id in tqdm(range(1, task_count + 1)):
        candidate = result_dir.joinpath(f"{prefix}-{task_id}-of-{task_count}.parquet")
        # Route the path to whichever list matches its on-disk status.
        (found if candidate.is_file() else absent).append(candidate)
    return found, absent

In [10]:
def read_files(files, columns=None):
    """Read parquet files and concatenate them into a single DataFrame.

    Reading is best effort: a file that Arrow rejects (``pa.ArrowInvalid``)
    is reported and skipped instead of aborting the whole load, so callers
    should verify the resulting row count.

    Args:
        files: Iterable of parquet file paths.
        columns: Optional list of column names to load; ``None`` loads all.

    Returns:
        The concatenated rows with a fresh ``RangeIndex``. If no file could be
        read (or ``files`` is empty), an empty DataFrame is returned.
    """
    dfs = []
    for file in tqdm(files):
        try:
            df = pq.read_table(file, columns=columns).to_pandas(integer_object_nulls=True)
        except pa.ArrowInvalid as error:
            # Name the offending file so the shard can be located and re-run.
            print(f"Skipping unreadable file {file}: {error}")
            continue
        dfs.append(df)
    if not dfs:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # so the caller's own length assertions report the problem instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs, ignore_index=True)

In [11]:
def read_rosetta_files(pfile):
    """Load the Rosetta ddG shards and attach them to the input mutations.

    For every row group of ``pfile`` (task ids are 1-based), the matching
    shard ``shard-{task_id}-of-{task_count}.parquet`` is read from the
    ``31_run_rosetta_ddg/<DATASET_NAME>`` directory and inner-joined to that
    row group's mutations on ``mutation``. Unreadable or empty shards are
    reported and skipped.

    Args:
        pfile: ``pq.ParquetFile`` with ``protein_id`` and ``mutation`` columns.
            Only the first row of each row group is used, so each row group is
            assumed to describe a single protein.

    Returns:
        A DataFrame of ``protein_id``, ``mutation`` and the Rosetta columns;
        empty (key columns only) if no shard yielded any rows.
    """
    # Derive the shard count from the file actually passed in rather than from
    # the global TASK_COUNT, so shard names stay correct for any input file.
    task_count = pfile.num_row_groups
    rosetta_dir = NOTEBOOK_DIR.parent.joinpath("31_run_rosetta_ddg", DATASET_NAME)

    result_dfs = []
    for task_id in tqdm(range(1, task_count + 1)):
        row = (
            pfile.read_row_group(task_id - 1, columns=["protein_id", "mutation"])
            .to_pandas()
            .iloc[0]
        )

        # Expand the per-protein mutation list into one row per mutation.
        input_df = pd.DataFrame(
            {
                "protein_id": [row["protein_id"]] * len(row["mutation"]),
                "mutation": row["mutation"],
            }
        )

        path = rosetta_dir.joinpath(f"shard-{task_id}-of-{task_count}.parquet")
        try:
            rosetta_df = pq.read_table(path).to_pandas()
        except pa.ArrowInvalid:
            print(f"Unreadable file for {path}")
            continue

        if rosetta_df.empty:
            print(f"Empty file for {path}")
            continue

        # protein_id is taken from the input table; drop the shard's copy so
        # the merge does not produce suffixed duplicate columns.
        del rosetta_df["protein_id"]

        result_df = input_df.merge(rosetta_df, on=["mutation"])
        result_dfs.append(result_df)

    if not result_dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=["protein_id", "mutation"])
    return pd.concat(result_dfs, ignore_index=True)

### ProteinSolver

In [12]:
ps_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_proteinsolver", DATASET_NAME)

ps_result_dir

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_proteinsolver/humsavar')

In [13]:
present_files, missing_files = get_result_files(ps_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




(612, 0)

In [14]:
result_ps_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




In [15]:
display(result_ps_df.head(2))
print(len(result_ps_df))

assert len(result_ps_df) == total_num_mutations, len(result_ps_df)

Unnamed: 0,protein_id,mutation,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut
0,A0A0C5B5G6,K14Q,US,0.070044,0.04016,"[-0.74697345495224, 1.316953420639038, 0.01530...","[-1.1096992492675781, 0.785698652267456, 0.256...","[0.05442849546670914, 0.7043757438659668, -0.2...","[-1.0095055103302002, 0.7263880372047424, 0.24..."
1,A1L188,F55L,US,0.03052,0.088642,"[-2.3987135887145996, 0.10982956737279892, -0....","[-1.777479648590088, -0.03869394585490227, 0.0...","[-4.062615871429443, -0.34137260913848877, -0....","[-1.7952650785446167, -0.047699496150016785, 0..."


61179


### ProtBert

In [16]:
pb_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_protbert", DATASET_NAME)

pb_result_dir

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_protbert/humsavar')

In [17]:
present_files, missing_files = get_result_files(pb_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




(612, 0)

In [18]:
result_pb_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




In [19]:
display(result_pb_df.head(2))
print(len(result_pb_df))

assert len(result_pb_df) == total_num_mutations

Unnamed: 0,protein_id,mutation,effect,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut
0,A0A0C5B5G6,K14Q,US,0.058932,0.033909,"[-0.005357364658266306, 0.032247915863990784, ...","[0.03903713822364807, -0.016466135159134865, 0...","[-0.0028596159536391497, 0.004915683530271053,...","[0.03650001436471939, -0.025432726368308067, 0..."
1,A1L188,F55L,US,0.093019,0.143134,"[0.1373966485261917, 0.06155325844883919, 0.02...","[0.05456249415874481, 0.024172015488147736, 0....","[0.14880749583244324, 0.08820129185914993, 0.0...","[0.056186407804489136, 0.03005887381732464, 0...."


61179


In [20]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# protbert_keys = set(result_pb_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# protbert_missing_keys = proteinsolver_keys - protbert_keys

In [21]:
# missing_task_ids = []
# for row_group in tqdm(range(pfile.num_row_groups)):
#     df = pfile.read_row_group(
#         row_group, columns=["uniprot_id", "mutation"]
#     ).to_pandas()[["uniprot_id", "mutation"]]

#     row_keys = set()
#     for tup in df.itertuples():
#         for mutation in tup.mutation:
#             row_keys.add((tup.uniprot_id, mutation))

#     if protbert_missing_keys & row_keys:
#         missing_task_ids.append(row_group + 1)

# missing_task_ids

### Rosetta

In [22]:
ra_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_rosetta_ddg", DATASET_NAME)

ra_result_dir

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_rosetta_ddg/humsavar')

In [23]:
present_files, missing_files = get_result_files(ra_result_dir)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




(612, 0)

In [24]:
result_ra_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=612.0), HTML(value='')))




In [25]:
display(result_ra_df.head(2))
print(len(result_ra_df))

assert len(result_ra_df) == total_num_mutations

Unnamed: 0,protein_id,mutation,effect,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change
0,O97980,H16Y,LB/B,71.898,41.522333,-23.888,21.799667,2.173333,53.990667,33.745333,43.407667,103.566667,0.0,-1.52,0.0,0.0,-5.942,-8.984333,-15.621333,-0.212,-0.040333,-45.462,20.883667,11.069667,3.2,-14.431333,91.535333,4.537333,-99.084667,288.143333,0.427,-0.115333,0.83,0.029333,0.168667,0.097333,0.160667,0.111333,-0.000667,0.0,0.0,0.0,0.0,0.0,-0.063667,-0.588667,-0.174,-0.020667,0.016,0.512333,-0.241667,0.011,-0.411667,-1.572333,0.332667,-1.902333,-2.392333
1,A0A0C5B5G6,K14Q,US,28.964,23.071,-2.435,0.003,2.241,17.628,15.728,14.727,26.549,0.0,0.0,0.0,0.0,-1.824,-1.16,-1.069,-0.295,-0.028,-16.627,7.977,4.731,1.432,-8.683,32.06,1.931,-42.063,102.859,1.25,-0.065,0.61,0.001,0.002,2.678,-1.646,0.083,0.149,0.0,0.0,-1.137,0.0,0.0,-0.253,-1.836,0.039,0.001,-0.808,0.576,0.172,0.066,-0.205,1.423,0.07,-1.354,-0.186


61179


In [26]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# rosetta_keys = set(result_ra_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# rosetta_missing_keys = proteinsolver_keys - rosetta_keys

# assert not rosetta_missing_keys

### MSA

In [27]:
msa_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_msa_analysis", DATASET_NAME)

msa_result_dir

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_msa_analysis/humsavar')

In [28]:
present_files, missing_files = get_result_files(msa_result_dir, TASK_COUNT_ALN)

assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




(12557, 0)

In [29]:
result_msa_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




In [30]:
display(result_msa_df.head(2))
print(len(result_msa_df))

assert len(result_msa_df) == total_num_aln_mutations

Unnamed: 0,protein_id,mutation,effect,msa_count_wt,msa_count_mut,msa_count_total,msa_proba_wt,msa_proba_mut,msa_proba_total,msa_length,msa_proba,msa_H,msa_KL
0,A0A0C5B5G6,K14Q,US,3.0,0.0,3.0,-1.7492,-3.135494,-61.32359,3,-1.7492,-0.0,2.849038
1,P0CJ72,T13I,LB/B,5.0,9.0,16.0,-1.791759,-1.280934,-66.477422,16,-1.160392,0.862912,1.853562


61174


In [31]:
# proteinsolver_keys = set(result_ps_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# msa_keys = set(result_msa_df[["protein_id", "mutation"]].apply(tuple, axis=1))
# msa_missing_keys = proteinsolver_keys - msa_keys

In [32]:
# missing_task_ids = []
# for row_group in tqdm(range(pfile_aln.num_row_groups)):
#     df = pfile_aln.read_row_group(row_group, columns=["protein_id", "mutation"]).to_pandas()[
#         ["protein_id", "mutation"]
#     ]

#     row_keys = set()
#     for tup in df.itertuples():
#         for mutation in tup.mutation:
#             row_keys.add((tup.protein_id, mutation))

#     if msa_missing_keys & row_keys:
#         missing_task_ids.append(row_group + 1)

# missing_task_ids

### AlphaFold

In [33]:
af_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_alphafold", DATASET_NAME)

af_result_dir

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/31_run_alphafold/humsavar')

In [34]:
present_files, missing_files = get_result_files(af_result_dir, TASK_COUNT_ALN)

# Fail fast on an incomplete run, as the other result sections do: a missing
# shard would otherwise only show up as NaNs after the left-merge.
assert len(missing_files) == 0
len(present_files), len(missing_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




(12557, 0)

In [35]:
result_af_df = read_files(present_files)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12557.0), HTML(value='')))




In [36]:
display(result_af_df.head(2))
print(len(result_af_df))

assert len(result_af_df) == total_num_aln_mutations

Unnamed: 0,protein_id,mutation,alphafold_core_score_plddt,alphafold_core_score_protein_plddt,alphafold_core_score_protein_max_predicted_aligned_error,alphafold_core_score_protein_ptm,alphafold_core_features_experimentally_resolved,alphafold_core_features_predicted_lddt,alphafold_core_features_msa_first_row,alphafold_core_features_single,alphafold_core_features_structure_module,alphafold_core_score_predicted_aligned_error_row_mean,alphafold_core_score_predicted_aligned_error_row_max,alphafold_core_score_predicted_aligned_error_col_mean,alphafold_core_score_predicted_aligned_error_col_max,alphafold_core_score_predicted_aligned_error_diag,alphafold_core_features_distogram_row_mean,alphafold_core_features_aligned_confidence_probs_row_mean,alphafold_core_features_pair_row_mean,alphafold_core_features_distogram_row_max,alphafold_core_features_aligned_confidence_probs_row_max,alphafold_core_features_pair_row_max,alphafold_core_features_distogram_col_mean,alphafold_core_features_aligned_confidence_probs_col_mean,alphafold_core_features_pair_col_mean,alphafold_core_features_distogram_col_max,alphafold_core_features_aligned_confidence_probs_col_max,alphafold_core_features_pair_col_max,alphafold_core_features_distogram_diag,alphafold_core_features_aligned_confidence_probs_diag,alphafold_core_features_pair_diag,alphafold_core_score_msa_logits_first_wt,alphafold_core_score_msa_logits_first_mut,alphafold_core_score_msa_logits_mean_wt,alphafold_core_score_msa_logits_mean_mut,alphafold_core_score_msa_logits_max_wt,alphafold_core_score_msa_logits_max_mut,alphafold_core_score_msa_logproba_first_wt,alphafold_core_score_msa_logproba_first_mut,alphafold_core_score_msa_logproba_mean_wt,alphafold_core_score_msa_logproba_mean_mut,alphafold_core_score_msa_logproba_max_wt,alphafold_core_score_msa_logproba_max_mut,alphafold_core_features_msa_first,alphafold_core_features_msa_mean,alphafold_core_features_msa_max
0,A0A0C5B5G6,K14Q,56.406708,62.003188,31.75,0.027069,"[0.0023722572, 0.018356942, 0.10303657, 0.1508...","[-5.2305098, -5.6203985, -4.857919, -3.7483947...","[-3.567015, -5.2736416, -9.009683, -3.05795, 5...","[24.392054, 3.2788322, 21.41848, 21.365389, 6....","[0.005355336, 0.010748506, -0.0057989154, 0.00...",8.574256,17.349895,9.382161,21.666225,0.25235,"[2.496045768260956, -4.968722492456436, -5.866...","[0.06330700959642854, 0.06389473706803983, 0.0...","[3.3381232991814613, 29.497473165392876, -2.04...","[114.23722839355469, 3.4389352798461914, 3.910...","[0.9999160766601562, 0.3767480254173279, 0.279...","[23.421672821044922, 601.749267578125, 35.1634...","[2.496045768260956, -4.968722492456436, -5.866...","[0.0632605097234773, 0.062133513205480995, 0.0...","[5.810895625501871, 36.592444146052, 0.6604679...","[114.23722839355469, 3.4389352798461914, 3.910...","[0.9999160766601562, 0.37512755393981934, 0.28...","[23.421672821044922, 601.749267578125, 35.1634...","[114.23722839355469, -3.2029566764831543, -26....","[0.9999160766601562, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[23.421672821044922, 601.749267578125, 35.1634...",10.019153,1.129459,0.734094,-2.090972,10.019153,1.129459,-0.001928,-8.891622,-3.104038,-5.929103,-0.001928,-5.92326,"[-4.41256046295166, -5.1106367111206055, -9.10...","[-4.500693058404397, -0.7133069470172791, 5.53...","[-4.41256046295166, -0.7046337127685547, 5.566..."
1,P0CJ72,T13I,64.43125,61.304924,31.75,0.114342,"[0.08796918, 0.03081914, -0.044761796, 0.01308...","[-5.798191, -7.272913, -6.322961, -5.6527224, ...","[1.0994258, -0.6997881, -10.329717, -0.7453602...","[-18.75544, 4.724907, -9.10808, -33.123413, -3...","[0.004412666, 0.0069596395, -0.0060222056, 0.0...",10.042964,18.048836,9.345033,20.223888,0.25028,"[0.8779546022415161, -5.235870281855266, -5.63...","[0.04259219509306907, 0.052577671876254804, 0....","[1.9545368279019992, 28.045276482899983, -2.61...","[129.9250030517578, 3.802384376525879, 4.86940...","[0.999992311000824, 0.4319790005683899, 0.2649...","[12.280369758605957, 692.3701171875, 22.935337...","[0.8779546022415161, -5.235870281855266, -5.63...","[0.04238582096998774, 0.05880330579505729, 0.0...","[2.3460903018712997, 26.394239125152428, -2.37...","[129.9250030517578, 3.802384376525879, 4.86940...","[0.999992311000824, 0.46187347173690796, 0.241...","[10.505098342895508, 692.3701171875, 22.935337...","[129.9250030517578, -8.64072036743164, -35.039...","[0.999992311000824, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[7.677867412567139, 692.3701171875, 22.9353370...",7.333201,2.725174,1.394253,1.605194,7.946972,11.149406,-0.022161,-4.630187,-1.908599,-1.697658,-0.01463,-0.001242,"[3.5523734092712402, -2.357158660888672, -16.4...","[1.1172368567231603, -0.863215982913971, -2.52...","[3.5523734092712402, 1.110595703125, 2.2950477..."


61174


### CCMpred

In [None]:
ccmpred_result_dir = NOTEBOOK_DIR.parent.joinpath("31_run_ccmpred", DATASET_NAME)

In [None]:
# NOTE(review): this uses the default task_count (TASK_COUNT), whereas the
# other "result"-prefixed run (msa_analysis) is loaded with TASK_COUNT_ALN;
# confirm which sharding the CCMpred run used.
present_files, missing_files = get_result_files(ccmpred_result_dir)

# Fail fast on an incomplete run, as the other result sections do: a missing
# shard would otherwise only show up as NaNs after the left-merge.
assert len(missing_files) == 0
len(present_files), len(missing_files)

In [None]:
result_ccmpred_df = read_files(present_files)

In [None]:
display(result_ccmpred_df.head(2))
print(len(result_ccmpred_df))

## Combine results

In [37]:
# True when every (protein_id, mutation) pair occurs exactly once in the
# ProteinSolver results, i.e. the table can serve as the left side of the merge.
not result_ps_df.duplicated(subset=["protein_id", "mutation"]).any()

True

In [38]:
def _mutation_keys(df):
    """Return the set of (protein_id, mutation) pairs present in ``df``."""
    # zip over the two columns builds the same tuples as a row-wise
    # apply(tuple, axis=1) without the per-row overhead.
    return set(zip(df["protein_id"], df["mutation"]))


# Left-join every tool's results onto the ProteinSolver table.
# NOTE(review): the CCMpred cells have no execution count in the saved notebook
# and the saved output of this table shows no CCMpred columns; confirm
# result_ccmpred_df is populated before relying on this merge.
result_df = (
    result_ps_df.merge(
        result_pb_df, on=["protein_id", "mutation", "effect"], how="left"
    )
    .merge(result_ra_df, on=["protein_id", "mutation", "effect"], how="left")
    .merge(result_msa_df, on=["protein_id", "mutation", "effect"], how="left")
    .merge(result_af_df, on=["protein_id", "mutation"], how="left")
    .merge(result_ccmpred_df, on=["protein_id", "mutation"], how="left")
)

# The merges must not have multiplied any row ...
assert not result_df.duplicated(
    subset=["protein_id", "mutation"]
).any(), "duplicate (protein_id, mutation) rows after merging"

# ... and every key seen in any input, including CCMpred, must have survived.
dropped_keys = (
    _mutation_keys(result_ps_df)
    | _mutation_keys(result_pb_df)
    | _mutation_keys(result_ra_df)
    | _mutation_keys(result_msa_df)
    | _mutation_keys(result_af_df)
    | _mutation_keys(result_ccmpred_df)
) - _mutation_keys(result_df)
assert not dropped_keys, f"{len(dropped_keys)} input keys are missing from result_df"

In [39]:
display(result_df.head(2))
print(len(result_df))

Unnamed: 0,protein_id,mutation,effect,proteinsolver_core_score_wt,proteinsolver_core_score_mut,proteinsolver_core_features_residue_wt,proteinsolver_core_features_protein_wt,proteinsolver_core_features_residue_mut,proteinsolver_core_features_protein_mut,protbert_core_score_wt,protbert_core_score_mut,protbert_core_features_residue_wt,protbert_core_features_protein_wt,protbert_core_features_residue_mut,protbert_core_features_protein_mut,rosetta_cart_bonded_wt,rosetta_rama_prepro_wt,rosetta_ref_wt,rosetta_hxl_tors_wt,rosetta_p_aa_pp_wt,rosetta_fa_dun_semi_wt,rosetta_fa_dun_rot_wt,rosetta_fa_dun_dev_wt,rosetta_omega_wt,rosetta_dslf_fa13_wt,rosetta_hbond_sc_wt,rosetta_hbond_bb_sc_wt,rosetta_hbond_lr_bb_wt,rosetta_hbond_sr_bb_wt,rosetta_fa_intra_elec_wt,rosetta_fa_elec_wt,rosetta_lk_ball_bridge_uncpl_wt,rosetta_lk_ball_bridge_wt,rosetta_lk_ball_iso_wt,rosetta_lk_ball_wt,rosetta_fa_intra_sol_xover4_wt,rosetta_fa_intra_rep_xover4_wt,rosetta_fa_intra_atr_xover4_wt,rosetta_fa_sol_wt,rosetta_fa_rep_wt,rosetta_fa_atr_wt,rosetta_dg_wt,rosetta_cart_bonded_change,rosetta_rama_prepro_change,rosetta_ref_change,rosetta_hxl_tors_change,rosetta_p_aa_pp_change,rosetta_fa_dun_semi_change,rosetta_fa_dun_rot_change,rosetta_fa_dun_dev_change,rosetta_omega_change,rosetta_dslf_fa13_change,rosetta_hbond_sc_change,rosetta_hbond_bb_sc_change,rosetta_hbond_lr_bb_change,rosetta_hbond_sr_bb_change,rosetta_fa_intra_elec_change,rosetta_fa_elec_change,rosetta_lk_ball_bridge_uncpl_change,rosetta_lk_ball_bridge_change,rosetta_lk_ball_iso_change,rosetta_lk_ball_change,rosetta_fa_intra_sol_xover4_change,rosetta_fa_intra_rep_xover4_change,rosetta_fa_intra_atr_xover4_change,rosetta_fa_sol_change,rosetta_fa_rep_change,rosetta_fa_atr_change,rosetta_dg_change,msa_count_wt,msa_count_mut,msa_count_total,msa_proba_wt,msa_proba_mut,msa_proba_total,msa_length,msa_proba,msa_H,msa_KL,alphafold_core_score_plddt,alphafold_core_score_protein_plddt,alphafold_core_score_protein_max_predicted_aligned_error,alphafold_core_sc
ore_protein_ptm,alphafold_core_features_experimentally_resolved,alphafold_core_features_predicted_lddt,alphafold_core_features_msa_first_row,alphafold_core_features_single,alphafold_core_features_structure_module,alphafold_core_score_predicted_aligned_error_row_mean,alphafold_core_score_predicted_aligned_error_row_max,alphafold_core_score_predicted_aligned_error_col_mean,alphafold_core_score_predicted_aligned_error_col_max,alphafold_core_score_predicted_aligned_error_diag,alphafold_core_features_distogram_row_mean,alphafold_core_features_aligned_confidence_probs_row_mean,alphafold_core_features_pair_row_mean,alphafold_core_features_distogram_row_max,alphafold_core_features_aligned_confidence_probs_row_max,alphafold_core_features_pair_row_max,alphafold_core_features_distogram_col_mean,alphafold_core_features_aligned_confidence_probs_col_mean,alphafold_core_features_pair_col_mean,alphafold_core_features_distogram_col_max,alphafold_core_features_aligned_confidence_probs_col_max,alphafold_core_features_pair_col_max,alphafold_core_features_distogram_diag,alphafold_core_features_aligned_confidence_probs_diag,alphafold_core_features_pair_diag,alphafold_core_score_msa_logits_first_wt,alphafold_core_score_msa_logits_first_mut,alphafold_core_score_msa_logits_mean_wt,alphafold_core_score_msa_logits_mean_mut,alphafold_core_score_msa_logits_max_wt,alphafold_core_score_msa_logits_max_mut,alphafold_core_score_msa_logproba_first_wt,alphafold_core_score_msa_logproba_first_mut,alphafold_core_score_msa_logproba_mean_wt,alphafold_core_score_msa_logproba_mean_mut,alphafold_core_score_msa_logproba_max_wt,alphafold_core_score_msa_logproba_max_mut,alphafold_core_features_msa_first,alphafold_core_features_msa_mean,alphafold_core_features_msa_max
0,A0A0C5B5G6,K14Q,US,0.070044,0.04016,"[-0.74697345495224, 1.316953420639038, 0.01530...","[-1.1096992492675781, 0.785698652267456, 0.256...","[0.05442849546670914, 0.7043757438659668, -0.2...","[-1.0095055103302002, 0.7263880372047424, 0.24...",0.058932,0.033909,"[-0.005357364658266306, 0.032247915863990784, ...","[0.03903713822364807, -0.016466135159134865, 0...","[-0.0028596159536391497, 0.004915683530271053,...","[0.03650001436471939, -0.025432726368308067, 0...",28.964,23.071,-2.435,0.003,2.241,17.628,15.728,14.727,26.549,0.0,0.0,0.0,0.0,-1.824,-1.16,-1.069,-0.295,-0.028,-16.627,7.977,4.731,1.432,-8.683,32.06,1.931,-42.063,102.859,1.25,-0.065,0.61,0.001,0.002,2.678,-1.646,0.083,0.149,0.0,0.0,-1.137,0.0,0.0,-0.253,-1.836,0.039,0.001,-0.808,0.576,0.172,0.066,-0.205,1.423,0.07,-1.354,-0.186,3.0,0.0,3.0,-1.7492,-3.135494,-61.32359,3.0,-1.7492,-0.0,2.849038,56.406708,62.003188,31.75,0.027069,"[0.0023722572, 0.018356942, 0.10303657, 0.1508...","[-5.2305098, -5.6203985, -4.857919, -3.7483947...","[-3.567015, -5.2736416, -9.009683, -3.05795, 5...","[24.392054, 3.2788322, 21.41848, 21.365389, 6....","[0.005355336, 0.010748506, -0.0057989154, 0.00...",8.574256,17.349895,9.382161,21.666225,0.25235,"[2.496045768260956, -4.968722492456436, -5.866...","[0.06330700959642854, 0.06389473706803983, 0.0...","[3.3381232991814613, 29.497473165392876, -2.04...","[114.23722839355469, 3.4389352798461914, 3.910...","[0.9999160766601562, 0.3767480254173279, 0.279...","[23.421672821044922, 601.749267578125, 35.1634...","[2.496045768260956, -4.968722492456436, -5.866...","[0.0632605097234773, 0.062133513205480995, 0.0...","[5.810895625501871, 36.592444146052, 0.6604679...","[114.23722839355469, 3.4389352798461914, 3.910...","[0.9999160766601562, 0.37512755393981934, 0.28...","[23.421672821044922, 601.749267578125, 35.1634...","[114.23722839355469, -3.2029566764831543, -26....","[0.9999160766601562, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[23.421672821044922, 601.749267578125, 
35.1634...",10.019153,1.129459,0.734094,-2.090972,10.019153,1.129459,-0.001928,-8.891622,-3.104038,-5.929103,-0.001928,-5.92326,"[-4.41256046295166, -5.1106367111206055, -9.10...","[-4.500693058404397, -0.7133069470172791, 5.53...","[-4.41256046295166, -0.7046337127685547, 5.566..."
1,A1L188,F55L,US,0.03052,0.088642,"[-2.3987135887145996, 0.10982956737279892, -0....","[-1.777479648590088, -0.03869394585490227, 0.0...","[-4.062615871429443, -0.34137260913848877, -0....","[-1.7952650785446167, -0.047699496150016785, 0...",0.093019,0.143134,"[0.1373966485261917, 0.06155325844883919, 0.02...","[0.05456249415874481, 0.024172015488147736, 0....","[0.14880749583244324, 0.08820129185914993, 0.0...","[0.056186407804489136, 0.03005887381732464, 0....",54.083333,24.254333,34.115,21.232667,-2.565333,39.302667,36.441667,13.28,22.048667,-3.074333,-3.703667,-0.276,0.0,-52.195,-15.823333,-122.253667,-2.345333,-0.287333,-194.744333,157.881667,16.053667,5.317,-21.941,350.360667,22.526,-417.818333,-40.130333,-0.055667,-0.192667,-0.673,0.007333,0.071333,-2.044,0.503333,0.156333,-0.074333,0.001667,1.006333,0.0,0.0,0.036667,0.147667,1.918667,0.022,-0.004333,0.424333,-0.918,-0.722,0.006,0.876667,-1.299667,0.392333,-0.329333,-0.742667,1014.0,0.0,1055.0,-0.057432,-6.980076,-121.417492,1064.0,-1.155528,0.332007,2.727859,94.872637,83.732183,31.75,0.629771,"[5.2269716, 5.4108214, 6.3251753, 5.2315235, 6...","[-8.547121, -9.857347, -8.743816, -7.98783, -7...","[7.24591, 7.257983, 18.1454, 11.383427, 5.4409...","[-4.8151565, -41.834213, 24.696276, -54.30642,...","[0.00025387853, 0.0075306967, -0.0058459025, 0...",3.380208,14.113509,6.847809,29.38232,0.250023,"[-4.792972142631943, -7.898219147244015, -7.77...","[0.0136974966045523, 0.16878037154989167, 0.20...","[-0.21200577491844022, 7.626773377930796, -2.0...","[204.79286193847656, 10.81344985961914, 5.6720...","[1.0, 0.9096415638923645, 0.4646220803260803, ...","[42.985782623291016, 762.8607788085938, 12.057...","[-4.792972142631943, -7.898219147244015, -7.77...","[0.013732861407453816, 0.1381827002696502, 0.1...","[-1.6025930771956574, 4.840899893560925, -0.51...","[204.79286193847656, 10.81344985961914, 5.6720...","[1.0, 0.9122956991195679, 0.4612761437892914, ...","[42.985782623291016, 762.8607788085938, 
11.074...","[204.79286193847656, 10.81344985961914, -24.74...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[42.985782623291016, 762.8607788085938, -38.83...",10.447226,2.202247,10.1312,1.594486,12.859581,3.317224,-0.00088,-8.245859,-0.423115,-8.959829,-6.8e-05,-3.660997,"[8.289717674255371, 6.322072982788086, 20.9344...","[2.772727955499385, 1.964366348944311, 14.6376...","[33.750972747802734, 11.12519645690918, 27.888..."


61179


## Calculate EL2 score

In [40]:
model = el2.ELASPIC2()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
Some weights of the model checkpoint at /gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2/src/elaspic2/plugins/protbert/data/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClas

In [41]:
# Inputs to the ELASPIC2 model: ProteinSolver core scores plus all ProtBert
# core columns.
proteinsolver_columnms = [c for c in result_df.columns if c.startswith("proteinsolver_core_score")]
protbert_columns = [c for c in result_df.columns if c.startswith("protbert_core_")]
el2_input_columns = proteinsolver_columnms + protbert_columns

# Rows with any missing input cannot be scored and keep a NaN el2_score.
el2_missing = result_df[el2_input_columns].isnull().any(axis=1)
el2_inputs = [
    row._asdict()
    for row in result_df.loc[~el2_missing, el2_input_columns].itertuples(index=False)
]

result_df["el2_score"] = np.nan
result_df.loc[~el2_missing, "el2_score"] = model.predict_mutation_effect(el2_inputs)

## Calculate deltas

In [42]:
# Replace every "<stem>_mut" column that has a "<stem>_wt" partner with
# "<stem>_change" = wt - mut; the "_wt" column itself is kept.
mut_suffix = "_mut"
for column in list(result_df):
    if column.endswith(mut_suffix):
        stem = column[: -len(mut_suffix)]
        column_wt = f"{stem}_wt"
        if column_wt in result_df:
            column_change = f"{stem}_change"
            result_df[column_change] = result_df[column_wt] - result_df[column]
            del result_df[column]
        else:
            # No wild-type counterpart: report the missing name and leave the
            # "_mut" column untouched.
            print(column_wt)

## Encode mutation

In [43]:
# For historic reasons, have to use analyze_msa.RESIDUE_ENCODING_WT/MUT
# The wild-type residue is the first character of the mutation string and the
# mutant residue is the last one; each is mapped through its own encoding table
# and stored as a plain list.
wt_residue = result_df["mutation"].str[0]
mut_residue = result_df["mutation"].str[-1]

result_df["aa_wt_onehot"] = wt_residue.map(analyze_msa.RESIDUE_ENCODING_WT).apply(list)
result_df["aa_mut_onehot"] = mut_residue.map(analyze_msa.RESIDUE_ENCODING_MUT).apply(list)

## Save results

In [44]:
output_file = NOTEBOOK_DIR.joinpath("combined-results.parquet")

output_file

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/workspace/elaspic2-cagi6/notebooks/37_humsavar_combine_results/combined-results.parquet')

In [45]:
# Persist the combined table without the pandas index, in row groups of 10,000 rows.
result_table = pa.Table.from_pandas(result_df, preserve_index=False)
pq.write_table(result_table, output_file, row_group_size=10_000)

## Exploratory data analysis

In [46]:
# Group the combined table's columns by the tool that produced them
# (matched on the column-name prefix).
proteinsolver_columnms, protbert_columns, rosetta_columns, alphafold_columns = (
    [c for c in result_df.columns if c.startswith(prefix)]
    for prefix in ("proteinsolver_", "protbert_", "rosetta_", "alphafold_")
)

In [47]:
result_df["effect"].value_counts()

LB/B    33125
LP/P    21789
US       6265
Name: effect, dtype: int64

In [48]:
# Numeric label per effect class: -1 for "LB/B", 0 for "US", +1 for "LP/P".
# Any effect value not listed here maps to NaN.
effect_map = {"LB/B": -1, "US": 0, "LP/P": 1}

effect_score = result_df["effect"].map(effect_map)
result_df["effect_score"] = effect_score

  result_df["effect_score"] = result_df["effect"].map(effect_map)


In [49]:
score_columns = [
    "el2_score",
    "alphafold_core_score_msa_logproba_mean_change",
    "protbert_core_score_change",
    "proteinsolver_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
]

# Compare all scores on the same rows: every score present and a non-zero
# label (rows with effect_score == 0, i.e. "US", are excluded).
complete_rows = result_df.dropna(subset=[*score_columns, "effect_score"])
df = complete_rows.loc[complete_rows["effect_score"].ne(0)].reset_index(drop=True)

labels = df["effect_score"]
is_positive = labels > 0
for col in score_columns:
    # Spearman correlation with the signed label and ROC AUC for label > 0.
    corr = stats.spearmanr(labels, df[col])
    auc = metrics.roc_auc_score(is_positive, df[col])
    print(col, corr[0], auc)

el2_score 0.5814940493221467 0.8431183144786428
alphafold_core_score_msa_logproba_mean_change 0.5792880576832575 0.841816639708643
protbert_core_score_change 0.46884514239330644 0.77664832545919
proteinsolver_core_score_change 0.3529838117888982 0.7082828030320032
msa_KL 0.3764762573459568 0.722144833634723
rosetta_dg_change 0.32965074966152685 0.6945148178470095


In [50]:
score_columns = [
    "el2_score",
    "alphafold_core_score_msa_logproba_mean_change",
    "protbert_core_score_change",
    "proteinsolver_core_score_change",
    "msa_KL",
    "rosetta_dg_change",
]

# Number of rows in the combined table with a missing value, per score.
for column, num_missing in result_df[score_columns].isnull().sum().items():
    print(f"{column} {num_missing}")

el2_score 0
alphafold_core_score_msa_logproba_mean_change 5
protbert_core_score_change 0
proteinsolver_core_score_change 0
msa_KL 5
rosetta_dg_change 0
