## Summary

---

## Imports

In [None]:
from pathlib import Path

import elaspic2 as el2
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import stats
from sklearn import metrics
from tqdm.auto import tqdm

In [None]:
# Show wide / long frames in full when they are displayed in the notebook.
# The fully-qualified option names are required: the bare "max_columns" /
# "max_rows" patterns also match the "styler.render.*" options in newer pandas,
# which makes `set_option` raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

## Parameters

In [None]:
# Directory belonging to this notebook, resolved to an absolute path from the
# current working directory and created on first run. The submission file is
# written here; inputs are read from sibling directories of its parent.
NOTEBOOK_DIR = Path("39_cagi6_sherloc_submission").resolve()
NOTEBOOK_DIR.mkdir(exist_ok=True)

NOTEBOOK_DIR

## Load data

In [None]:
# Tab-separated submission template stored in the sibling "30_cagi6_sherloc" directory.
submission_template_file = (
    NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "submission_template.tsv"
)
submission_template_df = pd.read_csv(submission_template_file, sep="\t")

# Preview the first two rows; the row count is the cell output.
display(submission_template_df.head(2))
len(submission_template_df)

### `training_df`

In [None]:
# Location of the tab-separated training set.
training_file = (
    NOTEBOOK_DIR.parent
    / "30_cagi6_sherloc"
    / "CAGI6-Sherloc-clinical-classification"
    / "final_train_070821.txt"
)

In [None]:
# Load every training variant (all subcategories) from the tab-separated file.
training_all_df = pd.read_csv(training_file, sep="\t")

# Preview the first two rows; the row count is the cell output.
display(training_all_df.head(2))
len(training_all_df)

In [None]:
# Sanity check: no variant of the submission template appears in the training set.
assert set(submission_template_df["hgvs"]).isdisjoint(training_all_df["hgvs"])

In [None]:
# Keep only the rows flagged in the "Subcategory Missense" column
# (the column is used directly as a boolean row mask).
training_df = training_all_df[training_all_df["Subcategory Missense"]]

display(training_df.head(2))
len(training_df)

### `testing_df`

In [None]:
# Location of the tab-separated test set.
testing_file = (
    NOTEBOOK_DIR.parent
    / "30_cagi6_sherloc"
    / "CAGI6-Sherloc-clinical-classification"
    / "final_test_070821.txt"
)

In [None]:
# Load every test variant (all subcategories) from the tab-separated file.
testing_all_df = pd.read_csv(testing_file, sep="\t")

# Preview the first two rows; the row count is the cell output.
display(testing_all_df.head(2))
len(testing_all_df)

In [None]:
# Sanity check: the submission template and the test set cover exactly the same variants.
assert set(submission_template_df["hgvs"]) == set(testing_all_df["hgvs"])

In [None]:
# Keep only the rows flagged in the "Subcategory Missense" column
# (the column is used directly as a boolean row mask).
testing_df = testing_all_df[testing_all_df["Subcategory Missense"]]

display(testing_df.head(2))
len(testing_df)

### `validation_df`

In [None]:
# Location of the tab-separated validation variants.
validation_file = NOTEBOOK_DIR.parent / "30_cagi6_sherloc" / "validation_variants.tsv"

In [None]:
# Load the validation variants and expose the "HGVS.c" column as `hgvs_g`.
# NOTE(review): despite the source column name, the values are later parsed and
# mapped with `g_to_c`, i.e. treated as genomic (g.) HGVS strings — confirm.
validation_all_df = pd.read_csv(validation_file, sep="\t").rename(
    columns={"HGVS.c": "hgvs_g"}
)

# Preview the last two rows; the row count is the cell output.
display(validation_all_df.tail(2))
len(validation_all_df)

In [None]:
def map_g_to_c(str_g, mapper):
    """Yield the coding (c.) variants that a genomic HGVS string maps to.

    The string is parsed with the module-level parser `hp`, which must exist
    before the generator is first advanced. Every transcript reported by
    `mapper.relevant_transcripts` is tried; transcripts for which `g_to_c`
    raises `HGVSUsageError` or `HGVSInvalidIntervalError` are skipped.

    Args:
        str_g: Genomic HGVS variant string.
        mapper: Object providing `relevant_transcripts` and `g_to_c`.

    Yields:
        One mapped variant per transcript that could be mapped.
    """
    from hgvs.exceptions import HGVSInvalidIntervalError, HGVSUsageError

    skippable_errors = (HGVSUsageError, HGVSInvalidIntervalError)

    genomic_variant = hp.parse_hgvs_variant(str_g)
    for transcript_ac in mapper.relevant_transcripts(genomic_variant):
        try:
            coding_variant = mapper.g_to_c(genomic_variant, transcript_ac)
        except skippable_errors:
            # This transcript cannot represent the variant; try the next one.
            continue
        else:
            yield coding_variant

In [None]:
# The genomic -> coding HGVS mapping is cached on disk: reuse the parquet file when
# it exists, otherwise compute the mapping through hgvs / UTA and write the cache.
validation_mapping_file = NOTEBOOK_DIR.parent.joinpath(
    "30_cagi6_sherloc", "validation-mapping-grch37.parquet"
)

if validation_mapping_file.is_file():
    validation_mapping_df = pq.read_table(validation_mapping_file).to_pandas()
else:
    import hgvs.assemblymapper
    import hgvs.dataproviders.uta
    import hgvs.parser

    # `hp` is read as a module-level name inside `map_g_to_c`.
    hp = hgvs.parser.Parser()
    hdp = hgvs.dataproviders.uta.connect()
    mapper = hgvs.assemblymapper.AssemblyMapper(
        hdp, assembly_name="GRCh37", normalize=False
    )

    results = []
    for tup in tqdm(validation_all_df.itertuples(), total=len(validation_all_df)):
        for var_c in map_g_to_c(tup.hgvs_g, mapper):
            # Key every mapping by the `hgvs_g` string that was just parsed, so the
            # result joins back onto `validation_all_df` (previously `tup.str_g`,
            # which does not match the column parsed on the line above).
            results.append((tup.hgvs_g, str(var_c)))
    validation_mapping_df = pd.DataFrame(results, columns=["hgvs_g", "hgvs"])
    pq.write_table(
        pa.Table.from_pandas(validation_mapping_df, preserve_index=False),
        validation_mapping_file,
    )

In [None]:
# Preview the mapping (one row per `hgvs_g` / mapped `hgvs` pair) and its size.
display(validation_mapping_df.head(2))
print(len(validation_mapping_df))

In [None]:
# Left join: every validation row is kept. Rows whose `hgvs_g` has several mappings
# are repeated once per mapping; rows without a mapping get a null `hgvs`.
validation_df = validation_all_df.merge(
    validation_mapping_df, on=["hgvs_g"], how="left"
)

In [None]:
# Preview the joined frame, then compare its row count with the number of
# distinct `hgvs_g` values (the left join may repeat a variant).
display(validation_df.tail(2))
print(len(validation_df))
print(len(validation_df["hgvs_g"].unique()))

## Load results

In [None]:
# Name of the per-tool result sub-directory to read.
DATASET_NAME = "cagi6-sherloc"
# Number of result shards expected per tool; `get_result_files` looks for
# "shard-{i}-of-{TASK_COUNT}.parquet" with i running from 1 to TASK_COUNT.
TASK_COUNT = 4182

DATASET_NAME, TASK_COUNT

In [None]:
def get_result_files(result_dir):
    """Split the expected shard files of `result_dir` into present and missing.

    The expected files are "shard-{i}-of-{TASK_COUNT}.parquet" for i from 1 to
    the module-level `TASK_COUNT` (inclusive).

    Args:
        result_dir: Directory (path object) holding the shard files.

    Returns:
        Tuple `(present_files, missing_files)` of path lists, in shard order.
    """
    found, absent = [], []
    for shard_idx in tqdm(range(1, TASK_COUNT + 1)):
        shard_path = result_dir.joinpath(f"shard-{shard_idx}-of-{TASK_COUNT}.parquet")
        bucket = found if shard_path.is_file() else absent
        bucket.append(shard_path)
    return found, absent

In [None]:
def read_files(files):
    """Read parquet files and stack them into a single DataFrame.

    Each file is converted with `integer_object_nulls=True`; the frames are
    concatenated in input order with a fresh integer index.

    Args:
        files: Iterable of parquet file paths.

    Returns:
        The concatenated DataFrame.
    """
    frames = [
        pq.read_table(path).to_pandas(integer_object_nulls=True)
        for path in tqdm(files)
    ]
    return pd.concat(frames, ignore_index=True)

### ProteinSolver

In [None]:
# Directory holding the ProteinSolver result shards for this dataset.
ps_result_dir = NOTEBOOK_DIR.parent / "31_run_proteinsolver" / DATASET_NAME

In [None]:
# Find which ProteinSolver shards exist; the cell output is (present, missing) counts.
present_files, missing_files = get_result_files(ps_result_dir)

len(present_files), len(missing_files)

In [None]:
# Load all ProteinSolver shards that are present into one frame.
result_ps_df = read_files(present_files)

In [None]:
# Preview the ProteinSolver results and print the row count.
display(result_ps_df.head(2))
print(len(result_ps_df))

### ProtBert

In [None]:
# Directory holding the ProtBert result shards for this dataset.
pb_result_dir = NOTEBOOK_DIR.parent / "31_run_protbert" / DATASET_NAME

In [None]:
# Find which ProtBert shards exist; the cell output is (present, missing) counts.
present_files, missing_files = get_result_files(pb_result_dir)

len(present_files), len(missing_files)

In [None]:
# Load all ProtBert shards that are present into one frame.
result_pb_df = read_files(present_files)

In [None]:
# Preview the ProtBert results and print the row count.
display(result_pb_df.head(2))
print(len(result_pb_df))

### AlphaFold

In [None]:
# Directory holding the AlphaFold result shards for this dataset.
af_result_dir = NOTEBOOK_DIR.parent / "31_run_alphafold" / DATASET_NAME

In [None]:
# Find which AlphaFold shards exist; the cell output is (present, missing) counts.
present_files, missing_files = get_result_files(af_result_dir)

len(present_files), len(missing_files)

In [None]:
# Load all AlphaFold shards that are present into one frame.
result_af_df = read_files(present_files)

In [None]:
# Preview the AlphaFold results and print the row count.
display(result_af_df.head(2))
print(len(result_af_df))

### Rosetta

In [None]:
# Directory holding the Rosetta ddG result shards for this dataset.
ra_result_dir = NOTEBOOK_DIR.parent / "31_run_rosetta_ddg" / DATASET_NAME

In [None]:
# Find which Rosetta shards exist; the cell output is (present, missing) counts.
present_files, missing_files = get_result_files(ra_result_dir)

len(present_files), len(missing_files)

In [None]:
# Load all Rosetta shards that are present into one frame.
result_ra_df = read_files(present_files)

In [None]:
# Preview the Rosetta results and print the row count.
display(result_ra_df.head(2))
print(len(result_ra_df))

## Combine results

In [None]:
# True when every ProteinSolver row has a distinct `mutation_id`.
result_ps_df["mutation_id"].is_unique

In [None]:
# Left-join the ProtBert, Rosetta and AlphaFold results (in that order) onto the
# ProteinSolver rows, matching on the columns that identify a mutation.
merge_keys = ["protein_id", "mutation", "mutation_id", "effect"]

result_df = result_ps_df
for other_df in (result_pb_df, result_ra_df, result_af_df):
    result_df = result_df.merge(other_df, on=merge_keys, how="left")

# True when the joins did not duplicate any mutation.
len(result_df) == len(result_df["mutation_id"].unique())

In [None]:
# Preview the combined results and print the row count.
display(result_df.head(2))
print(len(result_df))

## Calculate EL2 score

In [None]:
# ELASPIC2 model instance; its `predict_mutation_effect` produces `el2_score` below.
model = el2.ELASPIC2()

In [None]:
# Model inputs: the ProteinSolver core-score columns plus all ProtBert core columns.
proteinsolver_columnms = [
    name for name in result_df if name.startswith("proteinsolver_core_score")
]
protbert_columns = [name for name in result_df if name.startswith("protbert_core_")]
el2_feature_columns = proteinsolver_columnms + protbert_columns

# Rows with any missing input cannot be scored and keep a NaN `el2_score`.
el2_missing = result_df[el2_feature_columns].isnull().any(axis=1)
result_df["el2_score"] = np.nan

# One dict of feature values per scorable row, in row order.
el2_inputs = [
    row._asdict()
    for row in result_df.loc[~el2_missing, el2_feature_columns].itertuples(
        index=False
    )
]
result_df.loc[~el2_missing, "el2_score"] = model.predict_mutation_effect(el2_inputs)

## Calculate deltas

In [None]:
# Replace every "<name>_mut" column that has a "<name>_wt" partner with
# "<name>_change" = wt - mut. The "_mut" column is dropped afterwards, so
# re-running this cell is a no-op. Names of missing "_wt" partners are printed.
for column_mut in [c for c in list(result_df) if c.endswith("_mut")]:
    prefix = column_mut.removesuffix("_mut")
    column_wt = f"{prefix}_wt"

    if column_wt in result_df:
        result_df[f"{prefix}_change"] = result_df[column_wt] - result_df[column_mut]
        del result_df[column_mut]
    else:
        print(column_wt)

## Correlations

In [None]:
# Group the result columns by the tool prefix of their name. This rebinds the two
# lists built for the EL2 model with the broader, tool-wide prefixes.
proteinsolver_columnms = [c for c in result_df if c.startswith("proteinsolver_")]
protbert_columns = [c for c in result_df if c.startswith("protbert_")]
rosetta_columns = [c for c in result_df if c.startswith("rosetta_")]
alphafold_columns = [c for c in result_df if c.startswith("alphafold_")]

In [None]:
# Distribution of the `effect` labels.
result_df["effect"].value_counts()

In [None]:
# Numeric encoding of the `effect` labels: benign classes -> -1, uncertain -> 0,
# pathogenic classes -> +1. Labels absent from the dict map to NaN.
effect_map = {
    "Uncertain significance": 0,
    "Likely benign": -1,
    "Benign": -1,
    "Likely pathogenic": 1,
    "Pathogenic": 1,
}

result_df["effect_score"] = result_df["effect"].map(effect_map)

In [None]:
# Scalar scores to evaluate against the encoded effect.
# NOTE(review): the "proten" spelling in the last name presumably mirrors the
# upstream column name — confirm before "fixing" it.
score_columns = [
    "el2_score",
    "proteinsolver_core_score_change",
    "protbert_core_score_change",
    "rosetta_dg_change",
    "alphafold_core_scores_residue_plddt_wt",
    "alphafold_core_scores_protein_plddt_wt",
    "alphafold_core_scores_protein_max_predicted_aligned_error_wt",
    "alphafold_core_scores_proten_ptm_wt",
]

# Evaluate on rows where every score and the effect label are available.
# `df` (and the loop variable `col`) are reused by the cells below.
df = result_df.dropna(subset=score_columns + ["effect_score"])

for col in score_columns:
    # Spearman correlation with the three-level effect score, and ROC AUC for
    # separating effect_score > 0 from the rest (0 and -1 form the negative class).
    corr = stats.spearmanr(df["effect_score"], df[col])
    auc = metrics.roc_auc_score(df["effect_score"] > 0, df[col])
    print(col, corr[0], auc)

In [None]:
# Index of the component to inspect inside the array-valued column `col`.
i = 6

# NOTE(review): `col` is whatever an earlier cell assigned last; this cell needs it
# to name an array-valued "*_wt" column — confirm, or set it explicitly here.
x1 = df[col].str[i]
# The "*_mut" columns were deleted when the deltas were calculated, so the
# wt - mut difference is read from the matching "*_change" column instead
# (component i of wt - mut equals wt[i] - mut[i]).
x2 = df[col.removesuffix("_wt") + "_change"].str[i]

In [None]:
import matplotlib.pyplot as plt

# Overlay the distributions of `x1` and `x2` on the same axes.
plt.hist(x1, bins=100)
plt.hist(x2, bins=100)

# Trailing `None` suppresses the repr of the last `plt.hist` call.
None

In [None]:
# Collapse each row's array in this column to its mean, then evaluate that
# scalar the same way as the scalar scores (Spearman correlation and ROC AUC).
x = df["alphafold_core_features_residue_experimentally_resolved_wt"].apply(np.mean)

corr = stats.spearmanr(df["effect_score"], x)
auc = metrics.roc_auc_score(df["effect_score"] > 0, x)

corr, auc

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

col = "alphafold_core_features_residue_msa_first_row_change"

# Stack the per-row arrays into a 2-D matrix, standardize every feature, and keep
# the principal components needed to explain 90% of the variance.
x = np.vstack(df[col].values)
x = StandardScaler().fit_transform(x)
x = PCA(0.9, svd_solver="full").fit_transform(x)

# Evaluate every retained component against the effect labels.
evals = []
for i in tqdm(range(x.shape[1])):
    x1 = x[:, i]

    corr = stats.spearmanr(df["effect_score"], x1)
    auc = metrics.roc_auc_score(df["effect_score"] > 0, x1)
    evals.append((i, *corr, auc))

# Rank the components by absolute Spearman correlation (helper column is dropped).
evals_df = pd.DataFrame(evals, columns=["i", "corr", "pvalue", "auc"])
evals_df["corr_abs"] = evals_df["corr"].abs()
evals_df = evals_df.sort_values("corr_abs", ascending=False)
del evals_df["corr_abs"]

display(evals_df.head(10))

In [None]:
# (rows, retained components) of the most recently computed matrix `x`.
x.shape

In [None]:
# Candidate array-valued columns; uncomment exactly one assignment to evaluate it.
# NOTE(review): the trailing annotations look like a previously observed score and
# the per-row feature count — confirm.
# col = "alphafold_core_features_residue_experimentally_resolved_wt"  # 0.19 [37]
# col = "alphafold_core_features_residue_experimentally_resolved_change"  # 0.11 [37]
# col = "alphafold_core_features_residue_predicted_lddt_wt"  # 0.17 [50]
# col = "alphafold_core_features_residue_predicted_lddt_change"  # 0.04 [50]
# col = "alphafold_core_features_residue_msa_first_row_wt"  # 0.17 [256]
# col = "alphafold_core_features_residue_msa_first_row_change"  # 0.21 [256]
# col = "alphafold_core_features_residue_single_wt"  # 0.20 [384]
# col = "alphafold_core_features_residue_single_change"  # 0.15 [384]
# col = "alphafold_core_features_residue_structure_module_wt"  # 0.18 [384]
col = "alphafold_core_features_residue_structure_module_change"  # 0.05 [384]

# col = "alphafold_core_features_protein_experimentally_resolved_wt"  #
# col = "alphafold_core_features_protein_predicted_lddt_wt"  #
# col = "alphafold_core_features_protein_msa_first_row_wt"  #
# col = "alphafold_core_features_protein_single_wt"  #
# col = "alphafold_core_features_protein_structure_module_wt"  #

# Take the feature count from the first row of `df`, the frame the loop below
# actually reads. Row 0 of the unfiltered `result_df` can be null after the left
# merges, in which case `len(...)` would raise.
num_features = len(df[col].iloc[0])

# Evaluate every component of the array-valued column against the effect labels.
evals = []
for i in tqdm(range(num_features)):
    x1 = df[col].str[i]

    corr = stats.spearmanr(df["effect_score"], x1)
    auc = metrics.roc_auc_score(df["effect_score"] > 0, x1)
    evals.append((i, *corr, auc))

# Rank the components by absolute Spearman correlation (helper column is dropped).
evals_df = pd.DataFrame(evals, columns=["i", "corr", "pvalue", "auc"])
evals_df["corr_abs"] = evals_df["corr"].abs()
evals_df = evals_df.sort_values("corr_abs", ascending=False)
del evals_df["corr_abs"]

display(evals_df.head(10))

In [None]:
# The bare name `scores_proten_ptm` is not defined anywhere and raised NameError on
# a fresh kernel; list the result columns whose name contains it instead.
# NOTE(review): presumably this cell was a column-name lookup — confirm or delete it.
[c for c in result_df if "scores_proten_ptm" in c]

In [None]:
# Preview the validation variants joined with their mapped `hgvs` values.
validation_df.head()

In [None]:
# Flag the rows that have a mapped `hgvs`, then attach the per-mutation results by
# matching `hgvs` against `mutation_id` (left join keeps all validation rows).
validation_submission_df = pd.merge(
    validation_df.assign(is_missense=lambda frame: frame["hgvs"].notna()),
    result_df,
    left_on=["hgvs"],
    right_on=["mutation_id"],
    how="left",
)

# The join must not duplicate validation rows.
assert len(validation_submission_df) == len(validation_df)

In [None]:
# One row per genomic variant: the mean `el2_score` over its mapped rows (NaNs are
# skipped) and whether any of them was flagged `is_missense`. Variants without any
# score get 0.0.
# "mean" replaces `np.nanmean`: pandas already dispatched that callable to its own
# NaN-skipping group mean, and passing the NumPy function is deprecated.
out = (
    validation_submission_df.groupby("hgvs_g")
    .agg({"el2_score": "mean", "is_missense": "max"})
    .fillna(0.0)
    .reset_index()
)

out.head()

In [None]:
# Write the per-variant scores as a tab-separated file (no index column) into the
# notebook directory.
out.loc[:, ["hgvs_g", "el2_score"]].to_csv(
    NOTEBOOK_DIR / "submission-valid.tsv", sep="\t", index=False
)

In [None]:
# Preview the validation rows joined with the per-mutation results.
validation_submission_df.head()

In [None]:
# Column names available in the evaluation frame `df`.
df.columns