## Imports

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Evaluate submissions

### `40_cagi6_sherloc_submission`

In [None]:
# Compare the per-model Sherloc submission files against each other.
folder = "40_cagi6_sherloc_submission"
submission_dir = Path(folder) / "submission"

# Read the submission files in sorted filename order and print, for each one,
# how many rows carry the "No prediction (dummy score)" comment.
dfs = []
for file in sorted(submission_dir.glob("strokach_modelnumber_*.tsv")):
    df = pd.read_csv(file, sep="\t")
    dfs.append(df)
    is_dummy = df["comment"] == "No prediction (dummy score)"
    num_missing = is_dummy.sum()
    print(file.name, num_missing)
display(dfs[0].head(2))

# Join all files on the variant identifier. The first file keeps its column
# names; clashing columns of every later file get that file's 0-based position
# in `dfs` appended (e.g. "score1", "score2", ...).
for i, df in enumerate(dfs):
    combined_df = df if i == 0 else combined_df.merge(df, on=["hgvs"], suffixes=("", f"{i}"))

# NOTE(review): this is a prefix match, so every merged column whose name
# starts with "score" is selected — confirm that no unrelated column matches.
corr_columns = [name for name in combined_df.columns if name.startswith("score")]
selected_df = combined_df[corr_columns]

# Spearman correlation between the selected columns.
corrs = selected_df.corr(method="spearman")
display(corrs)

g = sns.pairplot(selected_df)

### `40_cagi6_hmbs_submission`

In [None]:
# Compare the per-model HMBS submission files against each other.
folder = "40_cagi6_hmbs_submission"
submission_dir = Path(folder) / "submission"

# Read the submission files in sorted filename order.
dfs = []
for file in sorted(submission_dir.glob("strokach_modelnumber_*.tsv")):
    dfs.append(pd.read_csv(file, sep="\t"))
display(dfs[0].head(2))

# Join all files on the substitution identifier. The first file keeps its
# column names; clashing columns of every later file get that file's 0-based
# position in `dfs` appended (e.g. "score1", "score2", ...).
for i, df in enumerate(dfs):
    combined_df = (
        df
        if i == 0
        else combined_df.merge(df, on=["aa_substitution"], suffixes=("", f"{i}"))
    )

# NOTE(review): this is a prefix match, so every merged column whose name
# starts with "score" is selected — confirm that no unrelated column matches.
corr_columns = [name for name in combined_df.columns if name.startswith("score")]
selected_df = combined_df[corr_columns]

# Spearman correlation between the selected columns.
corrs = selected_df.corr(method="spearman")
display(corrs)

g = sns.pairplot(selected_df)

### `40_cagi6_cam_submission`

In [None]:
# Compare the per-model CaM submission files against each other.
folder = "40_cagi6_cam_submission"
submission_dir = Path(folder) / "submission"

# Read the submission files in sorted filename order.
dfs = []
for file in sorted(submission_dir.glob("strokach_modelnumber_*.tsv")):
    dfs.append(pd.read_csv(file, sep="\t"))
display(dfs[0].head(2))

# Join all files on the variant identifier. The first file keeps its column
# names; clashing columns of every later file get that file's 0-based position
# in `dfs` appended (e.g. "Tm1", "Tm2", ...).
for i, df in enumerate(dfs):
    combined_df = (
        df
        if i == 0
        else combined_df.merge(df, on=["CaM-variant"], suffixes=("", f"{i}"))
    )

# NOTE(review): this is a prefix match, so every merged column whose name
# starts with "Tm" is selected — confirm that no unrelated column matches.
corr_columns = [name for name in combined_df.columns if name.startswith("Tm")]
selected_df = combined_df[corr_columns]

# Spearman correlation between the selected columns.
corrs = selected_df.corr(method="spearman")
display(corrs)

g = sns.pairplot(selected_df)

### `40_cagi6_mapk1_submission`

In [None]:
# Compare the per-model MAPK1 submission files against each other.
folder = "40_cagi6_mapk1_submission"
submission_dir = Path(folder) / "submission"

# Read the submission files in sorted filename order.
dfs = []
for file in sorted(submission_dir.glob("strokach_modelnumber_*.tsv")):
    dfs.append(pd.read_csv(file, sep="\t"))
display(dfs[0].head(2))

# Join all files on the variant identifier. The first file keeps its column
# names; clashing columns of every later file get that file's 0-based position
# in `dfs` appended (e.g. "DDG-NotPO41", "DDG-NotPO42", ...).
for i, df in enumerate(dfs):
    combined_df = (
        df
        if i == 0
        else combined_df.merge(df, on=["MAPK1-variant"], suffixes=("", f"{i}"))
    )

# NOTE(review): this is a prefix match, so every merged column whose name
# starts with "DDG-NotPO4" is selected — confirm that no unrelated column
# matches.
corr_columns = [name for name in combined_df.columns if name.startswith("DDG-NotPO4")]
selected_df = combined_df[corr_columns]

# Spearman correlation between the selected columns.
corrs = selected_df.corr(method="spearman")
display(corrs)

g = sns.pairplot(selected_df)

### `40_cagi6_mapk3_submission`

In [None]:
# Compare the per-model MAPK3 submission files against each other.
folder = "40_cagi6_mapk3_submission"
submission_dir = Path(folder) / "submission"

# Read the submission files in sorted filename order.
dfs = []
for file in sorted(submission_dir.glob("strokach_modelnumber_*.tsv")):
    dfs.append(pd.read_csv(file, sep="\t"))
display(dfs[0].head(2))

# Join all files on the variant identifier. The first file keeps its column
# names; clashing columns of every later file get that file's 0-based position
# in `dfs` appended (e.g. "DDG-NotPO41", "DDG-NotPO42", ...).
for i, df in enumerate(dfs):
    combined_df = (
        df
        if i == 0
        else combined_df.merge(df, on=["MAPK3-variant"], suffixes=("", f"{i}"))
    )

# NOTE(review): this is a prefix match, so every merged column whose name
# starts with "DDG-NotPO4" is selected — confirm that no unrelated column
# matches.
corr_columns = [name for name in combined_df.columns if name.startswith("DDG-NotPO4")]
selected_df = combined_df[corr_columns]

# Spearman correlation between the selected columns.
corrs = selected_df.corr(method="spearman")
display(corrs)

g = sns.pairplot(selected_df)