## Calculate significance for site-level escape differences between the non-stabilized nanoparticle and other antigen groups 

In [1]:
import pandas as pd
from scipy.stats import mannwhitneyu

In [2]:
# Load the summary data.
summary_csv = "../results/summaries/Stabilized_HA_sera_escape.csv"
df = pd.read_csv(summary_csv)

# Drop rows with poor cell entry: only rows whose "entry in 293T cells"
# value is at least -3 are kept (rows with a missing value are dropped too,
# because the comparison is False for NaN).
has_adequate_entry = df["entry in 293T cells"] >= -3
df = df[has_adequate_entry].copy()

In [3]:
# Column names used to compare the antigen groups.

# Group-mean column of the non-stabilized nanoparticle group.
rc_mean_col = "mean H5-RC_I_1"

# Group-mean columns of every other antigen group.
non_rc_mean_cols = (
    ["mean H5-FMLMI-RC_I_1"]
    + ["mean H5-membrane-anchored", "mean H5-FMLMI-membrane-anchored"]
    + ["mean H5-foldon", "mean H5-FMLMI-foldon"]
)

# Replicate columns fed to the Mann-Whitney test.
# Non-stabilized nanoparticle replicates.
# NOTE(review): only mouse-2 to mouse-4 are listed here (no mouse-1) -- confirm
# this is intentional.
rc_group_cols = ["H5-RC_I_1-mouse-2", "H5-RC_I_1-mouse-3", "H5-RC_I_1-mouse-4"]

# Replicates of all other groups, concatenated group by group. The two foldon
# groups list two replicates each; the remaining groups list three.
non_rc_group_cols = (
    # H5-FMLMI-RC_I_1
    ["H5-FMLMI-RC_I_1-mouse-1", "H5-FMLMI-RC_I_1-mouse-2", "H5-FMLMI-RC_I_1-mouse-3"]
    # H5-membrane-anchored
    + [
        "H5-membrane-anchored-mouse-1",
        "H5-membrane-anchored-mouse-2",
        "H5-membrane-anchored-mouse-3",
    ]
    # H5-FMLMI-membrane-anchored
    + [
        "H5-FMLMI-membrane-anchored-mouse-1",
        "H5-FMLMI-membrane-anchored-mouse-2",
        "H5-FMLMI-membrane-anchored-mouse-3",
    ]
    # H5-foldon
    + ["H5-foldon-mouse-1", "H5-foldon-mouse-2"]
    # H5-FMLMI-foldon
    + ["H5-FMLMI-foldon-mouse-1", "H5-FMLMI-foldon-mouse-2"]
)


In [4]:
# Find the sites with the greatest site-level escape difference between the
# non-stabilized nanoparticle group and the other antigen groups.

# Number of top-ranked sites to keep.
n_top_sites = 5

# Coerce every group-mean column to numeric, treat unparsable/missing values
# as 0, and floor negative values at 0 so only positive escape is summed.
# Note: this overwrites the mean columns of `df` in place.
mean_cols = [rc_mean_col] + non_rc_mean_cols
for col in mean_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).clip(lower=0)

# Sum the clipped means over all rows belonging to each site.
site_sums = df.groupby("site")[mean_cols].sum()

# Per-group difference: non-stabilized nanoparticle minus the other group.
for col in non_rc_mean_cols:
    site_sums[f"diff_{col}"] = site_sums[rc_mean_col] - site_sums[col]

# Despite its name, "summed_diff" holds the MEAN of the per-group differences.
diff_cols = [f"diff_{col}" for col in non_rc_mean_cols]
site_sums["summed_diff"] = site_sums[diff_cols].mean(axis=1)

# Select the top `n_top_sites` sites, largest mean difference first.
top_sites = site_sums.nlargest(n_top_sites, "summed_diff").index.tolist()

In [5]:
# Show the selected sites, ordered from largest to smallest "summed_diff".
top_sites

['90', '57', '125', '78', '276']

In [6]:
# Perform a two-sided Mann-Whitney U test at each top site, comparing the
# replicate values of the non-stabilized nanoparticle group with the replicate
# values of all other groups.
# NOTE(review): values are pooled across every row and replicate column of a
# site, and the p-values are not adjusted for testing several sites -- confirm
# this is the intended analysis.
results = []
for site in top_sites:
    site_df = df[df["site"] == site]

    # Flatten each group's replicate columns into a single sample and drop
    # missing values.
    rc_values = site_df[rc_group_cols].values.flatten()
    rc_values = rc_values[~pd.isna(rc_values)]
    non_rc_values = site_df[non_rc_group_cols].values.flatten()
    non_rc_values = non_rc_values[~pd.isna(non_rc_values)]

    if len(rc_values) > 0 and len(non_rc_values) > 0:
        stat, pval = mannwhitneyu(rc_values, non_rc_values, alternative="two-sided")
    else:
        # The test is undefined when a group has no observations; record NaN
        # for this site instead of letting it abort the whole loop.
        stat, pval = float("nan"), float("nan")

    results.append({
        "site": site,
        # Site-level sum of the clipped group-mean column, taken from site_sums.
        "rc_mean": site_sums.loc[site, rc_mean_col],
        "summed_diff": site_sums.loc[site, "summed_diff"],
        "n_RC": len(rc_values),
        "n_nonRC": len(non_rc_values),
        "mannwhitney_U": stat,
        "p_value": pval
    })

# Collect the per-site results and write them next to the notebook.
results_df = pd.DataFrame(results)
results_df.to_csv("stabilized_HA_sera_escape_mannwhitney.csv", index=False)

print(results_df)

  site   rc_mean  summed_diff  n_RC  n_nonRC  mannwhitney_U   p_value
0   90  3.169020     2.076179    48      205         6976.5  0.000007
1   57  2.080427     1.437135    33      135         2797.5  0.022930
2  125  2.283600     1.420894    33      147         3485.5  0.000089
3   78  1.724220     1.009511    27      121         2098.5  0.021015
4  276  2.732920     0.999712    45      198         5353.0  0.034954
