# Strain stability validation

In [1]:
import altair as alt

import numpy

import pandas as pd

In [2]:
# Load the validation measurements (read from the notebook's working directory).
validation_data = pd.read_csv("strain_stability_validation.csv")

In [3]:
# Preview the first five rows of the raw validation table.
validation_data.head(n=5)

Unnamed: 0,strain,Infectivity_retained,pH
0,A/chicken/Khabarovsk/244V/2022,1.0,6.9
1,A/chicken/Khabarovsk/244V/2022,1.0,6.9
2,A/chicken/Khabarovsk/244V/2022,1.0,6.9
3,A/chicken/Khabarovsk/244V/2022,0.421212,5.7
4,A/chicken/Khabarovsk/244V/2022,0.599251,5.7


In [4]:
# Collapse the rows of validation_data to one row per (strain, pH) pair,
# keeping the mean and the standard deviation of 'Infectivity_retained'.
df_summary = (
    validation_data
    .groupby(['strain', 'pH'])
    .agg(
        mean_infectivity=pd.NamedAgg(column='Infectivity_retained', aggfunc='mean'),
        std_infectivity=pd.NamedAgg(column='Infectivity_retained', aggfunc='std'),
    )
    .reset_index()
)

In [5]:
# Fixed strain -> colour mapping so every strain keeps the same colour.
color_palette = alt.Scale(
    domain=[
        'A/American Wigeon/South Carolina/USDA-000345-001/2021',
        'A/chicken/Khabarovsk/244V/2022',
        'A/Larusargentatus/Belgium/085710001/2023'
    ],
    range=[
        'grey',
        'orange',
        '#2ca25f'
    ]
)

# Encodings shared by all three layers.
# The x domain is given as [7, 5.2] so pH decreases from left to right;
# y is log-scaled and restricted to [0.005, 1].
base = alt.Chart(df_summary).encode(
    x=alt.X('pH:Q', scale=alt.Scale(domain=[7, 5.2])),
    y=alt.Y(
        'mean_infectivity:Q',
        scale=alt.Scale(type='log', domain=[0.005, 1]),
        title='retained Infectivity'
    ),
    color=alt.Color('strain:N', scale=color_palette)
)

# Layer 1: one filled marker per (strain, pH) mean, with a hover tooltip.
points = base.mark_point(filled=True, size=100).encode(
    tooltip=['strain', 'pH', 'mean_infectivity', 'std_infectivity']
)

# Layer 2: error bars spanning mean +/- std_infectivity.
# df_summary already holds the pre-computed standard deviation, which is
# supplied through the yError channel. The mark's `extent` option only applies
# when the error is aggregated from raw data, so it is not set here.
# NOTE(review): on the log axis a bar whose lower end (mean - std) is <= 0
# cannot be drawn - confirm this never happens for new data.
error_bars = base.mark_errorbar().encode(
    yError='std_infectivity:Q'
)

# Layer 3: a line joining the means of each strain.
lines = base.mark_line(strokeWidth=3)

# Stack the layers and apply chart-wide styling.
chart = (points + error_bars + lines).properties(
    title='retained Infectivity vs pH',
    width=300,
    height=300
).configure_axis(
    grid=False
).configure_legend(
    title=None,
    orient='right',
    labelFontSize=12,
    symbolSize=100,
    titleFontSize=14,
    labelLimit=400  # the strain names are long; allow up to 400 px before truncating
)

# Render inline and also write a standalone HTML copy next to the notebook.
chart.display()
chart.save("strain_pH_stability.html")


In [6]:
# Load the averaged per-mutation stability effects and preview the first rows.
dms_data = pd.read_csv(
    "../../results/pH_stability/averages/stability_mut_effect.csv"
)
dms_data.head(n=5)

Unnamed: 0,epitope,site,wildtype,mutant,mutation,stability_mean,stability_median,stability_std,n_models,times_seen,frac_models,Lib1-230822-pH,Lib1-231002-pH,Lib2-230822-pH,Lib2-231002-pH
0,1,-5,E,A,E-5A,-0.02316,-0.02286,0.01936,4,2.5,1.0,-0.04706,-0.02061,0.000131,-0.02511
1,1,-5,E,C,E-5C,-0.1042,-0.09208,0.05642,4,3.5,1.0,-0.08534,-0.09882,-0.1829,-0.04968
2,1,-5,E,D,E-5D,-0.08721,-0.04074,0.1384,4,3.0,1.0,-0.03404,-0.2898,-0.04744,0.02242
3,1,-5,E,F,E-5F,0.06639,0.02081,0.1322,4,4.0,1.0,-0.03441,-0.004097,0.2584,0.04572
4,1,-5,E,G,E-5G,-0.03211,-0.04565,0.03921,4,4.5,1.0,-0.04769,-0.04361,0.02542,-0.06254


In [7]:
# Preview the per-strain, per-pH summary table.
df_summary.head(n=5)

Unnamed: 0,strain,pH,mean_infectivity,std_infectivity
0,A/American Wigeon/South Carolina/USDA-000345-0...,5.3,0.007732,0.001385
1,A/American Wigeon/South Carolina/USDA-000345-0...,5.5,0.038844,0.005281
2,A/American Wigeon/South Carolina/USDA-000345-0...,5.7,0.167415,0.036671
3,A/American Wigeon/South Carolina/USDA-000345-0...,6.9,1.0,0.0
4,A/Larusargentatus/Belgium/085710001/2023,5.3,0.039979,0.00797


In [8]:
# Load the table listing the mutations of each strain (one row per
# strain/mutation pair) and preview the first rows.
strains = pd.read_csv(
    "strain_mutations.csv"
)
strains.head(n=5)

Unnamed: 0,strain,mutation
0,A/chicken/Khabarovsk/244V/2022,A4T
1,A/chicken/Khabarovsk/244V/2022,H39A
2,A/chicken/Khabarovsk/244V/2022,A91N
3,A/chicken/Khabarovsk/244V/2022,M111L
4,A/chicken/Khabarovsk/244V/2022,N158D


In [9]:
# Attach the DMS 'stability_mean' value to every strain mutation (left join on
# the mutation label), then add the values up per strain.
# NOTE(review): mutations without a match in dms_data get NaN from the left
# join, and sum() skips NaN, so they contribute 0 to the total - confirm that
# this is intended.
mutation_effects = dms_data[['mutation', 'stability_mean']]
strain_mutation_effects = strains.merge(mutation_effects, on='mutation', how='left')
strain_stability = (
    strain_mutation_effects
    .groupby('strain', as_index=False)['stability_mean']
    .sum()
)
strain_stability

Unnamed: 0,strain,stability_mean
0,A/American Wigeon/South Carolina/USDA-000345-0...,0.0
1,A/Larusargentatus/Belgium/085710001/2023,2.10893
2,A/chicken/Khabarovsk/244V/2022,0.92185


In [10]:
# Add each strain's summed DMS stability value to the per-pH summary rows.
# df_summary is overwritten in place, so a 'stability_mean' column left by an
# earlier run of this cell is dropped first. Without this, re-running the cell
# would produce stability_mean_x / stability_mean_y and later cells that read
# 'stability_mean' would fail.
df_summary = (
    df_summary
    .drop(columns='stability_mean', errors='ignore')
    .merge(strain_stability, on='strain', how='left')
)
df_summary

Unnamed: 0,strain,pH,mean_infectivity,std_infectivity,stability_mean
0,A/American Wigeon/South Carolina/USDA-000345-0...,5.3,0.007732,0.001385,0.0
1,A/American Wigeon/South Carolina/USDA-000345-0...,5.5,0.038844,0.005281,0.0
2,A/American Wigeon/South Carolina/USDA-000345-0...,5.7,0.167415,0.036671,0.0
3,A/American Wigeon/South Carolina/USDA-000345-0...,6.9,1.0,0.0,0.0
4,A/Larusargentatus/Belgium/085710001/2023,5.3,0.039979,0.00797,2.10893
5,A/Larusargentatus/Belgium/085710001/2023,5.5,0.458118,0.031097,2.10893
6,A/Larusargentatus/Belgium/085710001/2023,5.7,0.660878,0.13826,2.10893
7,A/Larusargentatus/Belgium/085710001/2023,6.9,1.0,0.0,2.10893
8,A/chicken/Khabarovsk/244V/2022,5.3,0.018338,0.007542,0.92185
9,A/chicken/Khabarovsk/244V/2022,5.5,0.215528,0.045994,0.92185


In [11]:
# Drop the pH 6.9 rows. Any 'correlation' column left by an earlier run of this
# cell is removed as well, so re-running the cell does not make the merge
# below produce correlation_x / correlation_y.
df_summary = (
    df_summary[df_summary.pH != 6.9]
    .drop(columns='correlation', errors='ignore')
)

# For each pH, correlate 'mean_infectivity' with 'stability_mean' across the
# strains (Series.corr with its default method).
# The two value columns are selected explicitly so the grouping column is not
# passed to the lambda; this avoids the pandas >= 2.2 deprecation warning about
# apply operating on grouping columns.
correlations = (
    df_summary
    .groupby('pH')[['mean_infectivity', 'stability_mean']]
    .apply(lambda group: group['mean_infectivity'].corr(group['stability_mean']))
    .reset_index()
)
correlations.columns = ['pH', 'correlation']

# Format the correlation values as strings with two decimal places; they are
# only used as text labels from here on.
correlations['correlation'] = correlations['correlation'].apply(lambda x: f"{x:.2f}")

# Merge correlations back into df_summary (every row of a pH gets that pH's value)
df_summary = df_summary.merge(correlations, on='pH', how='left')

# Check the merged DataFrame
df_summary.head()

Unnamed: 0,strain,pH,mean_infectivity,std_infectivity,stability_mean,correlation
0,A/American Wigeon/South Carolina/USDA-000345-0...,5.3,0.007732,0.001385,0.0,0.99
1,A/American Wigeon/South Carolina/USDA-000345-0...,5.5,0.038844,0.005281,0.0,1.0
2,A/American Wigeon/South Carolina/USDA-000345-0...,5.7,0.167415,0.036671,0.0,0.99
3,A/Larusargentatus/Belgium/085710001/2023,5.3,0.039979,0.00797,2.10893,0.99
4,A/Larusargentatus/Belgium/085710001/2023,5.5,0.458118,0.031097,2.10893,1.0


In [12]:
# Scatter layer: summed DMS stability (x) against mean retained infectivity
# (y, log scale), one point per row of df_summary.
# The strain colours reuse `color_palette` from the pH chart above, so a strain
# has the same colour in both figures.
base = alt.Chart(df_summary).mark_point(filled=True, size=100).encode(
    x=alt.X('stability_mean', scale=alt.Scale(padding=10)),
    y=alt.Y('mean_infectivity', scale=alt.Scale(type='log', padding=10, nice=False)),
    color=alt.Color(
        'strain',
        scale=color_palette,
        legend=alt.Legend(orient='right', title='strain')
    )
).properties(
    width=200,
    height=200
)

# Text layer: the per-pH correlation label, pinned near the top-left corner.
# Every row of a given pH carries the same 'correlation' string, so only the
# first row of each pH is kept. Otherwise the identical label would be drawn
# once per row, on top of itself.
text = alt.Chart(df_summary).transform_window(
    row_in_facet='row_number()',
    groupby=['pH']
).transform_filter(
    alt.datum.row_in_facet == 1
).mark_text(
    align='left',
    baseline='top',
    dx=5,
    dy=5,
    fontSize=12,
    color='black'
).encode(
    x=alt.value(1),  # fixed pixel position; adjust as needed
    y=alt.value(1),  # fixed pixel position; adjust as needed
    text=alt.Text('correlation:N')
)

# Combine the scatter layer and the text annotation
layer = alt.layer(base, text)

# One facet per pH value, laid out in up to three columns. No explicit facet
# sort is set, so the order is whatever Vega-Lite applies by default.
chart = layer.facet(
    facet=alt.Facet('pH:N'),
    columns=3
).resolve_scale(
    y='independent'  # Each facet will have its own y-axis
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
).configure_legend(
    titleFontSize=12,
    labelFontSize=10,
    orient='right',
    titleLimit=200,
    labelLimit=200
).properties(
    padding={'left': 2, 'right': 20, 'top': 20, 'bottom': 20}  # extra right padding so the legend is fully visible
)

# Write a standalone HTML copy, then display the chart as the cell output.
chart.save("strain_stability_vs_dms.html")
chart
