In [1]:
import pandas as pd 
import altair as alt
import numpy as np
import theme 

# Register the chart theme defined in the local `theme` module under the name
# 'main_theme', then make it the active Altair theme for this session.
# NOTE(review): recent Altair releases appear to deprecate `alt.themes` in favour
# of `alt.theme` — confirm against the Altair version pinned for this project.
alt.themes.register('main_theme', theme.main_theme)
alt.themes.enable('main_theme')

from Bio import SeqIO

In [3]:
# Sera whose titers are analysed in this notebook
sera = ['SCH23_y2021_s056', 'SCH23_y2016_s037', 'SCH23_y2009_s002', 'SCH23_y2009_s007']

# Load the titer table and keep only the rows belonging to the selected sera
titers = pd.read_csv('data/seqneut_data/titers_SCH.csv').query('serum in @sera')

# Count distinct viruses among the retained rows. dropna=False keeps the count
# identical to len(titers['virus'].unique()), which also counts a missing value.
n_unique_strains = titers['virus'].nunique(dropna=False)
print(f'There are {n_unique_strains} strains in the library.')

There are 78 strains in the library.


In [4]:
# Reference protein sequence: take the first record of the FASTA file and
# keep its residues as a plain string.
first_record = next(SeqIO.parse('../results/gene_sequence/protein.fasta', 'fasta'))
ma22_sequence = str(first_record.seq)
print(ma22_sequence)

QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGKICNSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSSCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVKQNGTSSACKRGSSSSFFSRLNWLTSLNNIYPAQNVTMPNKEQFDKLYIWGVHHPDTDKNQFSLFAQSSGRITVSTKRSQQAVIPNIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQISGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKG


In [5]:
def get_mutations(seq, ref, offset=1):
    """Describe how ``seq`` differs from ``ref`` as a string of substitutions.

    The two sequences are compared position by position; no alignment is
    performed. Each mismatch is written as ``<ref_aa><site><query_aa>``,
    where ``site`` is the 0-based position plus ``offset``.

    Parameters
    ----------
    seq : str
        Query sequence.
    ref : str
        Reference sequence.
    offset : int, default 1
        Value added to the 0-based position to obtain the reported site.

    Returns
    -------
    str
        Space-separated mutations in sequence order, or ``"wt"`` when no
        compared position differs.

    Warns
    -----
    UserWarning
        If ``seq`` and ``ref`` differ in length. Only the overlapping prefix
        is compared, so residues beyond the shorter sequence are not reported.
    """
    import warnings

    # zip() stops at the shorter input; make that truncation visible instead of
    # silently returning an incomplete (possibly "wt") result.
    if len(seq) != len(ref):
        warnings.warn(
            f"get_mutations: query length {len(seq)} != reference length {len(ref)}; "
            "only the overlapping positions are compared",
            stacklevel=2,
        )

    mutations = [
        f"{ref_aa}{position + offset}{query_aa}"
        for position, (ref_aa, query_aa) in enumerate(zip(ref, seq))
        if ref_aa != query_aa
    ]
    return " ".join(mutations) if mutations else "wt"

# 0-based [start, stop) bounds used to cut the ectodomain out of each full
# construct sequence.
# NOTE(review): presumably this trims N-terminal signal-peptide residues and
# C-terminal residues of the construct — confirm against the construct design.
ECTODOMAIN_START = 19
ECTODOMAIN_STOP = 523

# Sites are numbered from 1 within the ectodomain (get_mutations' default
# offset), so site 140 lives at string index 139.
SITE140_INDEX = 140 - 1

# read in library protein sequences: one row per FASTA record
records = [
    {'virus': record.id, 'sequence': str(record.seq)}
    for record in SeqIO.parse(
        'data/seqneut_data/2023-2024_H3_library_protein_constructs.fasta', 'fasta'
    )
]

strain_sequences = pd.DataFrame(records).assign(
    # ectodomain portion of the full construct
    ectodomain=lambda x: x['sequence'].str.slice(ECTODOMAIN_START, ECTODOMAIN_STOP),
    # amino acid at ectodomain site 140
    allele_site140=lambda x: x['ectodomain'].str.get(SITE140_INDEX),
    # substitutions relative to the reference sequence
    mutations=lambda x: x['ectodomain'].apply(lambda seq: get_mutations(seq, ma22_sequence)),
)

# double check: the A/Massachusetts/18/2022 ectodomain from the library must be
# identical to the reference sequence
ma22_matches = strain_sequences.query(
    'virus == "A/Massachusetts/18/2022"'
)['ectodomain'] == ma22_sequence

# Enforce the check rather than relying on someone reading the displayed
# Series: an empty result (strain missing) or a False would otherwise go unnoticed.
assert not ma22_matches.empty, 'A/Massachusetts/18/2022 not found in the library FASTA'
assert ma22_matches.all(), 'A/Massachusetts/18/2022 ectodomain differs from the reference sequence'

ma22_matches

31    True
Name: ectodomain, dtype: bool

In [6]:
# Attach each virus's sequence-derived columns to its titer rows, then keep only
# viruses carrying K or I at site 140. Titer rows whose virus has no sequence
# get a missing allele from the left merge and are dropped by the filter too.
titers_and_seqs = pd.merge(
    titers,
    strain_sequences,
    on='virus',
    how='left',
    # Each virus must match at most one library sequence; duplicate ids would
    # silently duplicate titer rows, so raise a MergeError instead.
    validate='many_to_one',
).query(
    'allele_site140 in ["K", "I"]'
)

titers_and_seqs.head()

Unnamed: 0,group,serum,virus,titer,titer_bound,titer_sem,n_replicates,titer_as,sequence,ectodomain,allele_site140,mutations
0,SCH,SCH23_y2009_s002,A/AbuDhabi/6753/2023,7165.0,interpolated,410.4,3,midpoint,MKAKLLVLLYAFVATDADTQKIPGNDNSTATLCLGHHAVPNGTIVK...,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...,K,K121E
1,SCH,SCH23_y2009_s002,A/Bangkok/P3599/2023,4908.0,interpolated,348.5,3,midpoint,MKAKLLVLLYAFVATDADTQKIPGNDNSTATLCLGHHAVPNGTIVK...,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...,K,Q197H
2,SCH,SCH23_y2009_s002,A/Bangkok/P3755/2023,5035.0,interpolated,213.0,3,midpoint,MKAKLLVLLYAFVATDADTQKIPGNDNSTATLCLGHHAVPNGTIVK...,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...,K,N122D S144N K276E
3,SCH,SCH23_y2009_s002,A/Bhutan/0006/2023,8386.0,interpolated,2264.0,3,midpoint,MKAKLLVLLYAFVATDADTQKIPGNDNSTATLCLGHHAVPNGTIVK...,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...,I,I48T K50E K140I Q173R V223I
4,SCH,SCH23_y2009_s002,A/Bhutan/0845/2023,4643.0,interpolated,712.9,3,midpoint,MKAKLLVLLYAFVATDADTQKIPGNDNSTATLCLGHHAVPNGTVVK...,QKIPGNDNSTATLCLGHHAVPNGTVVKTITNDRIEVTNATELVQNS...,K,I25V V347M


In [54]:
# Fill colour for each amino-acid allele at site 140
colors = {
    'K': '#8DA0CA',
    'I': '#F1CE63',
}

# Left-to-right order of the alleles on the x axis
order = ['K', 'I']

# --- Encodings for the jittered points -------------------------------------
allele_axis = alt.X(
    "allele_site140",
    axis=alt.Axis(grid=False, labelAngle=0),
    sort=order,
).title(None)

titer_axis = alt.Y(
    'titer',
    title=["NT50"],
    scale=alt.Scale(type='log', domain=[1e1, 1e5]),
    axis=alt.Axis(
        grid=False,
        values=[1e1, 1e2, 1e3, 1e4, 1e5],
        tickCount=5,
        format='.0e',
    ),
)

allele_color = alt.Color(
    "allele_site140",
    scale=alt.Scale(domain=list(colors.keys()), range=list(colors.values())),
    legend=None,
)

# One circle per titer measurement, spread horizontally within its allele band.
# The jitter expression is a Box-Muller draw (normally distributed offset); it
# uses random(), so exact point positions differ from one render to the next.
points = (
    alt.Chart(titers_and_seqs)
    .mark_circle(size=40, opacity=1, stroke='black', strokeWidth=0.3)
    .encode(
        x=allele_axis,
        y=titer_axis,
        xOffset="jitter:Q",
        color=allele_color,
        tooltip=['serum', 'virus', 'titer', 'allele_site140', 'mutations'],
    )
    .transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Black tick marking the median titer of each allele group
median_line = (
    alt.Chart(titers_and_seqs)
    .mark_tick(color='black', thickness=2, size=15)
    .encode(
        x=alt.X('allele_site140', sort=order),
        y='median(titer):Q',
    )
)

# Points first, median tick second so the tick is drawn on top
chart = alt.layer(points, median_line).properties(height=175, width=70)

# One panel per serum, up to four panels per row
faceted_chart = (
    chart
    .facet(facet=alt.Facet('serum:N', title=None), columns=4)
    .resolve_scale(y='independent', x='shared')
    .configure_header(labelFontSize=14, labelFontWeight='bold')
)

faceted_chart.display()