In [2]:
from Bio import SeqIO
from Bio.Seq import Seq

def translate_sequence(dna_sequence):
    """Return the translation of ``dna_sequence``.

    The work is delegated entirely to the argument's own ``translate()``
    method, called with no arguments, so whatever defaults that method uses
    apply here.

    Parameters
    ----------
    dna_sequence : object exposing ``translate()``
        In this notebook it is the sequence extracted from a GenBank feature.

    Returns
    -------
    Whatever ``dna_sequence.translate()`` returns.
    """
    protein = dna_sequence.translate()
    return protein

def parse_genbank_translate(genbank_file):
    """Translate the ``gene`` feature annotated in a GenBank file.

    Every record returned by ``SeqIO.parse(genbank_file, "genbank")`` is
    scanned; for each feature whose ``type`` is ``"gene"`` the feature
    location is extracted from the record sequence and passed to
    ``translate_sequence``.

    Parameters
    ----------
    genbank_file : str or file handle
        Passed straight through to ``SeqIO.parse``.

    Returns
    -------
    The translation of the *last* ``gene`` feature encountered. Earlier
    ``gene`` features, if any, are translated but not returned, so the file
    is expected to annotate a single gene.

    Raises
    ------
    ValueError
        If no feature of type ``"gene"`` is found in any record (previously
        this surfaced as an opaque ``UnboundLocalError``).
    """
    protein_seq = None
    for record in SeqIO.parse(genbank_file, "genbank"):
        for feature in record.features:
            if feature.type == "gene":
                gene_seq = feature.location.extract(record.seq)
                protein_seq = translate_sequence(gene_seq)

    if protein_seq is None:
        raise ValueError(
            "No feature of type 'gene' found in {!r}".format(genbank_file)
        )
    return protein_seq

# Translate the `gene` feature annotated in the amplicon GenBank file and print it.
# `protein_seq` is reused by the later cells that build the site map and the
# mutation classification table.
genbank_file = "../data/PacBio_amplicon.gb"
protein_seq = parse_genbank_translate(genbank_file)
print(protein_seq)

QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGKICNSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSSCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVKQNGTSSACKRGSSSSFFSRLNWLTSLNNIYPAQNVTMPNKEQFDKLYIWGVHHPDTDKNQFSLFAQSSGRITVSTKRSQQAVIPNIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQISGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKG


### Make `site_numbering_map.csv`

In [3]:
import pandas as pd

# RBS site lists, one per labelled element. Compiled from:
#   Shi et al 2014. https://www.nature.com/articles/nrmicro3362
#   Wu et al 2018. https://www.nature.com/articles/s41467-018-03663-5
#   Kong et al 2021. https://journals.asm.org/doi/10.1128/mbio.01512-21
#   Lei et al 2024. https://www.nature.com/articles/s41467-024-49487-4
# NOTE: range() is half-open, so range(133, 139) covers sites 133..138.

RBS_loop_130 = [128, 130, *range(133, 139)]
RBS_loop_150 = [*range(155, 161)]
RBS_loop_190 = [186, *range(189, 195), *range(196, 199)]
RBS_loop_220 = [*range(221, 229)]
RBS_base = [98, 153, 183, 195]

# Epitope site lists, based on Broecker et al. 2018:
# https://journals.asm.org/doi/10.1128/jvi.01100-18
# Each list is the concatenation of one or more half-open site ranges.

epitope_A = [*range(122, 147)]
epitope_B = [*range(155, 161), *range(186, 199)]
epitope_C = [*range(44, 55), *range(273, 281)]
epitope_D = [*range(166, 182), *range(201, 220)]
epitope_E = [*range(62, 66), *range(78, 95), *range(260, 266)]

def create_site_map(protein_seq):
    """Build a one-row-per-residue site table for ``protein_seq``.

    Columns of the returned DataFrame, in order:

    * ``sequential_site`` - 1-based position within ``protein_seq``.
    * ``reference_site``  - same values as ``sequential_site``; numbering
      starts at 1 as this is H3 numbering.
    * ``sequential_wt``   - the residue at that position.
    * ``region``          - ``'epitope-A'`` .. ``'epitope-E'`` when the site is
      in the corresponding module-level ``epitope_*`` list (checked in that
      order); otherwise ``'HA1'`` for sites 1-329, ``'HA2'`` for sites
      330-504, and ``'Other'`` for anything else.
    * ``rbs_region``      - ``'RBS 130-loop'``, ``'RBS 150-loop'``,
      ``'RBS 190-loop'``, ``'RBS 220-loop'`` or ``'RBS base'`` when the site is
      in the corresponding module-level ``RBS_*`` list (checked in that
      order); otherwise ``'outside RBS'``.

    Parameters
    ----------
    protein_seq : sequence of single-character residues
        Must support ``len()`` and iteration.

    Returns
    -------
    pandas.DataFrame
    """
    n_sites = len(protein_seq)

    # Ordered (label, member sites) pairs; the first match wins, mirroring an
    # if/elif chain.
    # Epitope assignments are obtained from Welsh et al. 2023
    # NOTE(review): the module-level epitope lists cite Broecker et al. 2018 -
    # confirm which source is intended.
    epitope_table = (
        ('epitope-A', epitope_A),
        ('epitope-B', epitope_B),
        ('epitope-C', epitope_C),
        ('epitope-D', epitope_D),
        ('epitope-E', epitope_E),
    )
    rbs_table = (
        ('RBS 130-loop', RBS_loop_130),
        ('RBS 150-loop', RBS_loop_150),
        ('RBS 190-loop', RBS_loop_190),
        ('RBS 220-loop', RBS_loop_220),
        ('RBS base', RBS_base),
    )

    def assign_epitope_region(site):
        for label, members in epitope_table:
            if site in members:
                return label
        # Not in any epitope: fall back to the HA1 / HA2 site ranges.
        if 1 <= site <= 329:
            return 'HA1'
        if 330 <= site <= 504:
            return 'HA2'
        return 'Other'

    def assign_rbs_region(site):
        for label, members in rbs_table:
            if site in members:
                return label
        return 'outside RBS'

    site_map = pd.DataFrame({
        "sequential_site": list(range(1, n_sites + 1)),
        "reference_site": list(range(1, n_sites + 1)),
        "sequential_wt": list(protein_seq),
    })
    site_map['region'] = site_map['reference_site'].apply(assign_epitope_region)
    site_map['rbs_region'] = site_map['reference_site'].apply(assign_rbs_region)

    return site_map

# Build the site-numbering table for the translated protein and write it to CSV
# (index=False: the DataFrame index is not written as a column).
df = create_site_map(protein_seq)
df.to_csv('../data/site_numbering_map.csv', index=False)

### Make `mutation_design_classification.csv`

In [4]:
import numpy as np

# Load the variant-proportion QC report. `h3_numbering` is the report's
# 'AA Position' shifted down by 16, and the percentage strings in
# `variant_proportion` are converted to floats.
vp = pd.read_csv('twist_qc_reports/Final_QC_Report_Q-328447_VariantProportion.csv')
vp['h3_numbering'] = vp['AA Position'] - 16
vp['variant_proportion'] = vp['variant_proportion'].str.replace('%', '').astype(float)

# The 20 amino-acid one-letter codes, in alphabetical order.
AA = list('ACDEFGHIKLMNPQRSTVWY')

# Sites that have at least one variant reported at exactly 0.0 %
# (one row per distinct position / wild-type codon / wild-type residue).
missing_sites = vp.loc[
    vp['variant_proportion'] == 0.0,
    ['AA Position', 'wt_codon', 'wt_aa', 'h3_numbering'],
].drop_duplicates()

# Individual variants reported at <= 1.0 % at sites that are NOT already in
# `missing_sites`, each labelled with a "<wt><h3 site><variant>" string.
missing_mutations = (
    vp.loc[
        (vp['variant_proportion'] <= 1.0)
        & ~vp['h3_numbering'].isin(missing_sites['h3_numbering'].to_numpy())
    ]
    .assign(
        mutation=lambda x: [
            ''.join(map(str, parts))
            for parts in zip(x['wt_aa'], x['h3_numbering'], x['variant_aa'])
        ]
    )
    .reset_index(drop=True)
)

# Enumerate every designed variant of `protein_seq` and classify it.
#
# For each residue (0-based index `codon_pos`, reported as 1-based
# `sequential_site`):
#   * every amino acid in `AA` other than the wild type gets one row, typed
#       'spike_in_mutation' if its "<wt><site><mut>" label is in `missing_mutations`,
#       'spike_in_site'     if the site is one of `missing_sites`,
#       'twist_mutation'    otherwise;
#   * a '*' row typed 'stop' is added for indices listed in `stop_positions`;
#   * one '-' (deletion) row typed 'twist_mutation' is added at index 142.
#
# NOTE(review): labels here use the sequential site (codon_pos + 1) while the
# labels in `missing_mutations` use `h3_numbering`; the comparison relies on the
# two numberings being identical for this construct - confirm.

# 0-based indices that also get a stop row: every second codon among the
# first 40 (0, 2, ..., 38).
stop_positions = list(np.arange(0, 40, 2))
assert len(stop_positions) == 20

# Loop-invariant lookups, built once. Previously both were rebuilt as lists and
# scanned linearly for every (site, amino acid) pair.
missing_mutation_set = set(missing_mutations['mutation'])
missing_site_indices = set((missing_sites['h3_numbering'] - 1).tolist())  # 0-based

sequential_site = []
amino_acid = []
mutation_type = []
for codon_pos in range(len(protein_seq)):
    wt_aa = protein_seq[codon_pos]

    # add mutations
    for j in AA:
        if wt_aa != j:
            sequential_site.append(codon_pos + 1)
            amino_acid.append(j)
            mutation = str(wt_aa) + str(codon_pos + 1) + str(j)

            if mutation in missing_mutation_set:
                mutation_type.append('spike_in_mutation')
            elif codon_pos in missing_site_indices:
                mutation_type.append('spike_in_site')
            else:
                mutation_type.append('twist_mutation')

    # add stops
    if codon_pos in stop_positions:
        sequential_site.append(codon_pos + 1)
        amino_acid.append('*')
        mutation_type.append('stop')

    # add one deletion that was included (0-based index 142, i.e. sequential site 143)
    if codon_pos == 142:
        sequential_site.append(codon_pos + 1)
        amino_acid.append('-')
        mutation_type.append('twist_mutation')

# Assemble the three parallel lists into one table (one row per designed variant).
mutation_design_classification = pd.DataFrame({
    'sequential_site': sequential_site,
    'amino_acid': amino_acid,
    'mutation_type': mutation_type,
})

# Sanity checks against the QC-derived tables:
#   - every site in `missing_sites` shows up as a 'spike_in_site' site,
#   - every row of `missing_mutations` shows up as one 'spike_in_mutation' row.
assert len(missing_sites) == len(
    mutation_design_classification.loc[
        mutation_design_classification['mutation_type'] == 'spike_in_site',
        'sequential_site',
    ].unique()
)
assert len(missing_mutations) == int(
    (mutation_design_classification['mutation_type'] == 'spike_in_mutation').sum()
)
# 9597 = 19 substitutions x 504 sites + 20 stops + 1 deletion, i.e. this expects
# a 504-residue `protein_seq` whose residues are all standard amino acids.
assert mutation_design_classification.shape[0] == 9597

mutation_design_classification.to_csv('../data/mutation_design_classification.csv', index=False)