In [3]:
!pip install biopython pandas muscle



In [4]:
import pandas as pd
from Bio import Seq
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import MuscleCommandline

def create_na_functional_regions():
    """Return the table of NA functional regions.

    Returns:
        dict mapping a region label to a list of inclusive ``(start, end)``
        position ranges. Keys are inserted from the most specific site lists
        to the broad stalk/head ranges; consumers that stop at the first
        matching region depend on that order.
    """
    regions = {}

    # Residues of the active site.
    regions['Active_site'] = [
        (118, 118), (151, 152), (224, 224), (276, 276),
        (292, 292), (371, 371), (406, 406),
    ]

    # Framework residues.
    regions['Framework'] = [
        (119, 119), (156, 156), (178, 179), (198, 198), (222, 222),
        (227, 227), (277, 277), (293, 293), (425, 425),
    ]

    # Antigenic sites (approximate; the specific sites may vary).
    regions['Antigenic_sites'] = [
        (141, 142), (198, 200), (328, 329), (344, 346), (368, 370),
    ]

    # Calcium binding sites (several locations within the head domain).
    regions['Calcium_binding'] = [(293, 297), (324, 330), (345, 348)]

    # Stalk, followed by the whole head domain as the broadest range.
    regions['Stalk_region'] = [(35, 82)]
    regions['Head_domain'] = [(83, 468)]

    return regions
    
def find_mutations(seq1, seq2, seq3):
    """Find the columns at which three aligned sequences are not all identical.

    The sequences are compared position by position, so they must be rows of
    the same alignment. Every character takes part in the comparison,
    including gap characters such as ``-``.

    Args:
        seq1: First sequence; its residue is reported under ``'seq2018_aa'``.
        seq2: Second sequence; its residue is reported under ``'seq2021_aa'``.
        seq3: Third sequence; its residue is reported under ``'seq2022_aa'``.

    Returns:
        A list of dicts, one per differing column and in column order, with
        the keys ``'position'`` (1-based column index), ``'seq2018_aa'``,
        ``'seq2021_aa'`` and ``'seq2022_aa'``.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """
    # Column-wise comparison is only meaningful for equal-length rows. Without
    # this check a longer seq1 would fail with an opaque IndexError and a
    # shorter seq1 would silently ignore the trailing columns of the others.
    if not (len(seq1) == len(seq2) == len(seq3)):
        raise ValueError(
            "sequences must have equal length to be compared column by column "
            f"(got {len(seq1)}, {len(seq2)}, {len(seq3)})"
        )

    return [
        {
            'position': column,
            'seq2018_aa': aa1,
            'seq2021_aa': aa2,
            'seq2022_aa': aa3,
        }
        for column, (aa1, aa2, aa3) in enumerate(zip(seq1, seq2, seq3), start=1)
        if not (aa1 == aa2 == aa3)
    ]
    
def map_mutations_to_regions(mutations, regions):
    """Label every mutation with the first functional region that contains it.

    Args:
        mutations: Iterable of dicts with the keys ``'position'``,
            ``'seq2018_aa'``, ``'seq2021_aa'`` and ``'seq2022_aa'``.
        regions: Dict mapping a region label to a list of inclusive
            ``(start, end)`` ranges. Regions are tried in dict order and the
            first one covering the position is used.

    Returns:
        A new list with one dict per mutation, holding the four input keys
        plus ``'region'``. Positions covered by no range are labelled
        ``'Non-functional region'``.
    """
    def region_for(position):
        # Dict order decides ties between overlapping regions.
        for label, spans in regions.items():
            if any(low <= position <= high for low, high in spans):
                return label
        return 'Non-functional region'

    annotated = []
    for entry in mutations:
        where = entry['position']
        label = region_for(where)
        annotated.append({
            'position': where,
            'seq2018_aa': entry['seq2018_aa'],
            'seq2021_aa': entry['seq2021_aa'],
            'seq2022_aa': entry['seq2022_aa'],
            'region': label,
        })
    return annotated
def analyze_na_sequences(seq1, seq2, seq3):
    """Align three NA protein sequences with MUSCLE and tabulate their differences.

    The sequences are written to ``input_sequences.fasta`` in the current
    working directory, aligned by running MUSCLE (output in
    ``aligned.fasta``), compared column by column, and each differing column
    is labelled with a functional region.

    Args:
        seq1: First protein sequence (reported in the ``seq2018_aa`` column).
        seq2: Second protein sequence (reported in the ``seq2021_aa`` column).
        seq3: Third protein sequence (reported in the ``seq2022_aa`` column).

    Returns:
        pandas.DataFrame with the columns ``position``, ``seq2018_aa``,
        ``seq2021_aa``, ``seq2022_aa`` and ``region``; one row per differing
        alignment column. The columns are present even when there are no
        differences.

    NOTE: ``position`` is a 1-based column of the alignment, so any gap the
    aligner inserts shifts later positions relative to the ungapped residue
    numbering of the inputs. The region ranges are applied to these alignment
    columns as-is.
    """
    # create sequence records
    records = [
        SeqRecord(Seq(seq1), id="sequence1"),
        SeqRecord(Seq(seq2), id="sequence2"),
        SeqRecord(Seq(seq3), id="sequence3")
    ]

    # write sequences to file for MUSCLE
    with open("input_sequences.fasta", "w") as f:
        for record in records:
            f.write(f">{record.id}\n{record.seq}\n")

    # do the alignment using MUSCLE
    muscle_cline = MuscleCommandline(input="input_sequences.fasta", out="aligned.fasta")
    muscle_cline()

    # read aligned sequences
    alignment = AlignIO.read("aligned.fasta", "fasta")

    # MUSCLE is free to emit the rows in a different order than the input
    # file, so fetch each row by its record id instead of by index; otherwise
    # the year columns could silently be swapped.
    aligned_by_id = {row.id: str(row.seq) for row in alignment}
    seq1_aligned, seq2_aligned, seq3_aligned = (
        aligned_by_id[record.id] for record in records
    )

    # find and map mutations
    mutations = find_mutations(seq1_aligned, seq2_aligned, seq3_aligned)
    regions = create_na_functional_regions()
    mapped_mutations = map_mutations_to_regions(mutations, regions)

    # Explicit columns keep the header stable when no differences are found
    # (an empty list would otherwise give a DataFrame without columns).
    return pd.DataFrame(
        mapped_mutations,
        columns=['position', 'seq2018_aa', 'seq2021_aa', 'seq2022_aa', 'region'],
    )

def main():
    """Run the NA mutation analysis on the three embedded sequences.

    Prints the resulting table and saves it to ``NAmutation_analysis.csv``.
    """
    # NA protein sequences, passed to the analysis in this order:
    # 2018, 2021, 2022.
    na_sequences = (
        """MNPNQKVICISATGMTLSVVSLLIGIANLGLNIGLHYKMGDAPTVDIPSMNGTNSTTTIINNNTQNNFTNITNIIINKEEERIFLNLTKPLCEVNSWHILSKDNAIRIGEDAHILVTREPYLSCDPQSCRMFALSQGTTLRGRHANGTIHDRSPFRALVSWEMGQAPSPYNVRVECIGWSSTSCHDGISRMSICMSGPNNNASAVVWYGGRPVTEIPSWAGNILRTQESECVCHKGVCPVVMTDGPANNRAATKIIYFKEGKIQKIEELRGNAQHIEECSCYGAVRVIKCVCRDNWKGANRPVITIDPEMMTHTSKYLCSRVLTDTSRPNDPTSGNCDAPIIGGSPDPGVKGFAFLDGENSWLGRTISKDSRSGYEILKVPNAETDTQSGPTSHQIIVNNPNWSGYSGAFIDYWANKECFNPCFYVELIRGRPKESSVLWTSNSIVALCGSKERLGSWSWHDGAEIIYFK""",
        """MNPNQKIVIVGSISLGLVVFNVLLHAMSIILMVLALGKSENSGICKGTTVREYNETVRIEKVTQWYNTSVVEYVQHWNEGAYTNNTETICDVKGFAPFSKDNGIRVGSRGHIFVIREPFVSCSPIECRTFFLTQGALLNDKHSNGTVKDRSPFRTLMSVEVGQSPNVYQARFEAVAWSATACHDGKKWMTIGVTGPDPKAVAVVHYGGVPTDVVNSWAGDILRTQESSCTCIQGNCYWVMTDGPANRQAQYRIYKANQGKIIDQTDVSFSGGHIEECSCYPNDGKVECVCRDNWTGTNRPVLVISPDLSYRVGYLCAGLPSDTPRGKDAQFVGSCTSPMGNQGYGVKGFGFRQGTDVWVGRTISRTSRSGFEIIRIKNGWTQTSKEQIRRQVVIDNLNWSGYSGSFTLPVELSGRECLVPCFWVEMIRGRPEERTIWTSSSSIVMCGVDHEIADWSWHDGAILPFDIDKM""",
        """MNPNQRIITTGSICMVIGIVSLMLQIGNIISIWVSHSIQTGNQYQPEPCNQSIIAYENNTWINQTYVNINNTNFLAEQAVTSVTLAGNSSLCPISGWAIYSKDNGIRIGSKGDVFVIREPFISCSHLECRTFFLTQGALLNDKHSNGTVKDRSPYRTLMSCPVGEAPSPYNSRFESVAWSASACHDGISWLTIGISGPDNGAVAVLKYNGIITDTIKSWRNNILRTQESECACVNGSCFTVMTDGPSNGQASYKIFKIEKGKVVKSVELNAPNYHYEECSCYPDAGEIMCVCRDNWHGSNRPWVSFNQNLEYQIGYICSGVFGDNPRPNDGTGSCSPMSSNGAYGVKGFSFKYGNGVWIGRTKSTSSRSGFEMIWDPNGWTETDSSFSVKQDIVAITDLSGYSGTFVQHPELTGLDCMRPCFWVELIRGRPKENTVWTSGSSISFCGVNSDTVGWSWPDGAELPFTIDK""",
    )

    mutation_table = analyze_na_sequences(*na_sequences)

    # Show the table on stdout.
    print("\nMutation Analysis Results:")
    print(mutation_table)

    # Persist the table without the DataFrame index column.
    mutation_table.to_csv('NAmutation_analysis.csv', index=False)
    print("\nResults saved to 'NAmutation_analysis.csv'")


if __name__ == "__main__":
    main()


Mutation Analysis Results:
     position seq2018_aa seq2021_aa seq2022_aa                 region
0           6          K          K          R  Non-functional region
1           7          V          I          I  Non-functional region
2           8          I          V          I  Non-functional region
3           9          C          I          T  Non-functional region
4          10          I          V          T  Non-functional region
..        ...        ...        ...        ...                    ...
319       481          F          D          T  Non-functional region
320       482          K          I          I  Non-functional region
321       483          -          D          D  Non-functional region
322       484          -          K          K  Non-functional region
323       485          -          M          -  Non-functional region

[324 rows x 5 columns]

Results saved to 'NAmutation_analysis.csv'


Analysis: The mutation analysis of the NA protein sequences from 2018, 2021, and 2022 reveals extensive amino acid differences (324 differing positions) across multiple functional regions, particularly in the head domain and antigenic sites, which may affect vaccine effectiveness. Differences in antigenic sites (positions 142, 199-200, 329, 344-346, and 369-370) and active-site residues (positions 151-152, 224, 276, 292, and 406) indicate substantial evolutionary change that could affect antibody recognition and viral fitness. The frequent differences in the calcium-binding sites (positions 294-297 and 324-330) and framework residues (positions 119, 178, 198, 222, 277, 293, and 425) suggest structural adaptations that could help the virus evade immune responses. Note that the reported positions are alignment columns, so region assignments are approximate wherever the alignment contains gaps. These continuing changes in the NA protein, especially in antigenic regions, help explain why annual flu vaccine updates are necessary: the virus keeps evolving to escape existing immunity, making older vaccines less effective against newer strains. The pattern includes both transient changes (for example, position 8 changes from I to V in 2021 and back to I in 2022) and lasting shifts in amino acid composition, consistent with ongoing selective pressure from host immune responses and the need for regular vaccine reformulation to match circulating strains.