In [2]:
!pip install biopython pandas muscle

Collecting muscle
  Downloading muscle-0.0.4-py3-none-any.whl (3.8 kB)
Installing collected packages: muscle
Successfully installed muscle-0.0.4


In [12]:
import pandas as pd
from Bio import Seq
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import MuscleCommandline

def create_ha_functional_regions():
    """Define functional regions of HA protein based on H1 HA structure"""
    return {
        # Antigenic Sites
        'Sa_antigenic': [(124, 125), (153, 157), (159, 164)],
        'Sb_antigenic': [(184, 195)],
        'Ca1_antigenic': [(166, 170), (203, 205), (235, 237)],
        'Ca2_antigenic': [(137, 142), (221, 222)],
        'Cb_antigenic': [(71, 76)],
        
        # Receptor Binding Site (RBS)
        'RBS_130_loop': [(130, 135)],
        'RBS_150_loop': [(150, 155)],
        'RBS_190_helix': [(190, 198)],
        'RBS_220_loop': [(220, 229)],
        
        # Key RBS Residues
        'RBS_conserved': [(153, 153), (183, 183), (194, 194), (195, 195)],  # Fixed single residues
        
        # HA2 Regions
        'Fusion_peptide': [(1, 23)],
        'Stem_region': [(40, 58), (59, 75), (76, 130)]
    }
def find_mutations(seq1, seq2, seq3):
    """Find mutations between three sequences"""
    mutations = []
    for i in range(len(seq1)):
        if not (seq1[i] == seq2[i] == seq3[i]):
            mutations.append({
                'position': i + 1,
                'seq2018_aa': seq1[i],
                'seq2021_aa': seq2[i],
                'seq2022_aa': seq3[i]
            })
    return mutations
    
def map_mutations_to_regions(mutations, regions):
    """Map mutations to functional regions"""
    mapped_mutations = []
    for mut in mutations:
        pos = mut['position']
        region_found = False
        for region_name, ranges in regions.items():
            for start, end in ranges:
                if start <= pos <= end:
                    mapped_mutations.append({
                        'position': pos,
                        'seq2018_aa': mut['seq2018_aa'],
                        'seq2021_aa': mut['seq2021_aa'],
                        'seq2022_aa': mut['seq2022_aa'],
                        'region': region_name
                    })
                    region_found = True
                    break
            if region_found:
                break
        if not region_found:
            mapped_mutations.append({
                'position': pos,
                'seq2018_aa': mut['seq2018_aa'],
                'seq2021_aa': mut['seq2021_aa'],
                'seq2022_aa': mut['seq2022_aa'],
                'region': 'Non-functional region'
            })
    return mapped_mutations
def analyze_ha_sequences(seq1, seq2, seq3):
    """Analyze three HA sequences"""
    # Create sequence records
    records = [
        SeqRecord(Seq(seq1), id="sequence1"),
        SeqRecord(Seq(seq2), id="sequence2"),
        SeqRecord(Seq(seq3), id="sequence3")
    ]
    
    # Write sequences to file for MUSCLE
    with open("input_sequences.fasta", "w") as f:
        for record in records:
            f.write(f">{record.id}\n{record.seq}\n")
    
    # Perform alignment using MUSCLE
    muscle_cline = MuscleCommandline(input="input_sequences.fasta", out="aligned.fasta")
    muscle_cline()
    
    # Read aligned sequences
    alignment = AlignIO.read("aligned.fasta", "fasta")
    seq1_aligned = str(alignment[0].seq)
    seq2_aligned = str(alignment[1].seq)
    seq3_aligned = str(alignment[2].seq)
    
    # Find and map mutations
    mutations = find_mutations(seq1_aligned, seq2_aligned, seq3_aligned)
    regions = create_ha_functional_regions()
    mapped_mutations = map_mutations_to_regions(mutations, regions)
    
    # Create DataFrame
    df = pd.DataFrame(mapped_mutations)
    return df

# Example usage
def main():
    # Hardcode your sequences here
    sequence2018 = """MENIVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLNGVKPLILKDCSVAGWLLGNPMCDEFIRVPEWSYIVERDNPANDLCYPGSLNDYEELKHLLSRINHFEKILIIPKSSWPNHETSLGVSAACPYQGTPSFFRNVVWLIKKNDAYPTIKISYNNTNREDLLILWGIHHSNNAEEQTNLYKNPTTYISVGTSTLNQRLVPKIATRSQVNGQRGRMDFFWTILKPNDAIHFESNGNFIAPEYAYKIVKKGDSTIMKSGVEYGHCNTKCQTPVGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLREKRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTVASSLALAIMVAGLSLWMCSNGSLQCRICI"""  # Replace with your actual sequence
    sequence2021 = """MERTVLLLAIVSLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKAHNGKLCDLNGVKPLVLNDCSVAGWLLGNPMCDEFIRVPEWSYIVERTNPANDLCYPGSLNDYEELKHLLSRIKHFEKILIIPKSSWPNHETSLGVSAACPYQGTPSFFRNVVWLIKKNDAYPTIKINYNNTNREDLLILWGIHHSNNEEEQTNLYKNPTTYISVGTSTLNQRMVPKIATRSQVNGQRGRMDFFWTILKPNDAIHFESNGNFIAPEYAYKIVKKGDSTIMKSEVEYGHCNTKCQTPVGAINSSLPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLREKRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIINKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDNECMESVRNGTYYYPQYSEEARLKREEISGVKLESIGTYQILSIYSTVASSLALAIMVAGLSLWMCSNGSLQCRICI""" # Replace with your actual sequence
    sequence2022 = """MENIVLLLAIISLVKSDQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKAHNGKLCDLNGVKPLILKDCSVAGWLLGNPMCDEFIRVPEWSYIVERANPANDLCYPGSLNDYEELKHLLSRINHFEKILIIPKSSWPNHETSLGVSAACPYQGAPSFFRNVVWLIKKNDAYPTIKISYNNTNREDLLILWGIHHSNNAEEQTNLYKNPTTYISVGTSTLNQRLVPKIATRSQVNGQRGRMDFFWTILKPDDAIHFESNGNFIAPEYAYKIVKKGDSTIMKSGVEYGHCNTKCQTPVGAINSSMPFHNIHPLTIGECPKYVKSNKLVLATGLRNSPLRERRRKRGLFGAIAGFIEGGWQGMVDGWYGYHHSNEQGSGYAADKESTQKAIDGVTNKVNSIIDKMNTQFEAVGREFNNLERRIENLNKKMEDGFLDVWTYNAELLVLMENERTLDFHDSNVKNLYDKVRLQLRDNAKELGNGCFEFYHKCDDECMESVRNGTYDYPQYSEEARLKREEISGVKLESIGTYQILSIYSTAASSLALAIMMAGLSLWMCSNGSLQCRICI"""  # Replace with your actual sequence
    
    results = analyze_ha_sequences(sequence2018, sequence2021, sequence2022)
    
    # Print results
    print("\nMutation Analysis Results:")
    print(results)
    
    # Optional: Save to CSV
    results.to_csv('mutation_analysis.csv', index=False)
    print("\nResults saved to 'mutation_analysis.csv'")

if __name__ == "__main__":
    main()


Mutation Analysis Results:
    position seq2018_aa seq2021_aa seq2022_aa                 region
0          3          N          N          R         Fusion_peptide
1          4          I          I          T         Fusion_peptide
2         11          V          I          V         Fusion_peptide
3         52          T          A          A            Stem_region
4         67          I          I          V            Stem_region
5         69          K          K          N            Stem_region
6         99          D          A          T            Stem_region
7        125          N          N          K           Sa_antigenic
8        156          T          A          T           Sa_antigenic
9        179          S          S          N  Non-functional region
10       200          A          A          E  Non-functional region
11       225          L          L          M           RBS_220_loop
12       252          N          D          N  Non-functional region
13    

Analysis: The mutation analysis reveals significant amino acid changes across three time points (2018, 2021, and 2022) in various protein regions, with notable mutations observed in the Fusion_peptide region at positions 3 (N→R) and 4 (I→T), followed by changes in the Stem_region including positions 52 (T→A), 67 (I→V), 69 (K→N), and 99 (D→T), while the Sa_antigenic region displays mutations at positions 125 (N→K) and 156 (T→A→T); the RBS_220_loop shows a single mutation at position 225 (L→M), and the Non-functional regions exhibit multiple changes throughout positions 179-548, with particularly interesting patterns such as position 252 showing a transition from N→D→N, position 341 displaying K→R→K, and several other positions (284, 305, 402, 491, 503, 538, and 548) demonstrating various amino acid substitutions between 2018 and 2022, with some positions maintaining consistency between 2018 and 2021 before changing in 2022, while others show progressive changes across all three time points.