In [2]:
!pip install biopython pandas muscle

Collecting muscle
  Using cached muscle-0.0.4-py3-none-any.whl (3.8 kB)
Installing collected packages: muscle
Successfully installed muscle-0.0.4


In [3]:
import pandas as pd
from Bio import Seq
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align.Applications import MuscleCommandline

def create_pb2_functional_regions():
    """Define functional regions of PB2 protein based on literature"""
    return {
        # Cap-binding domain regions[2][4]
        'Cap_binding': [(318, 483)],
        
        # Nuclear localization signal[2][4]
        'NLS_domain': [(736, 759)],
        
        # 627 domain - important for host adaptation[3][4]
        '627_domain': [(535, 759)],
        
        # N-terminal region for viral RNA replication[2]
        'N_terminal': [(1, 247)],
        
        # Mid-link domain[4]
        'Mid_link': [(248, 317)],
        
        # Fusion peptide region (based on mutation analysis)
        'Fusion_peptide': [(1, 23)],
        
        # Stem region (based on mutation analysis)
        'Stem_region': [(40, 130)],
        
        # RBS region (based on mutation analysis)
        'RBS_220_loop': [(220, 229)],
        
        # Sa antigenic region (based on mutation analysis)
        'Sa_antigenic': [(124, 164)]
    }
    
def find_mutations(seq1, seq2, seq3):
    """Find mutations between three sequences"""
    mutations = []
    for i in range(len(seq1)):
        if not (seq1[i] == seq2[i] == seq3[i]):
            mutations.append({
                'position': i + 1,
                'seq2018_aa': seq1[i],
                'seq2021_aa': seq2[i],
                'seq2022_aa': seq3[i]
            })
    return mutations
    
def map_mutations_to_regions(mutations, regions):
    """Map mutations to functional regions"""
    mapped_mutations = []
    for mut in mutations:
        pos = mut['position']
        region_found = False
        for region_name, ranges in regions.items():
            for start, end in ranges:
                if start <= pos <= end:
                    mapped_mutations.append({
                        'position': pos,
                        'seq2018_aa': mut['seq2018_aa'],
                        'seq2021_aa': mut['seq2021_aa'],
                        'seq2022_aa': mut['seq2022_aa'],
                        'region': region_name
                    })
                    region_found = True
                    break
            if region_found:
                break
        if not region_found:
            mapped_mutations.append({
                'position': pos,
                'seq2018_aa': mut['seq2018_aa'],
                'seq2021_aa': mut['seq2021_aa'],
                'seq2022_aa': mut['seq2022_aa'],
                'region': 'Non-functional region'
            })
    return mapped_mutations
def analyze_pb2_sequences(seq1, seq2, seq3):
    """Analyze three PB2 sequences"""
    # Create sequence records
    records = [
        SeqRecord(Seq(seq1), id="sequence1"),
        SeqRecord(Seq(seq2), id="sequence2"),
        SeqRecord(Seq(seq3), id="sequence3")
    ]
    
    # Write sequences to file for MUSCLE
    with open("input_sequences.fasta", "w") as f:
        for record in records:
            f.write(f">{record.id}\n{record.seq}\n")
    
    # Perform alignment using MUSCLE
    muscle_cline = MuscleCommandline(input="input_sequences.fasta", out="aligned.fasta")
    muscle_cline()
    
    # Read aligned sequences
    alignment = AlignIO.read("aligned.fasta", "fasta")
    seq1_aligned = str(alignment[0].seq)
    seq2_aligned = str(alignment[1].seq)
    seq3_aligned = str(alignment[2].seq)
    
    # Find and map mutations
    mutations = find_mutations(seq1_aligned, seq2_aligned, seq3_aligned)
    regions = create_pb2_functional_regions()
    mapped_mutations = map_mutations_to_regions(mutations, regions)
    
    # Create DataFrame
    df = pd.DataFrame(mapped_mutations)
    return df

def main():
    sequence2018 = """MERVKELRDLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPALRMKWMMAMKYPITADKRIMEMIPERNEQGQTLWSKTNDAGSDRVMVSPLAVTWWNRNGPTTSTVHYPKVYKTYFEKVERLKHGTFGPVHFRNQVKIRRRVDINPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLTITKEKKEELQDCKIAPLMVAYMLERELVRKTRFLPVAGGTSSVYIEVLHLTQGTCWEQMYTPGGDVRNDDVDQSLIIAARNIVRRAVVSADPLASLLEMCHSTQIGGIRMVDILRQNPTEEQAVDICKAAMGLRISSSFSFGGFTFKRTSGSSVKREEEVLTGNLQTLKIRVHEGYEEFTMVGRRATAILRKATRRLIQLIVSGKDEQSIAEAIIVAMVFSQEDCMVKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGIEPIDNVMGMIGILPDMTPSTEMSLRGVRVSKMGVDEYSSTERVVVSIDRFLRVRDQRGNVLLSPEEVSETQGTEKLTITYSSSMMWEINGPESVLVNTYQWIIRNWETVKIQWSQDPTMLYNKMEFEPFQSLVPKAARGQYSGFVRTLFQQMRDVLGTFDTVQIIKLLPFAAAPPEQSRMQFSSLTVNVRGSGMRILVRGNSPVFNYNKATKRLTVLGKDAGALTEDPDEGTAGVESAVLRGFLILGKEDKRYGPALSINELSNLAKGEKANVLIGQGDVVLVMKRKRDSSILTDSQTATKRIRMAIN"""  
    sequence2021 = """MERIKELRDLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPALRMKWMMAMRYPITADKRIMEMIPERNEQGQTLWSKTNDAGSDRVMVSPLAVTWWNRNGPTTSTVHYPKVYKTYFEKVERLKHGTFGPVHFRNQVKIRRRVDTNPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLTITREKKKELQGCKIAPLMVAYMLERELVRKTRFLPVAGGTSSVYIEVLHLTQGTCWEQMYTPGGVVRNDDVDQSLIIAARNIVRRATVSADPLASLLEMCHSTQIGGVRMVDILKQNPTEEQAVDICKAAMGLRISSSFSFGGFTFKRTSGSSVKKEEEVLTGNLQTLKIRIHEGYEEFTMVGRRATAILRKATRRLIQLIVSGRDEQSIAEAVIVAMVFSQEDCMIKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGIEPIDNVMGMIGILPDMTPSTEMSLRGVRVSKMGVDEYSSTERVVVSIDRFLRVRDQKGNVLLSPEEVSEAQGTEKLTITYSSSMMWEINGPESVLVNTYQWIIRNWETVKIQWSQDPTMLYNKMEFEPFQSLVPKAARGQYSGFVRTLFQQMRDVLGTFDTVQIIKLLPFAAAPPEQSKMQFSSLTVNVRGSGLRILVRGNSPVFNYNRATKRLSILGKDAGALTEDPDEGTSGVESAVLRGFLILGKEDKRYGPALSINELSNLAKGEKANVLIGQGDIVLVMKRKRDSSILTDSQTATKRIRMAIN""" 
    sequence2022 = """MERIKELRDLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPALRMKWMMAMKYPITADKRIMEMIPERNEQGQTLWSKTNDAGSDRVMVSPLAVTWWNRNGPTTSTVHYPKVYKTYFEKVERLKHGTFGPVHFRNQVKIRRRVDINPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLTITKEKKEELQDCKIAPLMVAYMLERELVRKTRFLPVAGGTSSVYIEVLHLTQGTCWEQMYTPGGEVRNDDVDQSLIIAARNIVRRATVSADPLASLLEMCHSTQIGGIRMVDILRQNPTEEQAVDICKAAMGLRISSSFSFGGFTFKRTSGSSVKREEEVLTGNLQTLKIRVHEGYEEFTMVGRRATAILRKATRRLIQLIVSGRDEQSIAEAIIVAMVFSQEDCMIKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGIEPIDNVMGMIGILPDMTPSTEMSLRGVRVSRMGVDEYSSTERVVVSIDRFLRVRDQRGNVLLSPEEVSETQGTEKLTITYLSSMMWEINGPESVLVNTYQWIIRNWETVKIQWSQDPTMLYNRMEFEPFQSLVPKAARGQYSGFVRTLFQQMRDVLGTFDTVQIIKLLPFAAAPPEQSRMQFSSLTVNVRGSGMRILVRGNSPVFNYNKATKRLIVLGKDAGALTEDPGEGTAGVESAVLRGFLILGKEDKRYGPALSINELSNLAKGEKANVLIGQGDVVLVMKRKRDSSILTDSQTATKRIRMAIN"""  
    
    results = analyze_pb2_sequences(sequence2018, sequence2021, sequence2022)
    
    # Print results
    print("\nMutation Analysis Results:")
    print(results)
    
    # Save to CSV
    results.to_csv('PB2mutation_analysis.csv', index=False)
    print("\nResults saved to 'PB2mutation_analysis.csv'")

if __name__ == "__main__":
    main()


Mutation Analysis Results:
    position seq2018_aa seq2021_aa seq2022_aa                 region
0          4          I          V          I             N_terminal
1         54          R          K          K             N_terminal
2        147          T          I          I             N_terminal
3        187          R          K          K             N_terminal
4        191          K          E          E             N_terminal
5        195          G          D          D             N_terminal
6        249          V          D          E               Mid_link
7        271          T          V          T               Mid_link
8        292          V          I          I               Mid_link
9        299          K          R          R               Mid_link
10       340          K          R          R            Cap_binding
11       356          I          V          V            Cap_binding
12       389          R          K          R            Cap_binding
13    