In [13]:
# GENERATE CONSERVATION FOR HA

#Extract the HA sequences

from Bio import SeqIO

def extract_HA_sequences(fasta_files, gff_files, output_file, names=None):
    """Extract the HA gene from each genome and write them to one FASTA file.

    For every (FASTA, GFF) pair, the first GFF ``gene`` feature whose
    attribute column contains the text "HA" is located and the matching
    slice of the annotated sequence is collected. All collected slices are
    then written to ``output_file``, one record per genome.

    Parameters
    ----------
    fasta_files : list of str
        Genome FASTA paths, one per sample.
    gff_files : list of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the multi-FASTA file to create (overwritten if it exists).
    names : list of str, optional
        Record labels, in the same order as the input files. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``. Iteration stops at the
        shortest of the three lists.

    Raises
    ------
    KeyError
        If a matching GFF feature refers to a sequence ID that is not
        present in the paired FASTA file.

    Notes
    -----
    The gene is matched by plain substring search on the attribute column,
    so any ``gene`` feature whose attributes happen to contain "HA" matches.
    """
    import warnings

    if names is None:
        names = ["flu_2018", "flu_2021", "flu_2022"]

    ha_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # The attribute column (index 8) is read below, so a feature
                # line must carry all nine GFF columns; shorter lines are
                # skipped instead of raising IndexError.
                if len(fields) < 9:
                    continue
                if fields[2] == "gene" and "HA" in fields[8]:
                    seq_id = fields[0]
                    # GFF coordinates are 1-based inclusive; convert to a
                    # 0-based half-open Python slice.
                    start, end = int(fields[3]) - 1, int(fields[4])
                    # NOTE(review): the strand column is ignored, so a
                    # minus-strand gene would not be reverse-complemented --
                    # confirm all annotated genes are on the '+' strand.
                    ha_seq = genome_dict[seq_id][start:end]
                    ha_sequences.append((name, ha_seq))
                    break  # only the first matching feature is used
            else:
                # Without this the genome would silently vanish from the
                # output and from every downstream alignment.
                warnings.warn(
                    f"No HA gene feature found in {gff_file}; "
                    f"{name} is missing from {output_file}"
                )

    with open(output_file, "w") as out_fasta:
        for name, seq in ha_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Genome assemblies and their matching annotations, one pair per sampling year
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "HA_sequences.fasta"

# Collect the HA gene of every genome into a single multi-FASTA file
extract_HA_sequences(fasta_files=fasta_files, gff_files=gff_files, output_file=output_fasta)



#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    MAFFT is invoked through Biopython's ``MafftCommandline`` wrapper with
    only the input path set; whatever it prints to stdout is written to
    ``output_file``. Returns the path that was written.
    """
    run_mafft = MafftCommandline(input=input_file)
    # The wrapper returns (stdout, stderr); only the alignment text is kept.
    alignment_text, _ = run_mafft()
    with open(output_file, "w") as handle:
        handle.write(alignment_text)
    return output_file

# Align the extracted HA genes and remember where the alignment was saved
aligned_file = align_sequences(
    input_file="HA_sequences.fasta",
    output_file="HA_aligned_sequences.fasta",
)


#____________________________________________________________________________________________________________________________
# Calculate Conservation Scores


from Bio import AlignIO
import math
import numpy as np

def calculate_conservation_scores_to_wig(alignment_file, output_file, chrom="MT781554.2", start=1, step=1):
    """Score each alignment column for conservation and write a fixedStep WIG track.

    For every column, gaps ("-") and ambiguous bases ("N", in either case)
    are discarded. The score is ``1 - H / log2(k)``, where ``H`` is the
    Shannon entropy (in bits) of the remaining bases and ``k`` is the number
    of distinct bases seen in that column. A column with a single distinct
    base scores 1; a column with no usable bases scores 0.

    Parameters
    ----------
    alignment_file : str
        FASTA-formatted multiple sequence alignment.
    output_file : str
        Path of the WIG file to create (overwritten if it exists).
    chrom : str
        Sequence name written to the ``fixedStep`` declaration line.
    start : int
        Position assigned to the first alignment column.
    step : int
        Distance between consecutive positions.

    Notes
    -----
    The entropy is normalised by the number of bases observed in the column,
    not by the alphabet size, so any column whose variants are equally
    frequent scores 0.
    NOTE(review): one value is written per alignment column, gaps included,
    so positions match ``chrom`` coordinates only if the alignment has no
    insertions relative to ``chrom`` and the gene begins at ``start`` --
    confirm.
    """
    alignment = AlignIO.read(alignment_file, "fasta")
    seq_len = alignment.get_alignment_length()
    scores = []

    for i in range(seq_len):
        # MAFFT writes nucleotides in lower case by default, so normalise the
        # case first; otherwise "n" slips past the "N" filter and is scored
        # as if it were a real base.
        column = alignment[:, i].upper()
        valid_bases = [base for base in column if base not in {"-", "N"}]
        if not valid_bases:
            conservation_score = 0.0
        else:
            total = len(valid_bases)
            freq = {base: valid_bases.count(base) / total for base in set(valid_bases)}
            if len(freq) > 1:
                entropy = -sum(p * math.log2(p) for p in freq.values())
                conservation_score = 1 - entropy / math.log2(len(freq))
            else:
                # log2(1) == 0 would divide by zero; one distinct base means
                # the column is fully conserved.
                conservation_score = 1.0

        scores.append(conservation_score)

    with open(output_file, "w") as wig:
        wig.write("track type=wiggle_0 name=\"Conservation Scores\" description=\"Conservation scores for HA alignment\"\n")
        wig.write(f"fixedStep chrom={chrom} start={start} step={step}\n")
        wig.writelines(f"{score:.4f}\n" for score in scores)

    print(f"Conservation scores written to {output_file}")

# Score the MAFFT alignment, not the raw extracted sequences: column-wise
# conservation is only meaningful once the sequences have been aligned.
calculate_conservation_scores_to_wig("HA_aligned_sequences.fasta", "HA_conservation.wig")


Conservation scores written to HA_conservation.wig


In [14]:
# GENERATE CONSERVATION FOR NA

from Bio import SeqIO

def extract_NA_sequences(fasta_files, gff_files, output_file, names=None):
    """Extract the NA gene from each genome and write them to one FASTA file.

    For every (FASTA, GFF) pair, the first GFF ``gene`` feature whose
    attribute column contains the text "NA" is located and the matching
    slice of the annotated sequence is collected. All collected slices are
    then written to ``output_file``, one record per genome.

    Parameters
    ----------
    fasta_files : list of str
        Genome FASTA paths, one per sample.
    gff_files : list of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the multi-FASTA file to create (overwritten if it exists).
    names : list of str, optional
        Record labels, in the same order as the input files. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``. Iteration stops at the
        shortest of the three lists.

    Raises
    ------
    KeyError
        If a matching GFF feature refers to a sequence ID that is not
        present in the paired FASTA file.

    Notes
    -----
    The gene is matched by plain, case-sensitive substring search on the
    attribute column, so any ``gene`` feature whose attributes contain the
    letters "NA" anywhere (for example inside "RNA") also matches.
    """
    import warnings

    if names is None:
        names = ["flu_2018", "flu_2021", "flu_2022"]

    na_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # The attribute column (index 8) is read below, so a feature
                # line must carry all nine GFF columns; shorter lines are
                # skipped instead of raising IndexError.
                if len(fields) < 9:
                    continue
                if fields[2] == "gene" and "NA" in fields[8]:
                    seq_id = fields[0]
                    # GFF coordinates are 1-based inclusive; convert to a
                    # 0-based half-open Python slice.
                    start, end = int(fields[3]) - 1, int(fields[4])
                    # NOTE(review): the strand column is ignored, so a
                    # minus-strand gene would not be reverse-complemented --
                    # confirm all annotated genes are on the '+' strand.
                    na_seq = genome_dict[seq_id][start:end]
                    na_sequences.append((name, na_seq))
                    break  # only the first matching feature is used
            else:
                # Without this the genome would silently vanish from the
                # output and from every downstream alignment.
                warnings.warn(
                    f"No NA gene feature found in {gff_file}; "
                    f"{name} is missing from {output_file}"
                )

    with open(output_file, "w") as out_fasta:
        for name, seq in na_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Genome assemblies and their matching annotations, one pair per sampling year
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "NA_sequences.fasta"

# Collect the NA gene of every genome into a single multi-FASTA file
extract_NA_sequences(fasta_files=fasta_files, gff_files=gff_files, output_file=output_fasta)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Run MAFFT on ``input_file`` and store the alignment in ``output_file``.

    Only the input path is passed to Biopython's ``MafftCommandline``
    wrapper. The text MAFFT prints to stdout is saved verbatim, and the
    output path is returned.
    """
    mafft_result = MafftCommandline(input=input_file)()
    # Index 0 is stdout (the alignment); stderr at index 1 is not used.
    aligned_text = mafft_result[0]
    with open(output_file, "w") as out_handle:
        out_handle.write(aligned_text)
    return output_file

# Align the extracted NA genes and remember where the alignment was saved
aligned_file = align_sequences(
    input_file="NA_sequences.fasta",
    output_file="NA_aligned_sequences.fasta",
)

#____________________________________________________________________________________________________________________________
# Calculate Conservation Scores

from Bio import AlignIO
import math
import numpy as np

def calculate_conservation_scores_to_wig(alignment_file, output_file, chrom="MT781550.2", start=1, step=1):
    """Score each alignment column for conservation and write a fixedStep WIG track.

    For every column, gaps ("-") and ambiguous bases ("N", in either case)
    are discarded. The score is ``1 - H / log2(k)``, where ``H`` is the
    Shannon entropy (in bits) of the remaining bases and ``k`` is the number
    of distinct bases seen in that column. A column with a single distinct
    base scores 1; a column with no usable bases scores 0.

    Parameters
    ----------
    alignment_file : str
        FASTA-formatted multiple sequence alignment.
    output_file : str
        Path of the WIG file to create (overwritten if it exists).
    chrom : str
        Sequence name written to the ``fixedStep`` declaration line.
    start : int
        Position assigned to the first alignment column.
    step : int
        Distance between consecutive positions.

    Notes
    -----
    The entropy is normalised by the number of bases observed in the column,
    not by the alphabet size, so any column whose variants are equally
    frequent scores 0.
    NOTE(review): one value is written per alignment column, gaps included,
    so positions match ``chrom`` coordinates only if the alignment has no
    insertions relative to ``chrom`` and the gene begins at ``start`` --
    confirm.
    """
    alignment = AlignIO.read(alignment_file, "fasta")
    seq_len = alignment.get_alignment_length()
    scores = []

    for i in range(seq_len):
        # MAFFT writes nucleotides in lower case by default, so normalise the
        # case first; otherwise "n" slips past the "N" filter and is scored
        # as if it were a real base.
        column = alignment[:, i].upper()
        valid_bases = [base for base in column if base not in {"-", "N"}]
        if not valid_bases:
            conservation_score = 0.0
        else:
            total = len(valid_bases)
            freq = {base: valid_bases.count(base) / total for base in set(valid_bases)}
            if len(freq) > 1:
                entropy = -sum(p * math.log2(p) for p in freq.values())
                conservation_score = 1 - entropy / math.log2(len(freq))
            else:
                # log2(1) == 0 would divide by zero; one distinct base means
                # the column is fully conserved.
                conservation_score = 1.0

        scores.append(conservation_score)

    with open(output_file, "w") as wig:
        wig.write("track type=wiggle_0 name=\"Conservation Scores\" description=\"Conservation scores for NA alignment\"\n")
        wig.write(f"fixedStep chrom={chrom} start={start} step={step}\n")
        wig.writelines(f"{score:.4f}\n" for score in scores)

    print(f"Conservation scores written to {output_file}")

# Turn the NA alignment into a per-column conservation track
calculate_conservation_scores_to_wig(
    alignment_file="NA_aligned_sequences.fasta",
    output_file="NA_conservation.wig",
)

Conservation scores written to NA_conservation.wig


In [12]:
# GENERATE PB1 CONSERVATION SCORES

from Bio import SeqIO

def extract_pb1_sequences(fasta_files, gff_files, output_file, names=None):
    """Extract the PB1 gene from each genome and write them to one FASTA file.

    For every (FASTA, GFF) pair, the first GFF ``gene`` feature whose
    attribute column contains the text "PB1" is located and the matching
    slice of the annotated sequence is collected. All collected slices are
    then written to ``output_file``, one record per genome.

    Parameters
    ----------
    fasta_files : list of str
        Genome FASTA paths, one per sample.
    gff_files : list of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the multi-FASTA file to create (overwritten if it exists).
    names : list of str, optional
        Record labels, in the same order as the input files. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``. Iteration stops at the
        shortest of the three lists.

    Raises
    ------
    KeyError
        If a matching GFF feature refers to a sequence ID that is not
        present in the paired FASTA file.

    Notes
    -----
    The gene is matched by plain substring search on the attribute column,
    so a ``gene`` feature named e.g. "PB1-F2" also matches; only the first
    matching feature in file order is used.
    """
    import warnings

    if names is None:
        names = ["flu_2018", "flu_2021", "flu_2022"]

    pb1_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # The attribute column (index 8) is read below, so a feature
                # line must carry all nine GFF columns; shorter lines are
                # skipped instead of raising IndexError.
                if len(fields) < 9:
                    continue
                if fields[2] == "gene" and "PB1" in fields[8]:
                    seq_id = fields[0]
                    # GFF coordinates are 1-based inclusive; convert to a
                    # 0-based half-open Python slice.
                    start, end = int(fields[3]) - 1, int(fields[4])
                    # NOTE(review): the strand column is ignored, so a
                    # minus-strand gene would not be reverse-complemented --
                    # confirm all annotated genes are on the '+' strand.
                    pb1_seq = genome_dict[seq_id][start:end]
                    pb1_sequences.append((name, pb1_seq))
                    break  # only the first matching feature is used
            else:
                # Without this the genome would silently vanish from the
                # output and from every downstream alignment.
                warnings.warn(
                    f"No PB1 gene feature found in {gff_file}; "
                    f"{name} is missing from {output_file}"
                )

    with open(output_file, "w") as out_fasta:
        for name, seq in pb1_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Genome assemblies and their matching annotations, one pair per sampling year
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "pb1_sequences.fasta"

# Collect the PB1 gene of every genome into a single multi-FASTA file
extract_pb1_sequences(fasta_files=fasta_files, gff_files=gff_files, output_file=output_fasta)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Produce a MAFFT alignment of ``input_file`` and write it to disk.

    Biopython's ``MafftCommandline`` wrapper is built with the input path as
    its only option and then executed. Its stdout is written unchanged to
    ``output_file``, whose path is returned.
    """
    command = MafftCommandline(input=input_file)
    # Executing the wrapper yields (stdout, stderr); stderr is discarded.
    mafft_stdout, _unused_stderr = command()
    with open(output_file, "w") as sink:
        sink.write(mafft_stdout)
    return output_file

# Align the extracted PB1 genes and remember where the alignment was saved
aligned_file = align_sequences(
    input_file="pb1_sequences.fasta",
    output_file="pb1_aligned_sequences.fasta",
)

#____________________________________________________________________________________________________________________________
# Calculate Conservation Scores

from Bio import AlignIO
import math
import numpy as np

def calculate_conservation_scores_to_wig(alignment_file, output_file, chrom="MT781553.2", start=1, step=1):
    """Score each alignment column for conservation and write a fixedStep WIG track.

    For every column, gaps ("-") and ambiguous bases ("N", in either case)
    are discarded. The score is ``1 - H / log2(k)``, where ``H`` is the
    Shannon entropy (in bits) of the remaining bases and ``k`` is the number
    of distinct bases seen in that column. A column with a single distinct
    base scores 1; a column with no usable bases scores 0.

    Parameters
    ----------
    alignment_file : str
        FASTA-formatted multiple sequence alignment.
    output_file : str
        Path of the WIG file to create (overwritten if it exists).
    chrom : str
        Sequence name written to the ``fixedStep`` declaration line.
    start : int
        Position assigned to the first alignment column.
    step : int
        Distance between consecutive positions.

    Notes
    -----
    The entropy is normalised by the number of bases observed in the column,
    not by the alphabet size, so any column whose variants are equally
    frequent scores 0.
    NOTE(review): one value is written per alignment column, gaps included,
    so positions match ``chrom`` coordinates only if the alignment has no
    insertions relative to ``chrom`` and the gene begins at ``start`` --
    confirm.
    """
    alignment = AlignIO.read(alignment_file, "fasta")
    seq_len = alignment.get_alignment_length()
    scores = []

    for i in range(seq_len):
        # MAFFT writes nucleotides in lower case by default, so normalise the
        # case first; otherwise "n" slips past the "N" filter and is scored
        # as if it were a real base.
        column = alignment[:, i].upper()
        valid_bases = [base for base in column if base not in {"-", "N"}]
        if not valid_bases:
            conservation_score = 0.0
        else:
            total = len(valid_bases)
            freq = {base: valid_bases.count(base) / total for base in set(valid_bases)}
            if len(freq) > 1:
                entropy = -sum(p * math.log2(p) for p in freq.values())
                conservation_score = 1 - entropy / math.log2(len(freq))
            else:
                # log2(1) == 0 would divide by zero; one distinct base means
                # the column is fully conserved.
                conservation_score = 1.0

        scores.append(conservation_score)

    with open(output_file, "w") as wig:
        wig.write("track type=wiggle_0 name=\"Conservation Scores\" description=\"Conservation scores for PB1 alignment\"\n")
        wig.write(f"fixedStep chrom={chrom} start={start} step={step}\n")
        wig.writelines(f"{score:.4f}\n" for score in scores)

    print(f"Conservation scores written to {output_file}")

# Turn the PB1 alignment into a per-column conservation track
calculate_conservation_scores_to_wig(
    alignment_file="pb1_aligned_sequences.fasta",
    output_file="pb1_conservation.wig",
)


Conservation scores written to pb1_conservation.wig


In [15]:
#GENERATE PB2 CONSERVATION SCORES 

from Bio import SeqIO

from Bio import SeqIO

def extract_pb2_sequences(fasta_files, gff_files, output_file, names=None):
    """Extract the PB2 gene from each genome and write them to one FASTA file.

    For every (FASTA, GFF) pair, the first GFF ``gene`` feature whose
    attribute column contains the text "PB2" is located and the matching
    slice of the annotated sequence is collected. All collected slices are
    then written to ``output_file``, one record per genome.

    Parameters
    ----------
    fasta_files : list of str
        Genome FASTA paths, one per sample.
    gff_files : list of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the multi-FASTA file to create (overwritten if it exists).
    names : list of str, optional
        Record labels, in the same order as the input files. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``. Iteration stops at the
        shortest of the three lists.

    Raises
    ------
    KeyError
        If a matching GFF feature refers to a sequence ID that is not
        present in the paired FASTA file.

    Notes
    -----
    The gene is matched by plain substring search on the attribute column,
    so any ``gene`` feature whose attributes happen to contain "PB2" matches.
    """
    import warnings

    if names is None:
        names = ["flu_2018", "flu_2021", "flu_2022"]

    pb2_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # The attribute column (index 8) is read below, so a feature
                # line must carry all nine GFF columns; shorter lines are
                # skipped instead of raising IndexError.
                if len(fields) < 9:
                    continue
                if fields[2] == "gene" and "PB2" in fields[8]:
                    seq_id = fields[0]
                    # GFF coordinates are 1-based inclusive; convert to a
                    # 0-based half-open Python slice.
                    start, end = int(fields[3]) - 1, int(fields[4])
                    # NOTE(review): the strand column is ignored, so a
                    # minus-strand gene would not be reverse-complemented --
                    # confirm all annotated genes are on the '+' strand.
                    pb2_seq = genome_dict[seq_id][start:end]
                    pb2_sequences.append((name, pb2_seq))
                    break  # only the first matching feature is used
            else:
                # Without this the genome would silently vanish from the
                # output and from every downstream alignment.
                warnings.warn(
                    f"No PB2 gene feature found in {gff_file}; "
                    f"{name} is missing from {output_file}"
                )

    with open(output_file, "w") as out_fasta:
        for name, seq in pb2_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Genome assemblies and their matching annotations, one pair per sampling year
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "pb2_sequences.fasta"

# Collect the PB2 gene of every genome into a single multi-FASTA file
extract_pb2_sequences(fasta_files=fasta_files, gff_files=gff_files, output_file=output_fasta)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align ``input_file`` with MAFFT, write the alignment, return its path.

    The call goes through Biopython's ``MafftCommandline`` wrapper with the
    input path as the sole option. MAFFT's stdout is what ends up in
    ``output_file``.
    """
    mafft_job = MafftCommandline(input=input_file)
    # The job returns a (stdout, stderr) pair; only stdout is written out.
    captured_out, _ = mafft_job()
    with open(output_file, "w") as fasta_out:
        fasta_out.write(captured_out)
    return output_file

# Align the extracted PB2 genes and remember where the alignment was saved
aligned_file = align_sequences(
    input_file="pb2_sequences.fasta",
    output_file="pb2_aligned_sequences.fasta",
)

#____________________________________________________________________________________________________________________________
# Calculate Conservation Scores

from Bio import AlignIO
import math
import numpy as np

def calculate_conservation_scores_to_wig(alignment_file, output_file, chrom="MT781552.2", start=1, step=1):
    """Score each alignment column for conservation and write a fixedStep WIG track.

    For every column, gaps ("-") and ambiguous bases ("N", in either case)
    are discarded. The score is ``1 - H / log2(k)``, where ``H`` is the
    Shannon entropy (in bits) of the remaining bases and ``k`` is the number
    of distinct bases seen in that column. A column with a single distinct
    base scores 1; a column with no usable bases scores 0.

    Parameters
    ----------
    alignment_file : str
        FASTA-formatted multiple sequence alignment.
    output_file : str
        Path of the WIG file to create (overwritten if it exists).
    chrom : str
        Sequence name written to the ``fixedStep`` declaration line.
    start : int
        Position assigned to the first alignment column.
    step : int
        Distance between consecutive positions.

    Notes
    -----
    The entropy is normalised by the number of bases observed in the column,
    not by the alphabet size, so any column whose variants are equally
    frequent scores 0.
    NOTE(review): one value is written per alignment column, gaps included,
    so positions match ``chrom`` coordinates only if the alignment has no
    insertions relative to ``chrom`` and the gene begins at ``start`` --
    confirm.
    """
    alignment = AlignIO.read(alignment_file, "fasta")
    seq_len = alignment.get_alignment_length()
    scores = []

    for i in range(seq_len):
        # MAFFT writes nucleotides in lower case by default, so normalise the
        # case first; otherwise "n" slips past the "N" filter and is scored
        # as if it were a real base.
        column = alignment[:, i].upper()
        valid_bases = [base for base in column if base not in {"-", "N"}]
        if not valid_bases:
            conservation_score = 0.0
        else:
            total = len(valid_bases)
            freq = {base: valid_bases.count(base) / total for base in set(valid_bases)}
            if len(freq) > 1:
                entropy = -sum(p * math.log2(p) for p in freq.values())
                conservation_score = 1 - entropy / math.log2(len(freq))
            else:
                # log2(1) == 0 would divide by zero; one distinct base means
                # the column is fully conserved.
                conservation_score = 1.0

        scores.append(conservation_score)

    with open(output_file, "w") as wig:
        wig.write("track type=wiggle_0 name=\"Conservation Scores\" description=\"Conservation scores for PB2 alignment\"\n")
        wig.write(f"fixedStep chrom={chrom} start={start} step={step}\n")
        wig.writelines(f"{score:.4f}\n" for score in scores)

    print(f"Conservation scores written to {output_file}")

# Turn the PB2 alignment into a per-column conservation track
calculate_conservation_scores_to_wig(
    alignment_file="pb2_aligned_sequences.fasta",
    output_file="pb2_conservation.wig",
)

Conservation scores written to pb2_conservation.wig
