In [1]:
# GENERATE alignment files for HA

#Extract the HA sequences

from Bio import SeqIO

def extract_HA_sequences(fasta_files, gff_files, output_file, names=None):
    """Write the HA gene sequence of every sample to one multi-FASTA file.

    For each (FASTA, GFF) pair, the first ``gene`` row of the GFF whose
    attribute column names ``HA`` as a whole token is located, the matching
    region is sliced out of the FASTA record named in that row, and the
    result is written to ``output_file`` under the sample's label.

    Parameters
    ----------
    fasta_files : sequence of str
        Genome FASTA paths, one per sample.
    gff_files : sequence of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the FASTA file to write (overwritten if it exists).
    names : sequence of str, optional
        FASTA header label for each sample. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``.

    Raises
    ------
    ValueError
        If ``fasta_files`` and ``gff_files`` differ in length, or there are
        fewer labels than samples (either would silently drop samples).
    KeyError
        If a GFF row refers to a sequence ID that is absent from the FASTA.

    Warns
    -----
    UserWarning
        If no HA gene row is found for a sample; that sample is omitted.
    """
    import re
    import warnings

    fasta_files, gff_files = list(fasta_files), list(gff_files)
    names = ["flu_2018", "flu_2021", "flu_2022"] if names is None else list(names)
    if len(fasta_files) != len(gff_files):
        raise ValueError(
            f"Got {len(fasta_files)} FASTA files but {len(gff_files)} GFF files"
        )
    if len(names) < len(fasta_files):
        raise ValueError(
            f"Got {len(fasta_files)} samples but only {len(names)} names"
        )

    # Match the gene symbol as a whole token. A plain substring test would
    # also accept longer words that merely contain these letters; the
    # trailing '-' exclusion rejects hyphenated names such as "HA-like".
    gene_token = re.compile(r"(?<![A-Za-z0-9])HA(?![A-Za-z0-9-])")

    ha_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        found = False
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # A feature row needs all nine GFF columns; shorter rows
                # would raise IndexError on the attribute column below.
                if len(fields) < 9 or fields[2] != "gene":
                    continue
                if not gene_token.search(fields[8]):
                    continue
                seq_id = fields[0]
                if seq_id not in genome_dict:
                    raise KeyError(
                        f"{gff_file}: sequence {seq_id!r} is not present in {fasta_file}"
                    )
                # GFF coordinates are 1-based and inclusive.
                start, end = int(fields[3]) - 1, int(fields[4])
                ha_seq = genome_dict[seq_id][start:end]
                # Column 7 is the strand: return minus-strand genes in coding
                # orientation so all samples are aligned the same way round.
                if fields[6] == "-":
                    ha_seq = ha_seq.reverse_complement()
                ha_sequences.append((name, ha_seq))
                found = True
                break  # only the first matching gene per sample is used
        if not found:
            warnings.warn(
                f"No HA gene feature found in {gff_file}; "
                f"{name} is omitted from {output_file}"
            )

    with open(output_file, "w") as out_fasta:
        for name, seq in ha_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Per-sample inputs; both lists are in the same sample order
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "HA_sequences.fasta"

# Collect the HA gene of every sample into a single FASTA file
extract_HA_sequences(
    fasta_files=fasta_files,
    gff_files=gff_files,
    output_file=output_fasta,
)



#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    MAFFT is run directly through ``subprocess`` instead of Biopython's
    ``MafftCommandline`` wrapper, which Biopython has deprecated in favour
    of invoking the tool yourself.

    Parameters
    ----------
    input_file : str
        Path to a FASTA file of unaligned sequences.
    output_file : str, optional
        Path the alignment (MAFFT's standard output) is written to.

    Returns
    -------
    str
        ``output_file``.

    Raises
    ------
    FileNotFoundError
        If the ``mafft`` executable cannot be found on ``PATH``.
    RuntimeError
        If MAFFT exits with a non-zero status; the message carries MAFFT's
        stderr so the cause is visible in the notebook.
    """
    import subprocess  # local import keeps this cell runnable on its own

    try:
        completed = subprocess.run(
            ["mafft", str(input_file)],
            capture_output=True,  # MAFFT logs progress on stderr; keep it out of the cell output
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            f"MAFFT failed with exit code {exc.returncode} for {input_file!r}:\n{exc.stderr}"
        ) from exc

    # Write only after MAFFT succeeded, so a failed run leaves no output file.
    with open(output_file, "w") as aligned:
        aligned.write(completed.stdout)
    return output_file

# Align the extracted HA sequences and remember where the alignment was written
aligned_file = align_sequences(
    input_file="HA_sequences.fasta",
    output_file="HA_aligned_sequences.fasta",
)






Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [2]:
# GENERATE alignment files for NA

from Bio import SeqIO

def extract_NA_sequences(fasta_files, gff_files, output_file, names=None):
    """Write the NA gene sequence of every sample to one multi-FASTA file.

    For each (FASTA, GFF) pair, the first ``gene`` row of the GFF whose
    attribute column names ``NA`` as a whole token is located, the matching
    region is sliced out of the FASTA record named in that row, and the
    result is written to ``output_file`` under the sample's label.

    Parameters
    ----------
    fasta_files : sequence of str
        Genome FASTA paths, one per sample.
    gff_files : sequence of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the FASTA file to write (overwritten if it exists).
    names : sequence of str, optional
        FASTA header label for each sample. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``.

    Raises
    ------
    ValueError
        If ``fasta_files`` and ``gff_files`` differ in length, or there are
        fewer labels than samples (either would silently drop samples).
    KeyError
        If a GFF row refers to a sequence ID that is absent from the FASTA.

    Warns
    -----
    UserWarning
        If no NA gene row is found for a sample; that sample is omitted.
    """
    import re
    import warnings

    fasta_files, gff_files = list(fasta_files), list(gff_files)
    names = ["flu_2018", "flu_2021", "flu_2022"] if names is None else list(names)
    if len(fasta_files) != len(gff_files):
        raise ValueError(
            f"Got {len(fasta_files)} FASTA files but {len(gff_files)} GFF files"
        )
    if len(names) < len(fasta_files):
        raise ValueError(
            f"Got {len(fasta_files)} samples but only {len(names)} names"
        )

    # Match the gene symbol as a whole token. A plain substring test for
    # "NA" would also accept any attribute containing "RNA" or "DNA"
    # (e.g. a polymerase product description) and pick the wrong gene.
    gene_token = re.compile(r"(?<![A-Za-z0-9])NA(?![A-Za-z0-9-])")

    na_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        found = False
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # A feature row needs all nine GFF columns; shorter rows
                # would raise IndexError on the attribute column below.
                if len(fields) < 9 or fields[2] != "gene":
                    continue
                if not gene_token.search(fields[8]):
                    continue
                seq_id = fields[0]
                if seq_id not in genome_dict:
                    raise KeyError(
                        f"{gff_file}: sequence {seq_id!r} is not present in {fasta_file}"
                    )
                # GFF coordinates are 1-based and inclusive.
                start, end = int(fields[3]) - 1, int(fields[4])
                na_seq = genome_dict[seq_id][start:end]
                # Column 7 is the strand: return minus-strand genes in coding
                # orientation so all samples are aligned the same way round.
                if fields[6] == "-":
                    na_seq = na_seq.reverse_complement()
                na_sequences.append((name, na_seq))
                found = True
                break  # only the first matching gene per sample is used
        if not found:
            warnings.warn(
                f"No NA gene feature found in {gff_file}; "
                f"{name} is omitted from {output_file}"
            )

    with open(output_file, "w") as out_fasta:
        for name, seq in na_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Per-sample inputs; both lists are in the same sample order
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "NA_sequences.fasta"

# Collect the NA gene of every sample into a single FASTA file
extract_NA_sequences(
    fasta_files=fasta_files,
    gff_files=gff_files,
    output_file=output_fasta,
)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    MAFFT is run directly through ``subprocess`` instead of Biopython's
    ``MafftCommandline`` wrapper, which Biopython has deprecated in favour
    of invoking the tool yourself.

    Parameters
    ----------
    input_file : str
        Path to a FASTA file of unaligned sequences.
    output_file : str, optional
        Path the alignment (MAFFT's standard output) is written to.

    Returns
    -------
    str
        ``output_file``.

    Raises
    ------
    FileNotFoundError
        If the ``mafft`` executable cannot be found on ``PATH``.
    RuntimeError
        If MAFFT exits with a non-zero status; the message carries MAFFT's
        stderr so the cause is visible in the notebook.
    """
    import subprocess  # local import keeps this cell runnable on its own

    try:
        completed = subprocess.run(
            ["mafft", str(input_file)],
            capture_output=True,  # MAFFT logs progress on stderr; keep it out of the cell output
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            f"MAFFT failed with exit code {exc.returncode} for {input_file!r}:\n{exc.stderr}"
        ) from exc

    # Write only after MAFFT succeeded, so a failed run leaves no output file.
    with open(output_file, "w") as aligned:
        aligned.write(completed.stdout)
    return output_file

# Align the extracted NA sequences and remember where the alignment was written
aligned_file = align_sequences(
    input_file="NA_sequences.fasta",
    output_file="NA_aligned_sequences.fasta",
)



In [3]:
# GENERATE alignment files for PB1 

from Bio import SeqIO

def extract_pb1_sequences(fasta_files, gff_files, output_file, names=None):
    """Write the PB1 gene sequence of every sample to one multi-FASTA file.

    For each (FASTA, GFF) pair, the first ``gene`` row of the GFF whose
    attribute column names ``PB1`` as a whole token is located, the matching
    region is sliced out of the FASTA record named in that row, and the
    result is written to ``output_file`` under the sample's label.

    Parameters
    ----------
    fasta_files : sequence of str
        Genome FASTA paths, one per sample.
    gff_files : sequence of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the FASTA file to write (overwritten if it exists).
    names : sequence of str, optional
        FASTA header label for each sample. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``.

    Raises
    ------
    ValueError
        If ``fasta_files`` and ``gff_files`` differ in length, or there are
        fewer labels than samples (either would silently drop samples).
    KeyError
        If a GFF row refers to a sequence ID that is absent from the FASTA.

    Warns
    -----
    UserWarning
        If no PB1 gene row is found for a sample; that sample is omitted.
    """
    import re
    import warnings

    fasta_files, gff_files = list(fasta_files), list(gff_files)
    names = ["flu_2018", "flu_2021", "flu_2022"] if names is None else list(names)
    if len(fasta_files) != len(gff_files):
        raise ValueError(
            f"Got {len(fasta_files)} FASTA files but {len(gff_files)} GFF files"
        )
    if len(names) < len(fasta_files):
        raise ValueError(
            f"Got {len(fasta_files)} samples but only {len(names)} names"
        )

    # Match the gene symbol as a whole token. A plain substring test for
    # "PB1" would also accept the overlapping "PB1-F2" gene; the trailing
    # '-' exclusion rejects such hyphenated names.
    gene_token = re.compile(r"(?<![A-Za-z0-9])PB1(?![A-Za-z0-9-])")

    pb1_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        found = False
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # A feature row needs all nine GFF columns; shorter rows
                # would raise IndexError on the attribute column below.
                if len(fields) < 9 or fields[2] != "gene":
                    continue
                if not gene_token.search(fields[8]):
                    continue
                seq_id = fields[0]
                if seq_id not in genome_dict:
                    raise KeyError(
                        f"{gff_file}: sequence {seq_id!r} is not present in {fasta_file}"
                    )
                # GFF coordinates are 1-based and inclusive.
                start, end = int(fields[3]) - 1, int(fields[4])
                pb1_seq = genome_dict[seq_id][start:end]
                # Column 7 is the strand: return minus-strand genes in coding
                # orientation so all samples are aligned the same way round.
                if fields[6] == "-":
                    pb1_seq = pb1_seq.reverse_complement()
                pb1_sequences.append((name, pb1_seq))
                found = True
                break  # only the first matching gene per sample is used
        if not found:
            warnings.warn(
                f"No PB1 gene feature found in {gff_file}; "
                f"{name} is omitted from {output_file}"
            )

    with open(output_file, "w") as out_fasta:
        for name, seq in pb1_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Per-sample inputs; both lists are in the same sample order
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "pb1_sequences.fasta"

# Collect the PB1 gene of every sample into a single FASTA file
extract_pb1_sequences(
    fasta_files=fasta_files,
    gff_files=gff_files,
    output_file=output_fasta,
)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    MAFFT is run directly through ``subprocess`` instead of Biopython's
    ``MafftCommandline`` wrapper, which Biopython has deprecated in favour
    of invoking the tool yourself.

    Parameters
    ----------
    input_file : str
        Path to a FASTA file of unaligned sequences.
    output_file : str, optional
        Path the alignment (MAFFT's standard output) is written to.

    Returns
    -------
    str
        ``output_file``.

    Raises
    ------
    FileNotFoundError
        If the ``mafft`` executable cannot be found on ``PATH``.
    RuntimeError
        If MAFFT exits with a non-zero status; the message carries MAFFT's
        stderr so the cause is visible in the notebook.
    """
    import subprocess  # local import keeps this cell runnable on its own

    try:
        completed = subprocess.run(
            ["mafft", str(input_file)],
            capture_output=True,  # MAFFT logs progress on stderr; keep it out of the cell output
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            f"MAFFT failed with exit code {exc.returncode} for {input_file!r}:\n{exc.stderr}"
        ) from exc

    # Write only after MAFFT succeeded, so a failed run leaves no output file.
    with open(output_file, "w") as aligned:
        aligned.write(completed.stdout)
    return output_file

# Align the extracted PB1 sequences and remember where the alignment was written
aligned_file = align_sequences(
    input_file="pb1_sequences.fasta",
    output_file="pb1_aligned_sequences.fasta",
)



In [4]:
# GENERATE alignment files for PB2


from Bio import SeqIO

def extract_pb2_sequences(fasta_files, gff_files, output_file, names=None):
    """Write the PB2 gene sequence of every sample to one multi-FASTA file.

    For each (FASTA, GFF) pair, the first ``gene`` row of the GFF whose
    attribute column names ``PB2`` as a whole token is located, the matching
    region is sliced out of the FASTA record named in that row, and the
    result is written to ``output_file`` under the sample's label.

    Parameters
    ----------
    fasta_files : sequence of str
        Genome FASTA paths, one per sample.
    gff_files : sequence of str
        GFF annotation paths, in the same order as ``fasta_files``.
    output_file : str
        Path of the FASTA file to write (overwritten if it exists).
    names : sequence of str, optional
        FASTA header label for each sample. Defaults to
        ``["flu_2018", "flu_2021", "flu_2022"]``.

    Raises
    ------
    ValueError
        If ``fasta_files`` and ``gff_files`` differ in length, or there are
        fewer labels than samples (either would silently drop samples).
    KeyError
        If a GFF row refers to a sequence ID that is absent from the FASTA.

    Warns
    -----
    UserWarning
        If no PB2 gene row is found for a sample; that sample is omitted.
    """
    import re
    import warnings

    fasta_files, gff_files = list(fasta_files), list(gff_files)
    names = ["flu_2018", "flu_2021", "flu_2022"] if names is None else list(names)
    if len(fasta_files) != len(gff_files):
        raise ValueError(
            f"Got {len(fasta_files)} FASTA files but {len(gff_files)} GFF files"
        )
    if len(names) < len(fasta_files):
        raise ValueError(
            f"Got {len(fasta_files)} samples but only {len(names)} names"
        )

    # Match the gene symbol as a whole token. A plain substring test would
    # also accept identifiers that merely contain these characters (for
    # example inside a locus tag); the trailing '-' exclusion rejects
    # hyphenated names.
    gene_token = re.compile(r"(?<![A-Za-z0-9])PB2(?![A-Za-z0-9-])")

    pb2_sequences = []
    for fasta_file, gff_file, name in zip(fasta_files, gff_files, names):
        genome_dict = {record.id: record.seq for record in SeqIO.parse(fasta_file, "fasta")}
        found = False
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith("#"):
                    continue
                fields = line.strip().split("\t")
                # A feature row needs all nine GFF columns; shorter rows
                # would raise IndexError on the attribute column below.
                if len(fields) < 9 or fields[2] != "gene":
                    continue
                if not gene_token.search(fields[8]):
                    continue
                seq_id = fields[0]
                if seq_id not in genome_dict:
                    raise KeyError(
                        f"{gff_file}: sequence {seq_id!r} is not present in {fasta_file}"
                    )
                # GFF coordinates are 1-based and inclusive.
                start, end = int(fields[3]) - 1, int(fields[4])
                pb2_seq = genome_dict[seq_id][start:end]
                # Column 7 is the strand: return minus-strand genes in coding
                # orientation so all samples are aligned the same way round.
                if fields[6] == "-":
                    pb2_seq = pb2_seq.reverse_complement()
                pb2_sequences.append((name, pb2_seq))
                found = True
                break  # only the first matching gene per sample is used
        if not found:
            warnings.warn(
                f"No PB2 gene feature found in {gff_file}; "
                f"{name} is omitted from {output_file}"
            )

    with open(output_file, "w") as out_fasta:
        for name, seq in pb2_sequences:
            out_fasta.write(f">{name}\n{seq}\n")

# Per-sample inputs; both lists are in the same sample order
fasta_files = [
    "flu_2018.fa",
    "flu_2021.fa",
    "flu_2022.fa",
]
gff_files = [
    "flu_2018.gff",
    "flu_2021.gff",
    "flu_2022.gff",
]
output_fasta = "pb2_sequences.fasta"

# Collect the PB2 gene of every sample into a single FASTA file
extract_pb2_sequences(
    fasta_files=fasta_files,
    gff_files=gff_files,
    output_file=output_fasta,
)


#____________________________________________________________________________________________________________________________
#Get Mafft to align the sequences

from Bio.Align.Applications import MafftCommandline
import os

def align_sequences(input_file, output_file="aligned_sequences.fasta"):
    """Align the sequences in ``input_file`` with MAFFT and save the result.

    MAFFT is run directly through ``subprocess`` instead of Biopython's
    ``MafftCommandline`` wrapper, which Biopython has deprecated in favour
    of invoking the tool yourself.

    Parameters
    ----------
    input_file : str
        Path to a FASTA file of unaligned sequences.
    output_file : str, optional
        Path the alignment (MAFFT's standard output) is written to.

    Returns
    -------
    str
        ``output_file``.

    Raises
    ------
    FileNotFoundError
        If the ``mafft`` executable cannot be found on ``PATH``.
    RuntimeError
        If MAFFT exits with a non-zero status; the message carries MAFFT's
        stderr so the cause is visible in the notebook.
    """
    import subprocess  # local import keeps this cell runnable on its own

    try:
        completed = subprocess.run(
            ["mafft", str(input_file)],
            capture_output=True,  # MAFFT logs progress on stderr; keep it out of the cell output
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            f"MAFFT failed with exit code {exc.returncode} for {input_file!r}:\n{exc.stderr}"
        ) from exc

    # Write only after MAFFT succeeded, so a failed run leaves no output file.
    with open(output_file, "w") as aligned:
        aligned.write(completed.stdout)
    return output_file

# Align the extracted PB2 sequences and remember where the alignment was written
aligned_file = align_sequences(
    input_file="pb2_sequences.fasta",
    output_file="pb2_aligned_sequences.fasta",
)

