In [30]:
from Bio import SeqIO
# Reference PB2 sequence: the sequence of the first record in victoria-pb2.fasta.
# Only that first record is pulled from the parser instead of loading every
# record into a list just to index element 0.
vic = next(SeqIO.parse("victoria-pb2.fasta", "fasta")).seq

def get_seqs(variants_file):
    """
    Read every record of a PB2 fasta file into memory.

    variants_file: fasta file of PB2 sequences, handed to SeqIO.parse

    Returns: list holding each SeqRecord from the file, in the order
                SeqIO.parse yields them
    """
    return list(SeqIO.parse(variants_file, "fasta"))
def generate_tiles(get_seqs, tile_length, overlap, vic):
    """
    Cut every PB2 sequence into tiles flanked by reference overlaps.

    Tiles start every ``tile_length + overlap`` bases. Each tile is the
    ``overlap`` bases of ``vic`` just before the start position, then
    ``tile_length`` bases of the variant sequence, then the following
    ``overlap`` bases of ``vic``; consecutive tiles therefore share the same
    ``vic``-derived flank. Flanks or variable regions that would run past
    either end of a sequence are simply shorter (or empty).

    get_seqs: iterable of records exposing ``.id`` and ``.seq`` (for example
                the list returned by the ``get_seqs`` function, which this
                parameter shadows inside the body)
    tile_length (int): the length of variable DNA per fragment, must be > 0
    overlap (int): length of overlap between tile fragments, must be >= 0
    vic: PB2 sequence to use for overlap; must support slicing and
                concatenation with the record sequences

    Returns: dictionary where keys are the id of the original PB2 sequence
                and values are tuples of lists of the tiles generated from those sequences,
                where the first list is the forward tiles and the second is the
                reverse complement of each forward tile

    Raises: ValueError if tile_length is not positive or overlap is negative
    """
    if tile_length <= 0:
        raise ValueError("tile_length must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    step = tile_length + overlap
    tiles = {}
    for record in get_seqs:
        variant = record.seq
        forward_tiles = []
        reverse_tiles = []
        # Walk the full length of the sequence so a single trailing base
        # still ends up in a tile instead of being silently dropped.
        for start in range(0, len(variant), step):
            core_end = start + tile_length
            # Clamp at 0: the first tile has no upstream flank, and a negative
            # slice start would otherwise be counted from the end of vic.
            left_flank = vic[max(start - overlap, 0):start]
            # Slicing past the end of a sequence yields a shorter (possibly
            # empty) piece rather than raising, so no exception handling is
            # needed for the last tile.
            right_flank = vic[core_end:core_end + overlap]
            new_tile = left_flank + variant[start:core_end] + right_flank
            forward_tiles.append(new_tile)
            reverse_tiles.append(new_tile.reverse_complement())
        tiles[record.id] = (forward_tiles, reverse_tiles)
    return tiles
# Tile every sequence in AllPB2DNA-2011-2016.fasta (180 variable bases per
# tile, 30-base overlaps taken from vic) and print the resulting dictionary.
print(
    generate_tiles(
        get_seqs=get_seqs("AllPB2DNA-2011-2016.fasta"),
        tile_length=180,
        overlap=30,
        vic=vic,
    )
)


{'gb:KM070441|Organism:Influenza': ([Seq('AGCRAAAGCAGGTCAATTATATTCAGTATGGAAAGAATAAAAGAATTACGGAAT...AGA', SingleLetterAlphabet()), Seq('AAAAGGGTAACAGAAATGGTTCCGGAGAGAAGGATAACAGAAATGGTTCCAGAG...AAG', SingleLetterAlphabet()), Seq('GGCCCTGTCCATTTCAGAAATCAAGTCAAGCCTGTTCATTTTAGAAATCAAGTC...GAA', SingleLetterAlphabet()), Seq('TTGATGGTCGCATACATGTTAGAGAGAGAAATGGTTGCATACATGTTAGAGAGA...TTA', SingleLetterAlphabet()), Seq('GCAGTATCAGCAGATCCACTAGCATCTTTAGTATCAGCAGATCCACTAGCATCT...CAA', SingleLetterAlphabet()), Seq('GAAGAAGAGGTGCTTACAGGCAATCTCCAAGAAGAGGTGCTTACAGGCAATCTC...TTT', SingleLetterAlphabet()), Seq('ATAAAAGCAGTTAGAGGTGACCTGAATTTTAAAGCAGTTAGAGGTGACCTGAAT...AGC', SingleLetterAlphabet()), Seq('AGCAAAATGGGTGTGGATGAATACTCCAGCAAAATGGGTGTGGATGAATACTCC...GTC', SingleLetterAlphabet()), Seq('CAATGGATCATCAGAAATTGGGAAGCTGTCTGGATCATCAGAAATTGGGAAGCT...AGA', SingleLetterAlphabet()), Seq('TTTGCAGCTGCTCCACCGAAGCAAAGCAGAGCAGCTGCTCCACCAAAGCAAAGC...GAA', SingleLetterAlphabet()), Seq('TTGAGAGGGTTCCTCATTATAGGTAAGG