In [1]:
from Bio import SeqIO
from Bio import Seq
from collections import defaultdict
from collections import Counter


In [2]:
# Split the study dataset into 2019 sequences and everything else, and pick
# out the record that every later comparison is made against.
REFERENCE_NAME = "KJ469556_2005|2005"

sequences_2019 = {}
other_seqs = {}
reference = None  # stays None if the reference record is not in the file

# The loop variable is deliberately kept as `seq`: it remains bound after the
# loop, and renaming it would break any cell that relies on that leaked name.
for seq in SeqIO.parse("../genomic_data/study_dataset.fasta", 'fasta'):
    # Record names look like "<id>|<date>"; the second field is matched
    # against "2019" as a substring.
    date = seq.name.split("|")[1]
    if "2019" in date:
        sequences_2019[seq.name] = seq.seq
    else:
        other_seqs[seq.name] = seq.seq
    if seq.name == REFERENCE_NAME:
        reference = seq.seq

# Fail here, with a clear message, instead of with a NameError several cells
# later when `reference` is first used.
if reference is None:
    raise ValueError(
        f"Reference record {REFERENCE_NAME!r} not found in "
        "../genomic_data/study_dataset.fasta"
    )
    

In [3]:
# Report the length of the reference sequence (the one every position-wise
# comparison walks over) rather than of whichever record the loading loop
# happened to parse last.
print(len(reference))

11277


In [51]:
def _to_dna(base):
    """Return `base` with uracil ("U") rewritten as thymine ("T")."""
    return "T" if base == "U" else base


# name -> set of mutation labels of the form "<ref base><1-based position><observed base>"
mutations = defaultdict(set)             # 2019 sequences
background_mutations = defaultdict(set)  # every other sequence

# The outer loop stays over positions (not over sequences) so that keys are
# inserted into both dicts in the same order as before.
for locus, base in enumerate(reference):
    position = locus + 1
    base = _to_dna(base)
    start = f'{base}{position}'

    # Both groups get the same U -> T normalisation. Previously only the
    # background bases were normalised, so a "U" in a 2019 sequence opposite a
    # "T" in the reference was reported as a spurious "T<pos>U" mutation.
    for name, seq in sequences_2019.items():
        observed = _to_dna(seq[locus])
        if observed != base:
            mutations[name].add(f'{start}{observed}')

    for name, seq in other_seqs.items():
        base_2 = _to_dna(seq[locus])
        if base_2 != base:
            background_mutations[name].add(f'{start}{base_2}')

In [52]:
# One mutation set per 2019 sequence, in the insertion order of `mutations`.
all_mutation_sets = list(mutations.values())

# Every mutation label from every set, flattened into one list. A label
# appears once per sequence that carries it, which is what the later
# frequency count relies on.
all_mutations = [
    label
    for per_sequence in all_mutation_sets
    for label in per_sequence
]

In [53]:
# Mutations carried by every 2019 sequence.
#
# `mutations` only has an entry for a sequence once at least one mutation was
# recorded for it, so a sequence identical to the reference is absent from
# `all_mutation_sets`. Such a sequence carries no mutation at all, which means
# nothing can be "in all"; the same holds when there are no sets to intersect
# (indexing [0] on an empty list used to raise IndexError).
if all_mutation_sets and len(all_mutation_sets) == len(sequences_2019):
    in_all = set.intersection(*all_mutation_sets)
else:
    in_all = set()

print(in_all)
print(len(in_all))

{'C2583G', 'T10164C', 'A10737G', 'T7270C', 'T4581C', 'C5217T', 'A7432T', 'G10023T', 'A7088C', 'A10534G', 'G10959A', 'T1602C', 'A4981G', 'A8169C', 'C5184T', 'T8682C', 'G6210A'}
17


In [54]:
# Number of 2019 sequences carrying each mutation.
mutation_counts = Counter(all_mutations)

# Buckets for mutations that are common but not universal. The cut-offs below
# are absolute sequence counts; the bucket names suggest they stand for roughly
# 90% / 85% / 75% of the 2019 set -- TODO confirm against len(sequences_2019).
ninety, eighty_five, three_quarters = [], [], []

for mutation, count in mutation_counts.items():
    # Mutations shared by every sequence are reported separately.
    if mutation in in_all:
        continue

    # Each mutation is visited exactly once and the branches are mutually
    # exclusive, so it lands in at most one bucket (the highest it qualifies for).
    if count > 43:
        ninety.append(mutation)
    elif count >= 41:
        eighty_five.append(mutation)
    elif count > 36:
        three_quarters.append(mutation)

print(ninety)
print(eighty_five)
print(three_quarters)

['C10557T', 'C3993T']
[]
['A1497T', 'C9387T']


## Compare to background (non-2019 sequences)

In [55]:
    
def check_background(mutation_list, background=None, total=None):
    """Report how common each mutation is among the background sequences.

    Parameters
    ----------
    mutation_list : iterable of str
        Mutation labels to look up. Repeated labels are counted once.
    background : mapping of name -> collection of mutation labels, optional
        Per-sequence mutation calls to search. Defaults to the notebook-level
        `background_mutations`.
    total : int, optional
        Number of background sequences used as the denominator. Defaults to
        `len(other_seqs)`.

    Returns
    -------
    dict
        Mutation label -> percentage of background sequences carrying it,
        ordered from least to most common. Mutations absent from the
        background map to 0.

    Raises
    ------
    ValueError
        If a mutation is found in the background but `total` is not positive.
    """
    if background is None:
        background = background_mutations
    if total is None:
        total = len(other_seqs)

    present_percs = {}
    for mutation in mutation_list:
        # A repeated label must not be tallied twice; that would inflate the
        # percentage (potentially beyond 100%).
        if mutation in present_percs:
            continue

        hits = sum(1 for found in background.values() if mutation in found)
        if not hits:
            present_percs[mutation] = 0
            continue

        if total <= 0:
            raise ValueError(
                "total must be positive when mutations are present in the background"
            )
        present_percs[mutation] = (hits / total) * 100

    # sorted() is stable, so mutations with equal percentages keep the order
    # in which they were supplied.
    present_percs_sorted = dict(sorted(present_percs.items(), key=lambda item: item[1]))

    return present_percs_sorted
        

In [56]:
# Background frequency of the universal, >43-count and >36-count mutation
# groups, in that order. The `eighty_five` bucket is not looked up here.
one_hundred_bg, ninety_bg, seventy_five_bg = (
    check_background(group) for group in (in_all, ninety, three_quarters)
)

for background_percentages in (one_hundred_bg, ninety_bg, seventy_five_bg):
    print(background_percentages)


{'G10023T': 95.56962025316456, 'T1602C': 95.9915611814346, 'G10959A': 96.20253164556962, 'C5184T': 96.62447257383965, 'A10737G': 96.83544303797468, 'T7270C': 96.83544303797468, 'T4581C': 97.0464135021097, 'A4981G': 97.0464135021097, 'A8169C': 97.0464135021097, 'T10164C': 97.25738396624473, 'C5217T': 97.46835443037975, 'T8682C': 97.46835443037975, 'C2583G': 98.10126582278481, 'A7432T': 98.10126582278481, 'A7088C': 98.10126582278481, 'A10534G': 98.10126582278481, 'G6210A': 98.10126582278481}
{'C3993T': 94.72573839662446, 'C10557T': 96.83544303797468}
{'C9387T': 96.62447257383965, 'A1497T': 97.0464135021097}
