In [7]:
import random

# Variables

In [53]:
nucleotides = ['A', 'C', 'T', 'G']
length = 100
m = 8 # initial number of distinct ancestor strings
g = 12 # number of generations (at the end of simulation there will be m*(s^g) strings)
k = 8 # number of mutations per generation

# Number of splits from each string at each generation (i.e., one bacteria splitting in half to become two)
s = 2

### Generate our starting population of m ancestor strings

In [55]:
random.seed(0)
ancestors = []
for i in range(m):
    rand_ancestor = ''
    for j in range(length):
        rand_nucleotide = random.choice(nucleotides)
        rand_ancestor = rand_ancestor + rand_nucleotide
    ancestors.append((rand_ancestor, i))
[i[0] for i in ancestors]

['GGATGGTGTCCTCATCTAATGATGTCGGTAAAGAGTCTACCCCGAATGATTATCTGAGTCTCCCATGAACCAAGTCCGTGGTATAGTCCATACTCTGAAC',
 'CAAAACAGATAAACCAGCAAGATACATTGCAGAAGCTTGCCACCTTAGCAGGTTGTCAGATATCCGTTTCTGGAACTCCCGGGAGGACGATCGGAAGTTG',
 'AGCACAGGTACAAACACTTCAGGAATGATCTACTAAACTTTAGGGTCCGTACCTTTTATAATCCTTGCTAGCATCATGTTGAAGGTTAGAGGATTCCGAA',
 'ACCAGAAGTGGCGATCTCGCTAAAGCAGGTCACCACGGTCAGCGGGTGGCCATTTACTCGTGAAAACCATAGTCCGTGAAAGCTGGGCAACTTTAGTTGG',
 'GACCCTTAAGGCGACTGAGGGAAGCAACTATCGGAAGTATCGTACAGGTCGTAAAGTACCAGTACGGAAGAAGCAGGGAGTTATAATATTCACTACCACA',
 'ATTACCCGAGTTCACTTGTTTCAATCGCCCTCCCTTGACAGAACGTGCGTTACGTAGGAGTGCTTGACATACGGCGGCCGTCTGAGCTAGGACTATCGGA',
 'GCGTAATAATGGGATTTCAAATTTACCAGTTCCAGGTTGTCCAAGGGCTTGGCGGTGAGTCGACATGGAAAGATAAATTCCTCAGGTGCTGGCGCTCCCG',
 'TGGGGCCGCAGACACTACCTATTGGAGGGTGCTTAAACTATACAGCGCGCTAATTGTTAACTACTCCTTTGTGTCATAAGGGAGGGGAAACACGCGAGGA']

In [47]:
def mutate_string(string, k):
    """ Mutate k locations in the given string, returning this new mutated string"""
    indices_to_replace = random.sample(range(0, len(string)), k)
    new_string = ''
    for i in range(len(string)):
        if i in indices_to_replace:
            new_string = new_string + random.choice(nucleotides)
        else:
            new_string = new_string + string[i]
            
    return new_string
  
# Test function...
mutate_string('ACTG', 1)

'ACTA'

### Generate descendants!

In [48]:
# For each string, for the number of generations specified, split each string into "s" strings all with "k" mutations 
# compared to the parent string. 
population = ancestors
for generations in range(g):
    new_population = []
    for string, origin in population:
        # For each string in the current generation, generate "splits" descendants with k mutations from each string
        # Add these s new strings into the new population. Note: we are not retaining any of the original strings
        # from the previous generation, rather just the mutated descendants. 
        for splits in range(s):
            new_string = mutate_string(string, k) 
            new_population.append((new_string, origin))
    population = new_population
    
# (population contains tuples that also indicate which of the original 'm' ancestors each descendant came from)

In [49]:
(s**g)*m

1024

In [50]:
len(population)

1024

In [51]:
for descendant in population:
    print(descendant)

('GGTGGGACTCCTCATCACAAGAAGTCGACTTGACGTAGCCCCTGATTCGTCATTCGGGTCGACCATCATCGAACTACGTCGTGTCGACCAGACTCTTAAT', 0)
('GGTTGGACTCCTCATCACAGGAAGTTGACTTGCCGTAGCCCCGCATGGGTCATTCGAGCCTACCAACATCGAACTACGTGGTGTCGACCAGACTGTGAAT', 0)
('GGTTGCACTCCTCATGAAATGAAGTCGCTTGGAAGTAGATCCGGATTCATCATACGAGTTTACCATCATCGGACTACGGGGTATCGACCACACTCTAAAT', 0)
('GGATGCACTCCTAATAACATGGAGTCGATTGCAAGTTGGCCCGGATTTATCATACGTGTTTACCATCATCGGACTACGCGGTATCGACCACACCCTGAAT', 0)
('TGTTGGTCTCCTCATCACGTACAGTCGATTCGGAGTCTGTCCGGGCTCATCAGTCGCGCCCACCATCATCTAACTATGTGGTTACGTCTAGACTCTCAAA', 0)
('GCTTGGTCGCCTCATCACGTAAAGTCGATTTGGAGTATTCCCGGGCTCATCAGTCGCGCCCACCATAATCGAACTATGTGGTTACGTCAAGACTCTCAAA', 0)
('GGTTGCTCTCCTCCTCACGTAACGTCGATTTGGAGTATGTCGGTAATGATCATTAGCGTATATCATCATCAAACTACGTGGCTTAGTCAAGACTCTCAAA', 0)
('GGTTGCTCTCCTCCTCACCTAAAGTTTATTTGGAGTTTGCGCGTAATGATCATTAGCGTATACCATCATCATACTACGTGGCTTGGTGAAGACGCTCAAA', 0)
('GGATGTTCTCCTCATCAAATAATGTCGATCCGGAGCTTGACCGGATTGTAAATAATAGAGTGCCATGATCGTACGACCTGAGATCGACACGACAGTGACT', 0)
('GGATGTTCTCCTCATCAAATAATGTC

In [52]:
with open('descendants_k={}_g={}_m={}_s={}.txt'.format(k, g, m, s), 'w') as f:
    for descendant in population:
        f.write('{}\n'.format(descendant[0]))

# Consensus

In [41]:
def hamming_distance(str1, str2):
    hamming_count = 0
    for i, char1 in enumerate(str1):
        char2 = str2[i]
        if char1 != char2:
            hamming_count += 1
    return hamming_count
        

In [42]:
from collections import defaultdict

In [44]:
for ancestor_number in [0, 1, 2, 3]:
    pos_counts = defaultdict(lambda:defaultdict(lambda:0))
    for seq in [s for s, pop in population if pop == ancestor_number]:
        for i, nuc in enumerate(seq):
            pos_counts[i][nuc] += 1
    reconstructed = ''
    for pos in sorted(pos_counts.keys()):
        counts = pos_counts.get(pos)
        max_count = max(counts.values())
        for n in ['A', 'C', 'T', 'G']:
            if max_count == counts.get(n):
                reconstructed += n

    print(hamming_distance(reconstructed, ancestors[ancestor_number][0]))

10


IndexError: string index out of range

6

'GGATGGTGTCCTCATCTAATGATGTCGGTAAAGAGTCTACCCCGAATGATTATCTGAGTCTCCCATGAACCAAGTCCGTGGTATAGTCCATACTCTGAAC'