# Part I: Codecademy Review


![Python syntax](python_syntax.png)
![String and console output](string_and_console.png?raw=true)
![Conditionals and Control flow](conditional_and_flow.png?raw=true)
![Functions](functions.png?raw=true)
![Python Lists and Dictionaries](python_lists_and%20_dictionary.png?raw=true)
![Lists and functions](lists_and_functions.png?raw=true)
![Loops](loops.png?raw=true)
![Practice makes perfect](practice_makes_perfect.png?raw=true)


# Part II
## Python for Biologists

### DNA translation

```
# begin pseudocode
# first use .upper() on sequence
# open genetic code file into python
# for loop to break sequence into list of codons
# for loop to run through list to find values from the codon dictionary
# concatenate result into string
# end pseudocode
```



In [57]:
dna = "AGGTTGCCTGTCGTAAGC" #use this to test code

def translate_dna(dna):
    """Translate a DNA sequence into a string of one-letter amino acids.

    The sequence is upper-cased and read as consecutive three-base codons
    starting at the first base. Each codon is looked up in the genetic-code
    table below. Trailing bases that do not fill a whole codon are ignored.
    Stop codons are rendered as '_'. Codons that are not in the table (for
    example ones containing 'N') are rendered as 'X' rather than making the
    final join fail on a None entry.

    The list of amino acids is printed before the protein string is returned.
    """
    dna = dna.upper()
    gencode = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'
    }

    # Stop two bases short of the end so an incomplete final codon is never
    # sliced out.
    last_codon_start = len(dna) - 2
    codon_list = [dna[ii:ii + 3] for ii in range(0, last_codon_start, 3)]

    # Fall back to 'X' for codons missing from the table; a None entry here
    # would make the join below raise TypeError.
    aa_list = [gencode.get(codon, 'X') for codon in codon_list]
    print(aa_list)
    return "".join(aa_list)

translate_dna(dna)

['R', 'L', 'P', 'V', 'V', 'S']


'RLPVVS'

# Part III
## 1. pop gen simulation

In [3]:


import numpy  # random number generation (formerly reached through scipy.random)
import scipy # for random numbers

def build_population(N, p):
    """Build a random population of N individuals.

    Each individual is a tuple of two chromosomes. Every chromosome is drawn
    independently and carries allele 'A' with probability p, otherwise 'a'.

    Returns a list of N (allele1, allele2) tuples.
    """
    def draw_allele():
        # numpy.random.rand() is uniform on [0, 1), so the draw exceeds p
        # with probability 1 - p. scipy.random was only an alias of
        # numpy.random and is not available in current SciPy releases.
        return "a" if numpy.random.rand() > p else "A"

    # Tuple elements are evaluated left to right: allele1 first, then allele2.
    return [(draw_allele(), draw_allele()) for _ in range(N)]


In [4]:
build_population(10, 0.7)

[('A', 'A'),
 ('A', 'a'),
 ('A', 'A'),
 ('A', 'A'),
 ('A', 'A'),
 ('A', 'a'),
 ('A', 'A'),
 ('a', 'A'),
 ('A', 'A'),
 ('a', 'A')]

In [5]:
def compute_freq(population):
    """Tally the ordered genotypes present in a population.

    Returns a dictionary with the keys 'AA', 'aa', 'Aa' and 'aA', each mapped
    to the number of matching (allele1, allele2) tuples in population.
    """
    # Label -> genotype tuple, listed in the order the keys appear in the
    # returned dictionary.
    genotype_of = (
        ('AA', ('A', 'A')),
        ('aa', ('a', 'a')),
        ('Aa', ('A', 'a')),
        ('aA', ('a', 'A')),
    )
    return {label: population.count(pair) for label, pair in genotype_of}

In [8]:
my_pop = build_population(6, 0.5)
print(my_pop)
compute_freq(my_pop)

[('A', 'a'), ('a', 'a'), ('A', 'a'), ('A', 'a'), ('A', 'a'), ('a', 'A')]


{'AA': 0, 'Aa': 4, 'aA': 1, 'aa': 1}

In [9]:
def reproduce_population(population):
    """Create a new generation of the same size through sexual reproduction.

    For each of the N new offspring:
    - a mom and a dad are chosen uniformly at random (the same individual
      may be chosen for both roles);
    - the offspring receives one randomly chosen chromosome from each parent,
      and the two chromosome choices are independent of each other.

    Returns a list of (allele_from_mom, allele_from_dad) tuples.
    """
    new_gen = []
    NN = len(population)
    for _ in range(NN):
        # numpy.random is what scipy.random used to alias; the alias is not
        # available in current SciPy releases.
        dad = numpy.random.randint(NN)
        mom = numpy.random.randint(NN)
        chr_mom = numpy.random.randint(2)
        # Draw dad's chromosome separately. Using 1 - chr_mom would always
        # pair position 0 of one parent with position 1 of the other, so a
        # population made only of ('A', 'a') individuals could never produce
        # a homozygous offspring.
        chr_dad = numpy.random.randint(2)
        new_gen.append((population[mom][chr_mom], population[dad][chr_dad]))
    return new_gen

In [10]:
reproduce_population(my_pop)

[('a', 'A'), ('a', 'A'), ('a', 'A'), ('A', 'a'), ('a', 'A'), ('a', 'A')]

In [11]:
import drift

In [12]:
def simulate_drift(N, p):
    """Simulate generations until every individual is 'AA' or every one is 'aa'.

    The starting population comes from drift.build_population(N, p). Each
    pass through the loop counts genotypes with drift.compute_freq and, if
    neither the 'AA' nor the 'aa' count equals N, replaces the population
    with drift.reproduce_population(my_pop).

    Prints the generation at which fixation was reached together with the
    genotype counts. Returns nothing.
    """
    # initialize population
    my_pop = drift.build_population(N, p)
    num_gen = 0
    while True:
        # compute genotype counts
        genotype_counts = drift.compute_freq(my_pop)
        # end simulation when one allele reaches fixation
        if genotype_counts['AA'] == N or genotype_counts['aa'] == N:
            break
        my_pop = drift.reproduce_population(my_pop)
        num_gen = num_gen + 1

    print("An allele reached fixation at generation:", num_gen)
    print("The genotype counts are:")
    print(genotype_counts)

In [13]:
simulate_drift(100, 0.5)

An allele reached fixation at generation: 208
The genotype counts are:
{'Aa': 0, 'aA': 0, 'AA': 0, 'aa': 100}


In [14]:
simulate_drift(100, 0.9)

An allele reached fixation at generation: 64
The genotype counts are:
{'Aa': 0, 'aA': 0, 'AA': 100, 'aa': 0}


## 2. Do part 1 of question 4.9.1

In [88]:

# Format Jiang2013_data.csv in the terminal to make it comma-separated
# instead of space-separated, using the command:
#   sed 's/\s/,/g' Jiang2013_data.csv > formatted_Jiang2013_data.csv
# Write a function that takes as input the desired Taxon and returns the mean value of r

def meanr(xx):
    """Return the mean value of r over all rows of the requested taxon.

    Reads formatted_Jiang2013_data.csv from the current directory. The first
    line is treated as a header and skipped, and blank lines are ignored
    wherever they occur, so the last data row is kept whether or not the file
    ends with an empty line.

    xx is the taxon name, compared exactly against the third comma-separated
    field of each row.

    Raises ValueError if no row matches the taxon, or if a matching row
    contains no number to use as r.
    """
    import re

    # Third comma-separated field of the row.
    taxon_pattern = re.compile(r"\w*,\w*,(\w*)")
    # First number found on the row.
    # NOTE(review): this assumes no earlier field contains a digit and that r
    # has a single digit before the decimal point -- confirm against the data.
    r_pattern = re.compile(r"(-*\d\.*\d*)")

    with open("formatted_Jiang2013_data.csv") as data_file:
        next(data_file, None)  # get rid of the header
        rows = [line for line in data_file if line.strip()]

    taxon_r_val = []
    for line in rows:
        taxon = taxon_pattern.search(line)
        if taxon is None or taxon.group(1) != xx:
            continue
        r_val = r_pattern.search(line)
        if r_val is None:
            raise ValueError(
                "no r value found in row: {!r}".format(line.rstrip("\n")))
        taxon_r_val.append(float(r_val.group(1)))

    if not taxon_r_val:
        # Without this guard the mean below would fail with an uninformative
        # ZeroDivisionError.
        raise ValueError("no rows found for taxon {!r}".format(xx))
    return sum(taxon_r_val) / len(taxon_r_val)



In [91]:
meanr("Insect")

0.19664531553867934

In [92]:
meanr("Fish")

0.39719005173783783