1\. These are downloaded in the file small.fasta

In [5]:
# Input a genome from a file in fasta format
def input_genome(filename, chosen_gen):
    """Read one record from a FASTA file.

    Args:
        filename: Path of the FASTA file to read.
        chosen_gen: 1-based position of the wanted record within the file.

    Returns:
        A ``(header, genome)`` tuple. ``header`` is the record's header line
        exactly as read from the file (trailing newline included). ``genome``
        is the record's sequence lines joined together, with newlines removed
        and every ``N`` replaced by ``A``.

    Raises:
        SystemExit: With status 1, after printing how many records were
            found, when the file does not contain record ``chosen_gen``.
    """
    with open(filename, 'r') as f:
        # The first line of the file is taken to be the header of record 1.
        header = f.readline()
        current_gen = 1
        pieces = []

        for line in f:
            if not line.startswith('>'):
                pieces.append(line)
                continue

            # A new header closes the current record.
            if current_gen == chosen_gen:
                break

            header = line
            current_gen += 1
            pieces = []

    # Compare by value: identity (`is`) on ints only works by accident for
    # the small integers CPython happens to cache.
    if current_gen != chosen_gen:
        print("This file only has {} genomes".format(current_gen))
        raise SystemExit(1)

    genome = "".join(pieces).replace('\n', '').replace('N', 'A')
    return header, genome

In [6]:
import numpy as np

# Compute the global alignment of two sequences
def globalign(sequence1, sequence2, match, mismatch, gap):
    """Compute a global alignment of two sequences by dynamic programming.

    A score table with one row per element of ``sequence1`` (plus one) and
    one column per element of ``sequence2`` (plus one) is filled row by row,
    and one best-scoring path is then traced back from the bottom-right
    cell to the origin.

    Args:
        sequence1: First sequence; aligned along the rows of the table.
        sequence2: Second sequence; aligned along the columns of the table.
        match: Score added when the two aligned elements are equal.
        mismatch: Score added when the two aligned elements differ.
        gap: Score added for aligning an element against a gap.

    Returns:
        A tuple ``(align_score, seq1_align, seq2_align)``: the bottom-right
        entry of the integer score table, and the two aligned sequences as
        equal-length strings in which ``-`` marks a gap.
    """
    # Direction codes stored in the traceback table.
    stop, diagonal, up, left = 0, 1, 2, 3

    n_rows = len(sequence1) + 1
    n_cols = len(sequence2) + 1

    # Initialize tables; the origin keeps the `stop` code.
    scores = np.zeros((n_rows, n_cols), dtype=int)
    moves = np.zeros((n_rows, n_cols), dtype=np.int8)

    # First row: only reachable by stepping left (gaps in sequence1).
    scores[0, :] = gap * np.arange(n_cols)
    moves[0, 1:] = left

    # First column: only reachable by stepping up (gaps in sequence2).
    scores[:, 0] = gap * np.arange(n_rows)
    moves[1:, 0] = up

    # Calculate scores and traceback, row by row
    for r in range(1, n_rows):
        for c in range(1, n_cols):
            from_up = scores[r - 1, c] + gap
            from_left = scores[r, c - 1] + gap

            # Compare elements by value; identity (`is`) is only true for
            # objects the interpreter happens to share.
            if sequence1[r - 1] == sequence2[c - 1]:
                from_diagonal = scores[r - 1, c - 1] + match
            else:
                from_diagonal = scores[r - 1, c - 1] + mismatch

            # Ties are broken in the order left, up, diagonal.
            if from_left >= from_up and from_left >= from_diagonal:
                scores[r, c] = from_left
                moves[r, c] = left
            elif from_up >= from_diagonal:
                # Reaching here means left did not win, so up >= diagonal
                # already implies up >= left.
                scores[r, c] = from_up
                moves[r, c] = up
            else:
                scores[r, c] = from_diagonal
                moves[r, c] = diagonal

    # Walk back from the bottom-right corner, collecting the alignment in
    # reverse order.
    r = n_rows - 1
    c = n_cols - 1
    rev_seq1 = []
    rev_seq2 = []

    while r != 0 or c != 0:
        move = moves[r, c]

        if move == diagonal:
            rev_seq1.append(str(sequence1[r - 1]))
            rev_seq2.append(str(sequence2[c - 1]))
            r -= 1
            c -= 1
        elif move == up:
            rev_seq1.append(str(sequence1[r - 1]))
            rev_seq2.append("-")
            r -= 1
        else:
            rev_seq1.append("-")
            rev_seq2.append(str(sequence2[c - 1]))
            c -= 1

    seq1_align = "".join(reversed(rev_seq1))
    seq2_align = "".join(reversed(rev_seq2))
    align_score = scores[-1, -1]

    return align_score, seq1_align, seq2_align

In [7]:
# 2. Compute the Hamming distance between all pairs of sequences and report in a matrix
#
# Each pair is globally aligned first; the distance is the number of aligned
# columns in which both sequences have a character and the characters differ
# (columns containing a gap are not counted).

# Alignment scoring constants
match = 2
mismatch = -1
gap = -2

# Read in the sequences (records 1 through 5 of small.fasta)
header, d4050 = input_genome("small.fasta", 1)
header, d4020 = input_genome("small.fasta", 2)
header, d4030 = input_genome("small.fasta", 3)
header, d4049 = input_genome("small.fasta", 4)
header, d4029 = input_genome("small.fasta", 5)

sequences = [d4050, d4020, d4030, d4049, d4029]

# Symmetric matrix sized from the data; the diagonal stays 0.
distances = np.zeros((len(sequences), len(sequences)))

for i, first in enumerate(sequences):
    # Only the upper triangle is computed; each value is mirrored below.
    for j in range(i + 1, len(sequences)):
        align_score, seq1_align, seq2_align = globalign(first, sequences[j], match, mismatch, gap)

        hamming = sum(
            1
            for char1, char2 in zip(seq1_align, seq2_align)
            if '-' not in (char1, char2) and char1 != char2
        )

        distances[i][j] = hamming
        distances[j][i] = hamming


print("d4050 is 1")
print("d4020 is 2")
print("d4030 is 3")
print("d4049 is 4")
print("d4029 is 5")
print()
print("   1  2  3  4  5")
for i, row in enumerate(distances, start=1):
    print(i, row)

d4050 is 1
d4020 is 2
d4030 is 3
d4049 is 4
d4029 is 5

   1  2  3  4  5
1 [0. 4. 5. 4. 7.]
2 [4. 0. 1. 5. 3.]
3 [5. 1. 0. 6. 4.]
4 [4. 5. 6. 0. 8.]
5 [7. 3. 4. 8. 0.]


In [8]:
# Here are my answers, but matched up to the order of your matrix
# I replaced my Ns with As, which is why the sequence 1 pairs have a discrepancy

# 14532
#104754
#4x0865
#5xx043
#3xxx01
#2xxxx0

3\. See the attached problem3.pdf

4\. Installed. The files are downloaded as large.phylip.txt and large.fasta.txt

5\. The dnadist file is large.phylip.dnadist. The raw tree files for the kitsch program are large.phylip.kitsch.outtree (description) and large.phylip.kitsch.outfile (tree visualization). The raw tree files for the neighbor program are large.phylip.neighbor.outtree (description) and large.phylip.neighbor.outfile (tree visualization). 

Both trees are broken into a rough structure of three subtrees made up of (70, 71, 72), (64, 65, 66, 67, 80), and (75, 76, 77, 78, 79). There are many details which make the trees very dissimilar, including scrambling of the overall structure and differences in the subtree orders and structures. 69, 74, and 81 are placed in completely different locations in each tree. 

6\. I ran Clustal Omega and MUSCLE. The alignment files are named large.ebi.clustal.alignmentfile and large.ebi.muscle.alignmentfile. There are also PDFs of the generated trees, named large.ebi.clustal.tree and large.ebi.muscle.tree. 

The Phylip and EBI trees have vague similarities in two subtrees (70-74) and (75-79), but the orderings are different, and the third subtree is replaced by a cascade of nodes. At a quick glance I would say they are very different. 

The Clustal Omega and MUSCLE trees are almost identical to each other. There are only two differences, which appear in the connections between 67, 80, and 64. 