# <font color=green>*BCBIO 490*</font>

In [9]:
import numpy as np
import os
from Bio import SeqIO

class Sequence_Alignment:
    """Pairwise nucleotide alignment (global and local) with affine gap penalties.

    Substitution scores come from a table indexed by ASCII code. A gap of
    length L costs ``gapOpen + L * gapExtend``. The three dynamic-programming
    matrices are filled from the bottom-right corner towards the top-left, so
    entry ``[i][j]`` scores alignments that *start* at ``seq1[i]`` / ``seq2[j]``
    and the alignment can be read off front to back without reversing it.
    """

    # Number of alignment columns printed per output row.
    _ROW_WIDTH = 79

    def __init__(self, match=10, mismatch=-20, gapOpen=40, gapExtend=2, ambiguous=0):
        """Build the substitution table and store the gap penalties.

        Args:
            match: score for two identical characters.
            mismatch: score for two different characters.
            gapOpen: penalty charged once per gap; the sign is ignored.
            gapExtend: penalty charged per gap position; the sign is ignored.
            ambiguous: score used whenever either character is 'N'.
        """
        self.gapOpen = abs(gapOpen)
        self.gapExtend = abs(gapExtend)
        # 129 x 129 table addressed by ord(); the diagonal holds the match score.
        self.sub_mat = np.full((129, 129), mismatch, dtype=float)
        np.fill_diagonal(self.sub_mat, match)
        # The table is indexed by character code, so 'N' has to be addressed
        # through ord('N'); comparing an integer index with the string 'N'
        # can never be true.
        n_code = ord('N')
        self.sub_mat[n_code, :] = ambiguous
        self.sub_mat[:, n_code] = ambiguous

    def getScore(self, nuc1, nuc2):
        """Return the substitution score of two characters, ignoring case.

        Characters whose upper-cased code point is above 128 fall outside the
        table and raise IndexError.
        """
        return self.sub_mat[ord(nuc1.upper())][ord(nuc2.upper())]

    def setGapOpenPenalty(self, penalty):
        """Replace the gap-open penalty (stored as a non-negative magnitude)."""
        self.gapOpen = abs(penalty)

    def setGapExtensionPenalty(self, penalty):
        """Replace the gap-extension penalty (stored as a non-negative magnitude)."""
        self.gapExtend = abs(penalty)

    def alignInput(self, fasta1, fasta2, align_type):
        """Align the records of two FASTA files pairwise and print each alignment.

        Records are paired by position (first with first, and so on); if one
        file holds more records than the other, the surplus records are not
        aligned.

        Args:
            fasta1: path of the first FASTA file.
            fasta2: path of the second FASTA file.
            align_type: either 'global' or 'local'.

        Returns:
            An explanatory message string when an input file is empty or
            ``align_type`` is not recognised; otherwise None.
        """
        if os.path.getsize(fasta1) == 0 or os.path.getsize(fasta2) == 0:
            return "Input contains empty file(s)"
        if align_type not in ('global', 'local'):
            return "This program only has two alignment types: 'global' or 'local'"

        with open(fasta1) as handle1, open(fasta2) as handle2:
            for record1, record2 in zip(SeqIO.parse(handle1, "fasta"), SeqIO.parse(handle2, "fasta")):
                print("Aligning " + record1.id + " with " + record2.id + " using " + align_type + " alignment: ")
                print()
                if align_type == 'global':
                    self.global_alignment(record1.seq, record2.seq)
                else:
                    self.local_alignment(record1.seq, record2.seq)

    def global_alignment(self, seq1, seq2):
        """Print the best end-to-end alignment of ``seq1`` and ``seq2``.

        Does nothing when either sequence is empty. Returns None; the score
        and the formatted alignment are written to standard output.
        """
        n_rows, n_cols = len(seq1), len(seq2)
        if n_rows == 0 or n_cols == 0:
            return

        shape = (n_rows + 1, n_cols + 1)
        s_mat = np.zeros(shape)  # best score overall
        i_mat = np.zeros(shape)  # best score when seq2[j] faces a gap in seq1
        d_mat = np.zeros(shape)  # best score when seq1[i] faces a gap in seq2

        # Boundary: once one sequence is exhausted, the rest of the other one
        # has to be placed against a single gap.
        d_mat[n_rows][n_cols] = s_mat[n_rows][n_cols] - self.gapOpen
        i_mat[n_rows][n_cols] = s_mat[n_rows][n_cols] - self.gapOpen
        for col in range(n_cols - 1, -1, -1):
            i_mat[n_rows][col] = i_mat[n_rows][col + 1] - self.gapExtend
            s_mat[n_rows][col] = i_mat[n_rows][col]
            d_mat[n_rows][col] = s_mat[n_rows][col] - self.gapOpen

        for row in range(n_rows - 1, -1, -1):
            d_mat[row][n_cols] = d_mat[row + 1][n_cols] - self.gapExtend
            s_mat[row][n_cols] = d_mat[row][n_cols]
            i_mat[row][n_cols] = s_mat[row][n_cols] - self.gapOpen

            for col in range(n_cols - 1, -1, -1):
                d_mat[row][col] = max(d_mat[row + 1][col] - self.gapExtend,
                                      s_mat[row + 1][col] - self.gapOpen - self.gapExtend)
                i_mat[row][col] = max(i_mat[row][col + 1] - self.gapExtend,
                                      s_mat[row][col + 1] - self.gapOpen - self.gapExtend)
                s_mat[row][col] = max(s_mat[row + 1][col + 1] + self.getScore(seq1[row], seq2[col]),
                                      d_mat[row][col],
                                      i_mat[row][col])

        top, middle, bottom, _, _ = self._traceback(
            seq1, seq2, s_mat, i_mat, d_mat, 0, 0, local=False)
        max_space = len(str(max(n_rows, n_cols)))
        self._print_alignment(s_mat[0][0], top, middle, bottom,
                              1, 1, n_rows, n_cols, max_space)

    def local_alignment(self, seq1, seq2):
        """Print the best-scoring local alignment of ``seq1`` and ``seq2``.

        Does nothing when either sequence is empty. Returns None; the score
        and the formatted alignment are written to standard output. When no
        pair of characters scores above zero only the score line is printed.
        """
        n_rows, n_cols = len(seq1), len(seq2)
        if n_rows == 0 or n_cols == 0:
            return

        highestScore = 0
        firstRow, firstCol = n_rows, n_cols
        shape = (n_rows + 1, n_cols + 1)
        s_mat = np.zeros(shape)
        i_mat = np.zeros(shape)
        d_mat = np.zeros(shape)

        # Boundary cells of the gap matrices carry the cost of opening a gap;
        # the boundary of s_mat stays at zero (an empty local alignment).
        boundary = -(self.gapOpen + self.gapExtend)
        i_mat[n_rows, :] = boundary
        d_mat[n_rows, :] = boundary
        i_mat[:, n_cols] = boundary
        d_mat[:, n_cols] = boundary

        for row in range(n_rows - 1, -1, -1):
            for col in range(n_cols - 1, -1, -1):
                d_mat[row][col] = max(d_mat[row + 1][col] - self.gapExtend,
                                      s_mat[row + 1][col] - self.gapOpen - self.gapExtend)
                i_mat[row][col] = max(i_mat[row][col + 1] - self.gapExtend,
                                      s_mat[row][col + 1] - self.gapOpen - self.gapExtend)
                s_mat[row][col] = max(0,
                                      s_mat[row + 1][col + 1] + self.getScore(seq1[row], seq2[col]),
                                      d_mat[row][col],
                                      i_mat[row][col])

                # Strict comparison: on ties the cell visited first is kept.
                if highestScore < s_mat[row][col]:
                    highestScore = s_mat[row][col]
                    firstRow, firstCol = row, col

        top, middle, bottom, lastRow, lastCol = self._traceback(
            seq1, seq2, s_mat, i_mat, d_mat, firstRow, firstCol, local=True)
        max_space = len(str(max(n_rows, n_cols)))
        self._print_alignment(s_mat[firstRow][firstCol], top, middle, bottom,
                              firstRow + 1, firstCol + 1, lastRow, lastCol, max_space)

    def _traceback(self, seq1, seq2, s_mat, i_mat, d_mat, row, col, local):
        """Walk the filled matrices from (row, col) and build the alignment strings.

        Returns ``(aligned_seq1, marker_line, aligned_seq2, end_row, end_col)``
        where the marker line holds '|' for a match, '*' for a mismatch and a
        space for a gap column.
        """
        n_rows, n_cols = len(seq1), len(seq2)
        top, middle, bottom = [], [], []
        state = 'S'

        while row <= n_rows and col <= n_cols:
            if state == 'S':
                if local:
                    # A local alignment stops at a zero score or at either edge.
                    if row == n_rows or col == n_cols or s_mat[row][col] == 0:
                        break
                elif row == n_rows and col == n_cols:
                    break
                elif row == n_rows:
                    # seq1 is used up: only gaps in seq1 remain possible.
                    state = 'I'
                    continue
                elif col == n_cols:
                    # seq2 is used up. This must be decided before comparing
                    # scores, otherwise a zero gap-open penalty can select
                    # state 'I' here and index past the end of seq2.
                    state = 'D'
                    continue

                if s_mat[row][col] == i_mat[row][col]:
                    state = 'I'
                    continue
                if s_mat[row][col] == d_mat[row][col]:
                    state = 'D'
                    continue

                first, second = seq1[row], seq2[col]
                top.append(first)
                bottom.append(second)
                # Compare case-insensitively so the marker agrees with getScore.
                middle.append("|" if first.upper() == second.upper() else "*")
                row += 1
                col += 1

            elif state == 'I':
                top.append('-')
                bottom.append(seq2[col])
                middle.append(" ")
                # Leave the gap when it was opened from s_mat at the next column.
                if (col == n_cols - 1) or (i_mat[row][col] == s_mat[row][col + 1] - self.gapOpen - self.gapExtend):
                    state = 'S'
                col += 1

            else:  # state == 'D'
                top.append(seq1[row])
                bottom.append('-')
                middle.append(" ")
                if (row == n_rows - 1) or (d_mat[row][col] == s_mat[row + 1][col] - self.gapOpen - self.gapExtend):
                    state = 'S'
                row += 1

        return "".join(top), "".join(middle), "".join(bottom), row, col

    def _print_alignment(self, score, top, middle, bottom, start1, start2, end1, end2, max_space):
        """Print the score followed by the alignment in rows of _ROW_WIDTH columns.

        ``start1``/``start2`` are the 1-based positions labelling the first
        row; ``end1``/``end2`` cap every position label.
        """
        width = self._ROW_WIDTH
        blank_left = self.generateString('left', max_space, "")
        blank_right = self.generateString('right', max_space, "")

        print("Alignment Score: " + str(score) + "\n")

        pos1, pos2 = start1, start2
        # range() stops before len(top), so an alignment whose length is an
        # exact multiple of the row width does not get a trailing empty row.
        for offset in range(0, len(top), width):
            row1 = top[offset: offset + width]
            row2 = bottom[offset: offset + width]
            # Assumes a full row; the min() caps below correct the final,
            # shorter row.
            next1 = pos1 + width - row1.count('-')
            next2 = pos2 + width - row2.count('-')

            print("Sequence 1 > " + self.generateString('left', max_space, str(pos1)) + row1
                  + self.generateString('right', max_space, str(min(next1 - 1, end1))))
            print("             " + blank_left + middle[offset: offset + width] + blank_right)
            print("Sequence 2 > " + self.generateString('left', max_space, str(pos2)) + row2
                  + self.generateString('right', max_space, str(min(next2 - 1, end2))))
            print("\n")

            pos1 = min(next1, end1)
            pos2 = min(next2, end2)

    def generateString(self, position, max_space, string):
        """Pad a position label to ``max_space`` characters plus one separating space.

        Args:
            position: 'left' pads after the label and appends a space;
                'right' prepends a space and pads before the label.
            max_space: width of the label field.
            string: label text; text longer than ``max_space`` is returned
                unpadded (with the separating space) instead of None.

        Returns:
            The padded string, or None when ``position`` is neither 'left'
            nor 'right'.
        """
        if position == 'left':
            return string.ljust(max_space) + " "
        if position == 'right':
            return " " + string.rjust(max_space)
        return None

In [141]:
# Global alignment of two literal test sequences. The gap penalties are given
# as negatives; the constructor keeps only their magnitude.
p = Sequence_Alignment(match=10, mismatch=-20, gapOpen=-40, gapExtend=-2)
A, B = "ATGCGTGAATTAAA", "AGTCGTGCGCTTTTATCTTAAAAAA"
p.global_alignment(seq1=A, seq2=B)

Alignment Score: -50.0

Sequence 1 > 1  ATGCGTG-------------AATTAAA 14
                |**||||             ||  |||   
Sequence 2 > 1  AGTCGTGCGCTTTTATCTTAAA--AAA 25




In [172]:
# Second global-alignment check on two sequences of nearly equal length.
p = Sequence_Alignment(match=10, mismatch=-20, gapOpen=-40, gapExtend=-2)
A, B = "AGCTACGTACACTACC", "AGCTATCGTACTAGC"
p.global_alignment(seq1=A, seq2=B)

Alignment Score: 24.0

Sequence 1 > 1  AGCTA-CGTACACTACC 16
                ||||| |||  ||||*|   
Sequence 2 > 1  AGCTATCGT--ACTAGC 15




In [194]:
# NOTE(review): alignInput is declared as (fasta1, fasta2, align_type) in the
# class definition in this notebook, so this one-argument call does not match
# that signature and would raise TypeError on a fresh Restart & Run All.
# The captured output presumably comes from an earlier version of the
# method; confirm, then update the call or drop the cell.
p = Sequence_Alignment(10, -20, -40, -2)
p.alignInput("alignment_test.txt")

Alignment Score: -164.0

Sequence 1 > 1  ATGGCAGGCTTAT--------------TCCGTAG--------------TCGTAGGGGTCGTTTTCGCGCGCGCTTAGGC 51
                     ||||*|||              ||*||||              ||||    |||||                     
Sequence 2 > 1  -----AGGCATATTGATTCCGTGACTGTCTGTAGTGGGCTGTGTAAGATCGT----GTCGT------------------ 52


Sequence 1 > 52 TTTAGGGCCCC 62
                       *||*   
Sequence 2 > 53 -------GCCT 56




In [7]:
# Default scoring; align the paired records of the two FASTA files end to end.
p = Sequence_Alignment()
p.alignInput(fasta1="FASTA1.txt", fasta2="FASTA2.txt", align_type='global')

Aligning BTBSCRYR with BTBSCRYR using global alignment: 

Alignment Score: 6200.0

Sequence 1 > 1   tgcaccaaacatgtctaaagctggaaccaaaattactttctttgaagacaaaaactttcaaggccgccactatgacagc  79
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 1   tgcaccaaacatgtctaaagctggaaccaaaattactttctttgaagacaaaaactttcaaggccgccactatgacagc  79


Sequence 1 > 80  gattgcgactgtgcagatttccacatgtacctgagccgctgcaactccatcagagtggaaggaggcacctgggctgtgt 158
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 80  gattgcgactgtgcagatttccacatgtacctgagccgctgcaactccatcagagtggaaggaggcacctgggctgtgt 158


Sequence 1 > 159 atgaaaggcccaattttgctgggtacatgtacatcctaccccggggcgagtatcctgagtaccagcactggatgggcct 237
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 159 atgaaaggcccaattttgctgggtacatgtacatcctaccccggggcgagtatcctgagtaccagcactggatgggcct 237


Se

In [3]:
# An unrecognised alignment type makes alignInput return its explanatory
# message, which the notebook shows as the cell result.
p = Sequence_Alignment()
p.alignInput(fasta1="FASTA1.txt", fasta2="FASTA2.txt", align_type='globall')

"This program only has two alignment types: 'global' or 'local'"

In [8]:
# Default scoring; local alignment of the paired records of the two FASTA files.
p = Sequence_Alignment()
p.alignInput(fasta1="FASTA1.txt", fasta2="FASTA2.txt", align_type='local')

Aligning BTBSCRYR with BTBSCRYR using local alignment: 

Alignment Score: 6200.0

Sequence 1 > 1   tgcaccaaacatgtctaaagctggaaccaaaattactttctttgaagacaaaaactttcaaggccgccactatgacagc  79
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 1   tgcaccaaacatgtctaaagctggaaccaaaattactttctttgaagacaaaaactttcaaggccgccactatgacagc  79


Sequence 1 > 80  gattgcgactgtgcagatttccacatgtacctgagccgctgcaactccatcagagtggaaggaggcacctgggctgtgt 158
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 80  gattgcgactgtgcagatttccacatgtacctgagccgctgcaactccatcagagtggaaggaggcacctgggctgtgt 158


Sequence 1 > 159 atgaaaggcccaattttgctgggtacatgtacatcctaccccggggcgagtatcctgagtaccagcactggatgggcct 237
                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||    
Sequence 2 > 159 atgaaaggcccaattttgctgggtacatgtacatcctaccccggggcgagtatcctgagtaccagcactggatgggcct 237


Seq

In [19]:
# Scratch check: `/` is true division, so this prints 2.5.
print(5 / 2)

2.5
