In [15]:
import functools
from collections import defaultdict
import itertools
# Substitution scores over the uppercase alphabet: 0 for identical letters, -1 otherwise.
scoring_matrix = defaultdict(dict)
for v1 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    for v2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        scoring_matrix[v1][v2] = 0 if v1 == v2 else -1

In [16]:
def global_scoring_grid(v, w, scoring_matrix, sigma=5):
    """Fill the global-alignment score table and backpointers for v against w.

    Args:
        v: first sequence; its symbols index the rows of the tables.
        w: second sequence; its symbols index the columns of the tables.
        scoring_matrix: nested mapping where scoring_matrix[a][b] is the score
            of pairing symbol a (from v) with symbol b (from w).
        sigma: penalty subtracted for every gap symbol.

    Returns:
        (grid, backtrack), both (len(v)+1) x (len(w)+1) lists of lists.
        grid[i][j] is the best score for aligning v[:i] with w[:j].
        backtrack[i][j] is a tuple (prev_i, prev_j, v_symbol, w_symbol): the
        predecessor cell plus the alignment column emitted by that step, with
        "-" standing for a gap.
    """
    rows = len(v) + 1
    cols = len(w) + 1

    grid = [[0] * cols for _ in range(rows)]
    backtrack = [[(0, 0, "", "")] * cols for _ in range(rows)]

    # Borders: the first column can only be reached by gaps in w, the first
    # row only by gaps in v.
    for r in range(rows):
        grid[r][0] = -r * sigma
    for c in range(cols):
        grid[0][c] = -c * sigma
    for r in range(1, rows):
        backtrack[r][0] = (r - 1, 0, v[r - 1], "-")
    for c in range(1, cols):
        backtrack[0][c] = (0, c - 1, "-", w[c - 1])

    for r in range(1, rows):
        for c in range(1, cols):
            v_sym = v[r - 1]
            w_sym = w[c - 1]
            pair_score = scoring_matrix[v_sym][w_sym]

            from_above = grid[r - 1][c] - sigma
            from_left = grid[r][c - 1] - sigma
            from_diag = grid[r - 1][c - 1] + pair_score

            best = max(from_above, from_left, from_diag)
            grid[r][c] = best

            # Ties are resolved in this fixed order: above, left, diagonal.
            if best == from_above:
                backtrack[r][c] = (r - 1, c, v_sym, "-")
            elif best == from_left:
                backtrack[r][c] = (r, c - 1, "-", w_sym)
            elif best == from_diag:
                backtrack[r][c] = (r - 1, c - 1, v_sym, w_sym)

    return grid, backtrack

In [17]:
def traceback_path(backtrack, v, w, i, j):
    """Follow backpointers from cell (i, j) to (0, 0) and spell the alignment.

    The walk is iterative so that long alignments cannot exceed the
    interpreter's recursion limit, and the symbols are joined once at the end
    instead of being concatenated step by step.

    Args:
        backtrack: table where backtrack[i][j] is indexable as
            (prev_i, prev_j, v_symbol, w_symbol).
        v: unused; kept so existing callers keep working.
        w: unused; kept so existing callers keep working.
        i: row of the cell to start from.
        j: column of the cell to start from.

    Returns:
        (aligned_v, aligned_w): the two alignment rows as strings, read from
        the start of the path to cell (i, j).
    """
    v_symbols = []
    w_symbols = []
    while not (i == 0 and j == 0):
        step = backtrack[i][j]
        v_symbols.append(step[2])
        w_symbols.append(step[3])
        i, j = step[0], step[1]

    # Symbols were collected end-to-start; flip them into reading order.
    v_symbols.reverse()
    w_symbols.reverse()
    return "".join(v_symbols), "".join(w_symbols)

In [18]:
def align(v, w, score_function):
    """Score v against w with score_function and reconstruct the alignment.

    Args:
        v: first sequence.
        w: second sequence.
        score_function: callable taking (v, w) and returning (grid, backtrack).

    Returns:
        (score, aligned_v, aligned_w), where score is read from the last row of
        the grid at the column given by the width of its first row, and the
        aligned strings come from tracing back from cell (len(v), len(w)).
    """
    grid, backtrack = score_function(v, w)
    bottom_row = grid[-1]
    score = bottom_row[len(grid[0]) - 1]
    aligned_v, aligned_w = traceback_path(backtrack, v, w, len(v), len(w))
    return score, aligned_v, aligned_w

In [19]:
# Sample pair for the global alignment run in the next cell.
v, w = "PLEASANTLY", "MEANLY"

In [20]:
# With 0/-1 substitution scores and a gap penalty of 1, the score is negated before printing.
s, vv, ww = align(
    v,
    w,
    functools.partial(global_scoring_grid, scoring_matrix=scoring_matrix, sigma=1),
)
print(-s, vv, ww)

5 PLEASANTLY M-EA--N-LY


In [12]:
with open("../data/dataset_248_3.txt", "r") as fin:
    lines = fin.read().strip().split("\n")
# The file is closed here; only its first two lines are used as the sequences.
v = lines[0].strip()
w = lines[1].strip()

In [13]:
# Global alignment of the dataset pair; only the negated score is printed.
s, vv, ww = align(
    v,
    w,
    functools.partial(global_scoring_grid, scoring_matrix=scoring_matrix, sigma=1),
)
print(-s)

402


In [39]:
def fitting_scoring_grid(v, w, scoring_matrix, sigma=5):
    """Fill the score table and backpointers for fitting w into a substring of v.

    All of w is aligned, while a prefix and a suffix of v may be skipped for
    free: column 0 is held at 0 and points straight back to (0, 0), and the
    alignment ends at the best-scoring cell of the last column.

    Args:
        v: the longer sequence; its symbols index the rows of the tables.
        w: the sequence to fit; its symbols index the columns of the tables.
        scoring_matrix: nested mapping where scoring_matrix[a][b] is the score
            of pairing symbol a (from v) with symbol b (from w).
        sigma: penalty subtracted for every gap symbol.

    Returns:
        (grid, backtrack), both (len(v)+1) x (len(w)+1) lists of lists, with
        backtrack entries of the form (prev_i, prev_j, v_symbol, w_symbol).
        When the best last-column cell is not the bottom-right one,
        grid[n][m] is overwritten with the best score and backtrack[n][m] with
        a zero-cost jump (best_i, m, "", ""), so callers that read the
        bottom-right corner get the fitting result.
    """
    n = len(v)
    m = len(w)

    # Column 0 keeps the default (0, 0, "", "") entry: skipping a prefix of v
    # is free and emits nothing.
    backtrack = [[(0, 0, "", "")] * (m + 1) for _ in range(n + 1)]
    for j in range(1, m + 1):
        backtrack[0][j] = (0, j - 1, "-", w[j - 1])

    grid = [[0] * (m + 1) for _ in range(n + 1)]
    for j in range(m + 1):
        grid[0][j] = -j * sigma

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match = scoring_matrix[v[i - 1]][w[j - 1]]
            deletion = grid[i - 1][j] - sigma
            insertion = grid[i][j - 1] - sigma
            diagonal = grid[i - 1][j - 1] + match

            best = max(deletion, insertion, diagonal)
            grid[i][j] = best

            # Ties are resolved in this fixed order: deletion, insertion, diagonal.
            if best == deletion:
                backtrack[i][j] = (i - 1, j, v[i - 1], "-")
            elif best == insertion:
                backtrack[i][j] = (i, j - 1, "-", w[j - 1])
            else:
                backtrack[i][j] = (i - 1, j - 1, v[i - 1], w[j - 1])

    # Pick the best end row in the last column. The search is seeded with row 0
    # (w aligned against gaps only) rather than a fixed sentinel, so the
    # reported score always belongs to a real cell and agrees with the path
    # even when every score is negative or w is empty.
    max_i = 0
    max_score = grid[0][m]
    for i in range(1, n + 1):
        if grid[i][m] > max_score:
            max_score = grid[i][m]
            max_i = i

    if max_i != n:
        backtrack[n][m] = (max_i, m, "", "")
        grid[n][m] = max_score

    return grid, backtrack

In [40]:
# Sample pair for the fitting alignment: w is fitted into a substring of v.
v, w = "GTAGGCTTAAGGTTA", "TAGATA"

In [51]:
# Fitting-alignment scores over the uppercase alphabet: +1 for a match, -1 for a mismatch.
fitting_scoring_matrix = defaultdict(dict)
for v1 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    for v2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        fitting_scoring_matrix[v1][v2] = 1 if v1 == v2 else -1

In [42]:
# Fitting alignment of the sample pair; score and both aligned rows on separate lines.
s, vv, ww = align(
    v,
    w,
    functools.partial(fitting_scoring_grid, scoring_matrix=fitting_scoring_matrix, sigma=1),
)
print(s, vv, ww, sep="\n")

2
TAGGCTTA
TAGA-T-A


In [45]:
v="CAATCACCCCAATCCCTCAATCCTGGCCCCACGCATAGGCTAATGCCAATCGCGGCCAGGGTATAACCGCCATAACTGTGGGTCAGAAGGGATAAGTTCCACAATCCTATTTTCCTCGAGGCGCTTCGATGCGTTAACGCGTACACTCTGTCGGCCAACCGTGTGGGAGCCGAATTGGCTGGGCTGTTGAACATTCTATCAGTAGATAAACGAAGGTACATCCGAGGTTGTCGATCGACCGCGGGGTCGTAGCGCGTGCATGTTCCTTTCAGGCCCACATACTCCGGAACGGTTCATATCACGACTATTCTTGCACAATCGGACAACGGTGTACCATGGTGGACACCGTAGGAGACCAATACTGCGTAAATCATAAGCATTGGAGAGTGGACTGCTAGCGAGGCTCACCATGGAGTCTCGGTCGGCATCTCCTGACTGCTGTTCCATCGCGTTTTTCTTTTACTCACGCAATAAATCAATACCCCCTAACACAGGCCTGCTCCAGCCTTATTAAGGCCATAGTAGCTCTACATGTAGACCGAACGGAAGCACAGTTTGGTAGAAATTCTTAATCGACTATGGTCCGTGCAGGCCAAAAAAGGAATAATCTTCGAATTCTCACGCCTTCATTAGGGCGCACATGGTGGGGTAAATCACTGCACTCTGTTCGCAGTTAAGCGTTGCAATCAATATCGGCAGAACTCGGAGTCCGTATAAAGCCGCCTCAGCGTGCACACGCCCGTGCGGCACGTCATTAGACGAGGATTCCGGGGGACTGGCCTGTTCGTAATCCACTAAAACAATGGTCCTACCATCTAAAACGCACCGTGTTCCCCTCTACGGGAACCCCCTAGAT"
w="AGAGCGCAGAGAAGTCATTAGAACATGTAGCACATCGCTTATTAAGGGTCAATACCTAAAGGGCCTAACTATACGCCACACGGAACAGCTC"

In [46]:
# Fitting alignment of the long pair defined in the previous cell.
s, vv, ww = align(
    v,
    w,
    functools.partial(fitting_scoring_grid, scoring_matrix=fitting_scoring_matrix, sigma=1),
)
print(s, vv, ww, sep="\n")

22
AGGGCGCACATG--GTGGGGTA-AATCAC-T-GCAC-TCTG-TTCGCAGTTAAGCGTTGCAATCAATATCGGC-AGAACTCGGAGTCCGTA--TAAAGCCGCCTCAGCGTGCACACGC-C
AGAGCGCAGA-GAAGTCAT-TAGAA-CATGTAGCACATC-GCTT---A-TTAAG-G--G---TCAATA-C--CTA-AA---GG-G-CC-TAACTATA-C-GCCACA-CG-GAACA-GCTC


In [47]:
with open("../data/dataset_248_5.txt", "r") as fin:
    lines = fin.read().strip().split("\n")
# The file is closed here; only its first two lines are used as the sequences.
v = lines[0].strip()
w = lines[1].strip()

In [48]:
# Fitting alignment of the dataset pair loaded in the previous cell.
s, vv, ww = align(
    v,
    w,
    functools.partial(fitting_scoring_grid, scoring_matrix=fitting_scoring_matrix, sigma=1),
)
print(s, vv, ww, sep="\n")

22
C-AATTGC-CGTACGTGTAATCTACGTCAAGAGACT--GTT-CGCCCGCGTA-TGGGATAGCAAAGACATAAACATCGTA-TTAGACTAGGGTTATGCCCAC-TAC
CCAAT--CACGCACAT-TCC-CTAAG--AAG-GACTAAGTAAC-CCCG-GAACTGTG-TAG----GAT-TAAATA-CGTACTTATTCTCG--T-ATGCTCAAGTAC


In [49]:
# Quiz: fitting alignment of the short string w into the longer string v (next two cells).

In [50]:
# Quiz inputs: w is the short string, v the longer one.
w, v = "GATACACT", "ACGACCACAGATACCGCTATTCACTATATCGTT"

In [52]:
# Fitting alignment for the quiz inputs.
s, vv, ww = align(
    v,
    w,
    functools.partial(fitting_scoring_grid, scoring_matrix=fitting_scoring_matrix, sigma=1),
)
print(s, vv, ww, sep="\n")

5
GATACCGCT
GATACA-CT


In [83]:
def overlap_scoring_grid(v, w, scoring_matrix, sigma=5):
    """Fill the score table and backpointers for overlapping a suffix of v with a prefix of w.

    A prefix of v may be skipped for free (column 0 is held at 0 and points
    straight back to (0, 0)), all of v's remaining symbols are aligned, and
    the alignment ends at the best-scoring cell of the last row, so a suffix
    of w may be left out.

    Args:
        v: first sequence; its symbols index the rows of the tables.
        w: second sequence; its symbols index the columns of the tables.
        scoring_matrix: nested mapping where scoring_matrix[a][b] is the score
            of pairing symbol a (from v) with symbol b (from w).
        sigma: penalty subtracted for every gap symbol.

    Returns:
        (grid, backtrack), both (len(v)+1) x (len(w)+1) lists of lists, with
        backtrack entries of the form (prev_i, prev_j, v_symbol, w_symbol).
        When the best last-row cell is not the bottom-right one, grid[n][m] is
        overwritten with the best score and backtrack[n][m] with a zero-cost
        jump (n, best_j, "", ""), so callers that read the bottom-right corner
        get the overlap result.
    """
    n = len(v)
    m = len(w)

    # Column 0 keeps the default (0, 0, "", "") entry: skipping a prefix of v
    # is free and emits nothing.
    backtrack = [[(0, 0, "", "")] * (m + 1) for _ in range(n + 1)]
    for j in range(1, m + 1):
        backtrack[0][j] = (0, j - 1, "-", w[j - 1])

    grid = [[0] * (m + 1) for _ in range(n + 1)]
    for j in range(m + 1):
        grid[0][j] = -j * sigma

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            match = scoring_matrix[v[i - 1]][w[j - 1]]
            diagonal = grid[i - 1][j - 1] + match
            deletion = grid[i - 1][j] - sigma
            insertion = grid[i][j - 1] - sigma

            best = max(deletion, insertion, diagonal)
            grid[i][j] = best

            # Ties are resolved in this fixed order: diagonal, deletion, insertion.
            if best == diagonal:
                backtrack[i][j] = (i - 1, j - 1, v[i - 1], w[j - 1])
            elif best == deletion:
                backtrack[i][j] = (i - 1, j, v[i - 1], "-")
            else:
                backtrack[i][j] = (i, j - 1, "-", w[j - 1])

    # Pick the best end column in the last row. The search is seeded with
    # column 0 (the empty overlap) rather than a fixed sentinel, so the
    # reported score always belongs to a real cell and agrees with the path.
    # ">=" keeps the right-most column among equally good ones.
    max_j = 0
    max_score = grid[n][0]
    for j in range(1, m + 1):
        if grid[n][j] >= max_score:
            max_score = grid[n][j]
            max_j = j

    if max_j != m:
        backtrack[n][m] = (n, max_j, "", "")
        grid[n][m] = max_score

    return grid, backtrack

In [84]:
# Sample pair for the overlap alignment (suffix of v against prefix of w).
v, w = "PAWHEAE", "HEAGAWGHEE"

In [85]:
# Overlap-alignment scores over the uppercase alphabet: +1 for a match, -2 for a mismatch.
overlap_scoring_matrix = defaultdict(dict)
for v1 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    for v2 in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        overlap_scoring_matrix[v1][v2] = 1 if v1 == v2 else -2

In [86]:
# Overlap alignment of the sample pair with a gap penalty of 2.
s, vv, ww = align(
    v,
    w,
    functools.partial(overlap_scoring_grid, scoring_matrix=overlap_scoring_matrix, sigma=2),
)

In [87]:
# Score first, then the two aligned rows, one per line.
print(s, vv, ww, sep="\n")

1
HEAE
HEAG


In [88]:
with open("../data/dataset_248_7.txt", "r") as fin:
    lines = fin.read().strip().split("\n")
# The file is closed here; only its first two lines are used as the sequences.
v = lines[0].strip()
w = lines[1].strip()

In [89]:
# Overlap alignment of the dataset pair loaded in the previous cell.
s, vv, ww = align(
    v,
    w,
    functools.partial(overlap_scoring_grid, scoring_matrix=overlap_scoring_matrix, sigma=2),
)
print(s, vv, ww, sep="\n")

43
CGCTATGTACAGTTCATGCCCAAAAGGC-GTGGTGCAAGAACATGCAT-CGCGC-AAGCGCC--GTGCA-GGAAGATTCTGGCCCAGATGATGT-CCT-T--AACCGCACAAGATATTATTTACCGTA-CTAAGAACGGGCTGGTCAATGGAGGTACGACTGCCAACAAAGATCTTGTTGGTAATGCGACGGAGTTGAC-AAGCCATGCCGATGATGAATAACTTT-AAG-GACAATGGAGCATGTGTTTATCG-G-TC
C-CAATGTTCCGTACATG--C-AAAGGCTGTGGTGCAAGTACAT-C-TCCTCGCAAAGCGCCATGTGGATTAAAG-TTCT-GCCCAGAT-A-GTAACTATACAATCG--CAAGGTGCCACTTACCGTACCTAAGCACGGGATGGT-TAGGGAGGTGCG-CTGCCCAC-AA-CTCCCG-TCGTAA-GTGTCGGAGTTGACTAAGCCATG-CAATGGTG-ATAACTTTGCAGTGTCAATGG-GCAT-T-TGTATCGCGTTC
