In [1]:
def hamming_distance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence, which must have the same length as ``s1``.

    Returns:
        The number of index positions where the paired elements are unequal.

    Raises:
        ValueError: If the two sequences do not have the same length.
    """
    if len(s1) == len(s2):
        mismatches = 0
        for left, right in zip(s1, s2):
            # A comparison result of True adds 1, False adds 0.
            mismatches += (left != right)
        return mismatches

    raise ValueError("Strings must be of the same length")

In [2]:
def levenshtein_distance(s1, s2):
    """Compute the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``. The
    inputs may have different lengths; an equal-length requirement would
    rule out every pair that needs an insertion or deletion.

    Args:
        s1: First sequence (supports ``len`` and iteration).
        s2: Second sequence (supports ``len`` and iteration).

    Returns:
        The edit distance as a non-negative integer.
    """
    # previous_row[j] holds the distance between the first i-1 elements of s1
    # and the first j elements of s2; row 0 is "insert j elements".
    previous_row = list(range(len(s2) + 1))

    for i, item1 in enumerate(s1, start=1):
        # Column 0: deleting all i elements of s1 to reach the empty prefix.
        current_row = [i]
        for j, item2 in enumerate(s2, start=1):
            insertion = current_row[j - 1] + 1
            deletion = previous_row[j] + 1
            # Substitution is free when the two elements already match.
            substitution = previous_row[j - 1] + (item1 != item2)
            current_row.append(min(insertion, deletion, substitution))
        previous_row = current_row

    return previous_row[len(s2)]

In [19]:
def grantham_distance(s1, s2):
    """Sum pairwise Grantham scores over two equal-length sequences.

    Positions are compared pairwise. A position is skipped (neither scored
    nor counted) when either symbol is a gap ``"-"``, a stop ``"*"``, or is
    not a key of the score table below. Lookups are case-sensitive: only
    the upper-case one-letter codes present in the table are scored.

    Args:
        s1: First sequence of one-letter residue codes.
        s2: Second sequence, same length as ``s1``.

    Returns:
        A tuple ``(distance, compared_sites)``: the summed score and the
        number of positions that contributed to it.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    grantham_matrix = {
        'A': {'A':0,'R':112,'N':111,'D':126,'C':195,'Q':91,'E':107,'G':60,'H':86,'I':94,'L':96,'K':106,'M':84,'F':113,'P':27,'S':99,'T':58,'W':148,'Y':112,'V':64},
        'R': {'A':112,'R':0,'N':86,'D':96,'C':180,'Q':43,'E':54,'G':125,'H':29,'I':97,'L':102,'K':26,'M':91,'F':97,'P':103,'S':110,'T':71,'W':101,'Y':77,'V':96},
        'N': {'A':111,'R':86,'N':0,'D':23,'C':139,'Q':46,'E':42,'G':80,'H':68,'I':149,'L':153,'K':94,'M':142,'F':158,'P':91,'S':46,'T':65,'W':174,'Y':143,'V':133},
        'D': {'A':126,'R':96,'N':23,'D':0,'C':154,'Q':61,'E':45,'G':94,'H':81,'I':168,'L':172,'K':101,'M':160,'F':177,'P':108,'S':65,'T':85,'W':181,'Y':160,'V':152},
        'C': {'A':195,'R':180,'N':139,'D':154,'C':0,'Q':154,'E':170,'G':159,'H':174,'I':198,'L':198,'K':202,'M':196,'F':205,'P':169,'S':112,'T':149,'W':215,'Y':194,'V':192},
        'Q': {'A':91,'R':43,'N':46,'D':61,'C':154,'Q':0,'E':29,'G':87,'H':24,'I':109,'L':113,'K':53,'M':101,'F':116,'P':76,'S':68,'T':42,'W':130,'Y':99,'V':96},
        'E': {'A':107,'R':54,'N':42,'D':45,'C':170,'Q':29,'E':0,'G':98,'H':40,'I':134,'L':138,'K':56,'M':126,'F':140,'P':93,'S':80,'T':65,'W':152,'Y':122,'V':121},
        'G': {'A':60,'R':125,'N':80,'D':94,'C':159,'Q':87,'E':98,'G':0,'H':98,'I':135,'L':138,'K':127,'M':127,'F':153,'P':42,'S':56,'T':59,'W':184,'Y':147,'V':109},
        'H': {'A':86,'R':29,'N':68,'D':81,'C':174,'Q':24,'E':40,'G':98,'H':0,'I':94,'L':99,'K':32,'M':87,'F':100,'P':77,'S':89,'T':47,'W':115,'Y':83,'V':84},
        'I': {'A':94,'R':97,'N':149,'D':168,'C':198,'Q':109,'E':134,'G':135,'H':94,'I':0,'L':5,'K':102,'M':10,'F':21,'P':95,'S':142,'T':89,'W':61,'Y':33,'V':29},
        'L': {'A':96,'R':102,'N':153,'D':172,'C':198,'Q':113,'E':138,'G':138,'H':99,'I':5,'L':0,'K':107,'M':15,'F':22,'P':98,'S':145,'T':92,'W':61,'Y':36,'V':32},
        'K': {'A':106,'R':26,'N':94,'D':101,'C':202,'Q':53,'E':56,'G':127,'H':32,'I':102,'L':107,'K':0,'M':95,'F':102,'P':103,'S':121,'T':78,'W':110,'Y':85,'V':97},
        'M': {'A':84,'R':91,'N':142,'D':160,'C':196,'Q':101,'E':126,'G':127,'H':87,'I':10,'L':15,'K':95,'M':0,'F':28,'P':87,'S':135,'T':81,'W':67,'Y':36,'V':21},
        'F': {'A':113,'R':97,'N':158,'D':177,'C':205,'Q':116,'E':140,'G':153,'H':100,'I':21,'L':22,'K':102,'M':28,'F':0,'P':114,'S':155,'T':103,'W':40,'Y':22,'V':50},
        'P': {'A':27,'R':103,'N':91,'D':108,'C':169,'Q':76,'E':93,'G':42,'H':77,'I':95,'L':98,'K':103,'M':87,'F':114,'P':0,'S':74,'T':38,'W':147,'Y':110,'V':68},
        'S': {'A':99,'R':110,'N':46,'D':65,'C':112,'Q':68,'E':80,'G':56,'H':89,'I':142,'L':145,'K':121,'M':135,'F':155,'P':74,'S':0,'T':58,'W':177,'Y':144,'V':124},
        'T': {'A':58,'R':71,'N':65,'D':85,'C':149,'Q':42,'E':65,'G':59,'H':47,'I':89,'L':92,'K':78,'M':81,'F':103,'P':38,'S':58,'T':0,'W':128,'Y':92,'V':69},
        'W': {'A':148,'R':101,'N':174,'D':181,'C':215,'Q':130,'E':152,'G':184,'H':115,'I':61,'L':61,'K':110,'M':67,'F':40,'P':147,'S':177,'T':128,'W':0,'Y':37,'V':88},
        'Y': {'A':112,'R':77,'N':143,'D':160,'C':194,'Q':99,'E':122,'G':147,'H':83,'I':33,'L':36,'K':85,'M':36,'F':22,'P':110,'S':144,'T':92,'W':37,'Y':0,'V':55},
        'V': {'A':64,'R':96,'N':133,'D':152,'C':192,'Q':96,'E':121,'G':109,'H':84,'I':29,'L':32,'K':97,'M':21,'F':50,'P':68,'S':124,'T':69,'W':88,'Y':55,'V':0}
    }

    if len(s1) != len(s2):
        raise ValueError("Sequences must be of the same length")

    # Symbols that mark a position as not comparable: alignment gap and stop.
    ignored_symbols = ("-", "*")

    total_score = 0
    n_scored = 0
    for residue_a, residue_b in zip(s1, s2):
        if residue_a in ignored_symbols or residue_b in ignored_symbols:
            continue

        # Unknown residues on either side leave the position unscored.
        row = grantham_matrix.get(residue_a)
        if row is None:
            continue
        pair_score = row.get(residue_b)
        if pair_score is None:
            continue

        total_score += pair_score
        n_scored += 1

    return total_score, n_scored

In [20]:
# Sanity check: A/A and R/R score 0 and N/D scores 23 in the matrix, and all
# three positions are scorable, so the expected result is (23, 3).
grantham_distance("ARN","ARD")

A A
R R
N D
23 3


(23, 3)

In [21]:
from Bio.SeqIO import parse
# Print the ID of every record in the DENV1 test FASTA. The file is opened in
# a `with` block so the handle is closed once iteration finishes; a bare
# open() would leave it open for the lifetime of the kernel.
with open("../resources/test/seqs_DENV1.fasta") as file:
    for record in parse(file, "fasta"):
        print(record.id)

2017.775
2017.315
2017.288
2016.874


In [5]:
from Bio.SeqIO import parse
from Bio import pairwise2
from Bio.pairwise2 import format_alignment 

# Translated reference sequence; stays None if the reference FASTA has no
# records. If the file holds several records, the last one wins.
ref_denv1 = None
with open("../resources/test/ref_DENV1.fasta") as file:
    for record in parse(file, "fasta"):
        # Assign to `ref_denv1` so the sentinel above is actually updated
        # (assigning only to `ref` left `ref_denv1` permanently None).
        ref_denv1 = record.seq.translate()
        print(ref_denv1)

# Keep the `ref` name bound as well, since the alignment steps sketched below
# refer to it; it is None when no reference record was read.
ref = ref_denv1

# Translate every sample sequence and print it, separated by a ruler line.
with open("../resources/test/seqs_DENV1.fasta") as file:
    for record in parse(file, "fasta"):

        seq = record.seq.translate()

        # TODO: alignment + Grantham scoring against the reference is not
        # enabled yet; the intended steps are kept here for reference.
        #alignment = pairwise2.align.globalxx(ref, seq)[0]

        #aligned_ref = alignment.seqA
        #aligned_seq = alignment.seqB

        print(seq)
        print('#'*60)
        #print(format_alignment(*alignment))
        #grantham_distance(aligned_ref,aligned_seq)



VDRKEQFRIGSLLNVVLTVFY*RADL**TTNEKRRLDRLSIC*NARETACQLVHSWRRDSQKDCFQAKDP*NW*WLS*HS*DF*PYPQQQEFWLDGAHSRRMERSKCYGVSRKKSQTC*T**IEGKDL*PCSSCCCPQPWRSI*LHEGESHT**LASRKEKSHSCLRPL*VSTCAPL*RWIWESYVRTQ*LTNALELLRRNQMTLIVGAMLQTHG*PMEHVPKLASTDGTNVPSHWPHTWDLV*KQEPKRGCPLKALGNKYKEWRLGLCDTQDSR**PFFLHMP*EHPSLRKGLFSFC*C**HHPWPCDAWE*AAGTSWKDYQEQLG*TWYWNMEVASPPWQKTNQHWTLNS*KRRSRTLPSCANCALKLKYQTPPPIQDVQHKEKLHWWKNKTRTLCVDERSWTEAGVMAADYLEKEAY*RVLSSSV*QN*KER*FNMKT*NIQ**SLSTLGTSTRWETRLQNMEQLQP*HLKLLRRKYS*PTTEPSHWTAHLELGWTLMRWCY*Q*KKNHGLFTNNGF*TYHCLGLRGLQHPKRLGTDKICWSHSRQLMQRSRK*SYWDHRKEQCTLR*LGRQKSKRLERQQFLQDT*NVD*KWTN*L*KGCHM*CAQAHLS*RRKWLRPSMELF*CRLNTKEQMHHARSPFRPKMRKE*PRID**QPIL*LLTKKNQSTLRQNHLLVRATSW*GQVKKL*NNAGSRKEAA*GKCSKQPPEEHEGWLSWETPHGTSVL*EECSRLWEN*CIRFLEPHMGFCSAVFLGP*K*E*GFC*HGWD*IQGARHFR*RALQLAWSHCT*ESWFKRTRDV*STGRAENSNVEVAFLSLMKSTLGQSNTNFKLTPQKDYQQPSERHGRRVCVEFDQPLVSRTSCGSKYQMN*TTSYLKMT*NSQWL*EMLLGSWPKGKK*LDHNPWNTNTHGKAGEKPKS*EQTYRTPPSSLTAQILQNVLMTKEHGTFGKLRTMGSEFSRQTYG*NCVTPTPKCVTTG*CQLPSRTARQSMLIWG