# ASO off-target finder (v1.0)

In [5]:
# Function for distance matrix and match finding

def find_match(aso, genome, allowed_difference):
    """Count approximate occurrences of ``aso`` in ``genome`` and print a summary.

    This is the edit-distance dynamic programme for approximate matching:
    row 0 is all zeros, so an alignment may begin at any genome position,
    and column 0 of row ``i`` is ``i``. After the last ASO base has been
    processed, cell ``j`` of the final row holds the smallest number of
    mismatches/indels needed to align the whole ASO to some stretch of the
    genome that ends at position ``j``.

    Every cell of the final row whose distance does not exceed
    ``allowed_difference`` is counted as one match. The count is therefore
    per end position (column 0, the empty genome prefix, included), so
    adjacent end positions belonging to the same hit are counted separately.
    Bases are compared with ``==``, i.e. the comparison is case-sensitive.

    Only the previous and the current row are kept in memory, because each
    cell depends solely on its left, upper and upper-left neighbours. Memory
    use grows with ``len(genome)`` instead of ``len(aso) * len(genome)``.

    Args:
        aso: Query sequence (the antisense oligonucleotide).
        genome: Reference sequence to search in.
        allowed_difference: Largest distance that still counts as a match.

    Returns:
        None. One line per distinct distance value is printed, in ascending
        order of distance; nothing is printed when no cell qualifies.
    """
    genome_length = len(genome)

    # Row 0 of the matrix: an alignment can start anywhere at no cost.
    previous_row = [0] * (genome_length + 1)

    for i in range(1, len(aso) + 1):
        aso_base = aso[i - 1]

        # First column: aligning i ASO bases against nothing costs i.
        current_row = [i] + [0] * genome_length

        for j in range(1, genome_length + 1):
            matrix_left = current_row[j - 1] + 1
            matrix_above = previous_row[j] + 1

            # Diagonal step is free only when the two bases are identical.
            if aso_base == genome[j - 1]:
                matrix_diagonal = previous_row[j - 1]
            else:
                matrix_diagonal = previous_row[j - 1] + 1

            current_row[j] = min(matrix_left, matrix_above, matrix_diagonal)

        previous_row = current_row

    # previous_row now holds the final matrix row (row 0 when aso is empty).
    match_result = dict()
    for distance in previous_row:
        if distance > allowed_difference:
            continue
        match_result[distance] = match_result.get(distance, 0) + 1

    for key, value in sorted(match_result.items()):
        print('Total %s matches with %s indel/mismatches' % (value, key))

Below is an example pair of sequences from the Coursera lecture.

    seq1 = GCGTATGC
    seq2 = TATTGGCTATACGGTT
    
Aligning seq1 to seq2 gives 1 mismatch and 1 insertion, i.e. a total distance of 2.

In [2]:
# Lecture example: a short query (aso) searched against a short reference (genome).
aso = 'GCGTATGC'
genome = 'TATTGGCTATACGGTT'

# Count end positions in the genome where the ASO aligns with at most 2 differences.
find_match(aso, genome, 2)

Total 1 matches with 2 indel/mismatches


In [2]:
# Code from Coursera lecture that reads FASTA files

def read_fasta(filename):
    """Return the sequence text stored in a FASTA file as one string.

    Lines that begin with '>' are skipped. Every other line has its
    trailing whitespace stripped and is appended in file order, so a file
    with several records yields all of their sequences joined together.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated sequence lines as a single ``str``.
    """
    with open(filename) as handle:
        return ''.join(
            row.rstrip() for row in handle if not row.startswith('>')
        )

In [3]:
# Sanity check: display the sequence parsed from the example ASO FASTA file.
read_fasta('example_aso.fasta')

'GCGTATGC'

In [4]:
# Load both sequences from FASTA files instead of inline string literals.
genome_sequence = read_fasta('example_genome.fasta')
aso_sequence = read_fasta('example_aso.fasta')

# Search with at most 2 indels/mismatches allowed.
find_match(aso_sequence, genome_sequence, 2)

Total 1 matches with 2 indel/mismatches


In [6]:
# Vary the allowed indel/mismatch threshold to see how the reported matches change

genome_sequence = read_fasta('example_genome.fasta')
aso_sequence = read_fasta('example_aso.fasta')

# Stricter threshold: only end positions with distance <= 1 are counted.
print('Allowed indel/mismatch: 1')
find_match(aso_sequence, genome_sequence, 1)

# Looser threshold: end positions with distance <= 3 are counted.
print('Allowed indel/mismatch: 3')
find_match(aso_sequence, genome_sequence, 3)

Allowed indel/mismatch: 1
Allowed indel/mismatch: 3
Total 1 matches with 2 indel/mismatches
Total 3 matches with 3 indel/mismatches


`human.fasta` contains a sequence taken from part of chromosome 1 of the human genome (GRCh38).

Extracted a subsequence of 33 bases that appears only once in the FASTA file:

    AATCGGGTGGCTTTAACTAATGAAAATAGAATG

Changed 4 bases of the original sequence (each substituted base is followed by the original base in parentheses):

    AATG(C)GGGTGGCA(T)TTAAT(C)TAATGAAG(A)ATAGAATG

In [8]:
# Search the human chromosome 1 excerpt for the ASO that carries 4 substituted bases.
genome_sequence = read_fasta('human.fasta')
aso_sequence = 'AATGGGGTGGCATTAATTAATGAAGATAGAATG'

# Allow up to 4 indels/mismatches, matching the number of substitutions made.
find_match(aso_sequence, genome_sequence, 4)

Total 1 matches with 4 indel/mismatches


In [9]:
# Repeat the human chromosome 1 search with the same modified ASO.
genome_sequence = read_fasta('human.fasta')
aso_sequence = 'AATGGGGTGGCATTAATTAATGAAGATAGAATG'

# Stricter limit of 3 differences, one fewer than the 4 substituted bases.
find_match(aso_sequence, genome_sequence, 3)