In [3]:
import bio_lib_string_rs

In [24]:
def convert_dna_to_rna_native(s: str) -> str: 
    return s.replace("T", "U")

def convert_dna_to_rna(s: str) -> str: 
    return ''.join(["U" if char == 'T' else char for char in s])

In [25]:
with open('./data/rosalind_rna.txt') as f:
    dna = f.read().strip()

In [26]:
%%timeit
convert_dna_to_rna(dna)

47.4 µs ± 272 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
%%timeit
bio_lib_string_rs.convert_dna_to_rna(dna)

75.6 µs ± 423 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
%%timeit
convert_dna_to_rna_native(dna)

817 ns ± 7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [None]:
%%timeit
np.char.replace(dna, "T", "U")

7.17 µs ± 19.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [29]:
%%timeit
bio_lib_string_rs.convert_dna_to_rna_native(dna)

105 µs ± 6.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [30]:
with open('./benchmark-data/rna-large.txt') as f:
    dna_large = f.read().strip()

In [31]:
%%timeit
convert_dna_to_rna(dna_large)

4.5 ms ± 250 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
%%timeit
bio_lib_string_rs.convert_dna_to_rna(dna_large)

7.28 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
%%timeit
convert_dna_to_rna_native(dna_large)

57.1 µs ± 3.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [34]:
%%timeit
bio_lib_string_rs.convert_dna_to_rna_native(dna_large)

9.26 ms ± 58.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [83]:
%%timeit
np.char.replace(dna_large, "T", "U")

228 µs ± 1.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [35]:
from dataclasses import dataclass
from typing import List

@dataclass
class PalindromeLocation:
    start_index: int
    length: int


def find_reverse_palindomes(seq: str) -> List[PalindromeLocation]:
    min_len = 4
    max_len = 12
    locations = []
    for i in range(0, len(seq) - min_len + 1):
        for length in range(min_len, max_len + 1, 2):
            if i + length > len(seq):
                continue
            test_seq = seq[i:(i + length)]
            if is_reverse_palindrome(test_seq):
                locations.append(
                    PalindromeLocation(start_index=i + 1, length=length)
                )
    return locations


def is_reverse_palindrome(seq: str) -> bool:
    return seq == reverse_complement_dna(seq)


def reverse_complement_dna(dna_seq: str) -> str:
    return ''.join([dna_base_complement(b) for b in dna_seq[::-1]])


def dna_base_complement(base: str) -> str:
    if base == "A":
        return "T"
    elif base == "T":
        return "A"
    elif base == "G":
        return "C"
    elif base == "C":
        return "G"
    else:
        raise Exception("Non-DNA base \"{}\" found.".format(base))


seq = "TCAATGCATGCGGGTCTATATGCAT"
test_answer = find_reverse_palindomes(seq)
test_answer = [(p.start_index, p.length) for p in test_answer]
true_answer = [
    (4, 6),
    (5, 4),
    (6, 6),
    (7, 4),
    (17, 4),
    (18, 4),
    (20, 6),
    (21, 4),
]
assert true_answer == test_answer

In [36]:
with open("./data/rosalind_revp.txt") as f:
    data = f.readlines()

input = "".join([l.strip() for l in data[1:]])

# _ = [print(p.start_index, p.length) for p in find_reverse_palindomes(input)]

In [37]:
%%timeit
_ = find_reverse_palindomes(input)

8.85 ms ± 377 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%%timeit
_ = bio_lib_string_rs.find_reverse_palindomes(input)

6.11 ms ± 28.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [39]:
with open("./benchmark-data/revp-large.txt") as f:
    data_large = f.readlines()[0]

In [40]:
%%timeit
_ = find_reverse_palindomes(data_large)

880 ms ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
s = np.array(list("ATC"))
ss = reverse_complement_dna(s)
is_reverse_palindrome(ss)
ss
s

array(['A', 'T', 'C'], dtype='<U1')

In [41]:
%%timeit
_ = bio_lib_string_rs.find_reverse_palindomes(data_large)

612 ms ± 3.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [78]:
import numpy as np
import numpy.typing as npt

def find_reverse_palindomes_np(seq: str) -> List[PalindromeLocation]:
    min_len = 4
    max_len = 12
    locations = []
    np_seq: npt.ArrayLike = np.array(list(seq))
    for i in range(0, len(np_seq) - min_len + 1):
        for length in range(min_len, max_len + 1, 2):
            if i + length > len(np_seq):
                continue
            test_seq = np_seq[i:(i + length)]
            if is_reverse_palindrome_np(test_seq):
                locations.append(
                    PalindromeLocation(start_index=i + 1, length=length)
                )
    return locations

def is_reverse_palindrome_np(seq: npt.ArrayLike) -> bool:
    return np.array_equal(seq, map(dna_base_complement,np.flip(seq)) )


In [80]:
%%timeit
_ = find_reverse_palindomes_np(data_large)

1.87 s ± 69.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
