In [7]:
def paired_composition(dna_string, k, d):
    pairs = []
    for i in range(len(dna_string)-2*k-d+1):
        tpl = (dna_string[i:i+k], dna_string[i+k+d:i+2*k+d])
        pairs.append(tpl)
    return sorted(pairs)

In [11]:
def print_pairs(pair_list):
    return " ".join([f"({p1}|{p2})" for p1, p2 in pair_list])
        

In [12]:
input = "TAATGCCATGGGATGTT"
print_pairs(paired_composition(input, 3, 1))

'(AAT|CCA) (ATG|CAT) (ATG|GAT) (CAT|GGA) (CCA|GGG) (GCC|TGG) (GGA|GTT) (GGG|TGT) (TAA|GCC) (TGC|ATG) (TGG|ATG)'

In [13]:
print_pairs(paired_composition("TAATGCCATGGGATGTT", 3, 2))

'(AAT|CAT) (ATG|ATG) (ATG|ATG) (CAT|GAT) (CCA|GGA) (GCC|GGG) (GGG|GTT) (TAA|CCA) (TGC|TGG) (TGG|TGT)'

#### read pairs

In [1]:
from collections import defaultdict
from utils import *

In [66]:
k = 4
d = 2
input = [
    "GAGA|TTGA",
    "TCGT|GATG",
    "CGTG|ATGT",
    "TGGT|TGAG",
    "GTGA|TGTT",
    "GTGG|GTGA",
    "TGAG|GTTG",
    "GGTC|GAGA",
    "GTCG|AGAT"
]

In [67]:
def debruijn_from_readpairs(patterns, k, d):
    graph = defaultdict(list)
    for pattern in patterns:
        kmer1, kmer2 = pattern.strip().split('|')
        prefix = f"{kmer1[0:k-1]}|{kmer2[0:k-1]}"
        suffix = f"{kmer1[1:k]}|{kmer2[1:k]}"
        graph[prefix].append(suffix)
    return graph

In [68]:
g = debruijn_from_readpairs(input, k, d)
g

defaultdict(list,
            {'GAG|TTG': ['AGA|TGA'],
             'TCG|GAT': ['CGT|ATG'],
             'CGT|ATG': ['GTG|TGT'],
             'TGG|TGA': ['GGT|GAG'],
             'GTG|TGT': ['TGA|GTT'],
             'GTG|GTG': ['TGG|TGA'],
             'TGA|GTT': ['GAG|TTG'],
             'GGT|GAG': ['GTC|AGA'],
             'GTC|AGA': ['TCG|GAT']})

In [69]:
ep = eulerian_path(g)
ep

['GTG|GTG',
 'TGG|TGA',
 'GGT|GAG',
 'GTC|AGA',
 'TCG|GAT',
 'CGT|ATG',
 'GTG|TGT',
 'TGA|GTT',
 'GAG|TTG',
 'AGA|TGA']

In [70]:
def pairpath_to_genome(pairs, k, d):
    k = k -1
    genome = pairs[0].split("|")[0]
    for i, pair in enumerate(pairs[1:]):
        prefix = pair.split("|")[0]
        prev_pair = pairs[i].split("|")[0]
        assert prefix[0:k-1] == prev_pair[1:k]
        genome += prefix[-1]
    prev_pair = pairs[-1].split("|")[0]
    for i in range(len(pairs)-k-d-1,len(pairs)):
        prefix = pairs[i].split("|")[1]
        print(prefix, prev_pair)
        assert prefix[0:k-1] == prev_pair[1:k]
        genome += prefix[-1]
        prev_pair = prefix
    return genome

In [72]:
reconstructed_genome = pairpath_to_genome(ep, k, d)

GAT AGA
ATG GAT
TGT ATG
GTT TGT
TTG GTT
TGA TTG


In [73]:
assert reconstructed_genome == "GTGGTCGTGAGATGTTGA"

In [89]:
read_pairs = []
with open("../data/dataset_204_16.txt", "r") as fin:
    for i, line in enumerate(fin):
        if i == 0:
            kstr, dstr = line.strip().split(" ")
            k = int(kstr.strip())
            d = int(dstr.strip())
        else:
            read_pairs.append(line.strip())

In [90]:
print(k, d)
g = debruijn_from_readpairs(read_pairs, k, d)
ep = eulerian_path(g)
reconstructed_genome = pairpath_to_genome(ep, k, d)

50 200
CAGACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACT ACAGACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAAC
AGACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTC CAGACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACT
GACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCA AGACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTC
ACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCAC GACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCA
CTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACT ACTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCAC
TAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTA CTAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACT
AAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTAC TAAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTA
AATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTACA AAATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTAC
ATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTACAG AATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTACA
TCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCACTACAGT ATCGGCAAGAGCAGATTAACTCAGCCACTGCCGCGCCAACTCA

In [91]:
reconstructed_genome

'GAGTACACTATAGGAGAACCGATTATCGCTGCTGACGTATTGCGCCCGCAAAAGATGGGAATGATTGCTTTGCGTTTCTGACGACGATCTGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGAGGCCGACCCTTCAATGCTAAGCAGCACGGTGCTCCACTGATGACCAGAAGACAAGATTCAAAGCTCAGTGCCCATCAGGTCGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGAGCCTCGCAACACATTCTGTTCGTGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGAGAGGAAGGTTGCCAACCTTAATGAGCTGGGCCACGATTCGAAGGCTAGCCAATGGTTGACGATGGCCTCAAAGATGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGAACCGCGTCACGGGTCAGATTATATAACCCCCCACTGCCGTTTGCTACGAGATTGAGCTGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGAACGTGGTGTTGGCGAGGATTCATAGTGAGGTAGGTCGGGAGGACGAGACCTCGCTACATACCTGCCACGCGCTAAACAATAAGCGTTCCCCTTCTAGAAGGTCTGTCAGTGTATCACGGTATCTCCACTGAAATTATTCCAACGGCATCTATGATAAACTGACAACCTCCCTAATTCCAGTATATCCTTTGGGAACTTAGGAACTACCTTGCACAGTAATAGTCAAGTACCGGGCTATACCGAGTGAATGTCCAGAGCAAAGCGTAAGAGATAACTATAATCGTTTGCTTACTATTGAAGACCCGTCAGTTTGGATATGTTCGGCAACTCCTATCGTTGCTCAGTCTGACGCCGTTAAAAGCGACCGCAATATAGGTAAATGTCCTGTAACAGATCCTGATCCCCCTCGTAGACATTGTTGG