In [1]:
%load_ext autoreload
%autoreload 2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
import rpy2
from pprint import pprint
import re
import numpy as np
import pickle
from collections import OrderedDict
from itertools import repeat, count, product
from skbio.alignment import StripedSmithWaterman, local_pairwise_align_ssw
import sys
sys.path.append("../../..")
from analysis_tools import KmerAligner
%run ../lib.ipynb
import pysam

In [2]:
# Load the per-read records used throughout this notebook. Later cells treat
# each record as a dict with the keys "channel", "file_id", "called_seq",
# "called_qual" and "events".
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by a trusted source.
with open("file_data_phage.pickle", 'rb') as pickle_file:
    # The context manager closes the file handle once loading has finished.
    file_data = pickle.load(pickle_file)

In [3]:
# Read the reference sequence from the FASTA file. Lines containing ">" are
# header lines and are skipped (same filter as `grep -v ">"`); all remaining
# lines are concatenated so that a line-wrapped FASTA yields the complete
# sequence instead of only its first line.
with open("lambda_ref.fasta") as fasta_file:
    ref = "".join(line.strip() for line in fasta_file if ">" not in line)

In [4]:
# Dump every record of file_data as one four-line FASTQ entry:
# header, called bases, separator, quality string.
with open("lambda_all_reads.fastq", 'w') as fastq_out:
    for file_obj in file_data:
        # The read name encodes channel and file id
        header = "@ch{0}_file{1}_template\n".format(file_obj["channel"], file_obj["file_id"])
        fastq_out.writelines([
            header,
            file_obj["called_seq"] + "\n",
            "+\n",
            file_obj["called_qual"] + "\n",
        ])


In [5]:
# Run the GraphMap aligner on the FASTQ written above, using lambda_ref.fasta as
# the reference and writing the alignments to lambda_all_reads.sam.
# NOTE(review): -t4 presumably selects four threads -- confirm against the GraphMap usage text.
!../../../../tools/graphmap/graphmap -t4 -r lambda_ref.fasta -d lambda_all_reads.fastq -o lambda_all_reads.sam

[Index 14:44:54] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 14:44:54] Index is not prebuilt. Generating index.
[LoadOrGenerate 14:44:54] Started generating new index from file 'lambda_ref.fasta'...
[LoadOrGenerate 14:44:55] Storing new index to file 'lambda_ref.fasta.gmidx'...
[LoadOrGenerate 14:44:56] New index stored.
[Index 14:44:56] Secondary index is not prebuilt. Generating index.
[LoadOrGenerate 14:44:56] Started generating new index from file 'lambda_ref.fasta'...
[LoadOrGenerate 14:44:56] Storing new index to file 'lambda_ref.fasta.gmidxsec'...
[LoadOrGenerate 14:44:58] New index stored.
[Index 14:44:58] Index loaded in 1.07 sec.
[Index 14:44:58] Memory consumption: [currentRSS = 482 MB, peakRSS = 610 MB]

[Run 14:44:58] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 14:44:58] Reference genome is assumed to be linear.
[Run 14:44:58] Only one alignment will be reported

In [6]:
# prepare_sam is not defined in this notebook (presumably it comes from ../lib.ipynb).
# NOTE(review): it presumably turns lambda_all_reads.sam into the
# lambda_all_reads.sorted.bam file opened further down -- confirm.
prepare_sam("lambda_all_reads")

[samopen] SAM header is present: 1 sequences.


In [24]:
# Open the sorted BAM file holding the alignments of all reads.
samfile = pysam.AlignmentFile("lambda_all_reads.sorted.bam")
# NOTE(review): samreads is initialised here but not used in any cell visible in this notebook.
samreads = {}

In [25]:
# Number of records loaded from the pickle file (before alignment).
print(len(file_data))

6123


In [26]:
# Instantiate the project's KmerAligner with an empty string as its argument.
# NOTE(review): kmer_aligner is not referenced by any cell visible in this notebook -- confirm it is still needed.
kmer_aligner = KmerAligner("")

Query Length 0


In [27]:
# Index the records of file_data by a "ch<channel>_file<file_id>" key so that
# they can be looked up by channel and file id.
fmap = {}
for f in file_data:
    lookup_key = "ch{0}_file{1}".format(f["channel"], f["file_id"])
    fmap[lookup_key] = f

In [28]:
def get_file(channel, file_id):
    """Return the record stored in ``fmap`` for the given channel and file id.

    The lookup key has the form ``ch<channel>_file<file_id>``; a ``KeyError``
    is raised when no such record exists.
    """
    lookup_key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[lookup_key]

In [29]:
# Materialise everything yielded by samfile.fetch() into a list and show its size.
reads = list(samfile.fetch())
len(reads)

5709

In [30]:
# Length k of the k-mers handled below: event_indexes collects NMER non-gap
# positions and is_correct_kmer asserts that its index list has this length.
NMER = 5

In [31]:
class AlignmentEndException(Exception):
    """Raised by event_indexes when the pairing array ends before a complete k-mer could be collected."""
    pass

def event_indexes(pairing_seq, offset, k=None):
    """Collect the positions of the next ``k`` non-gap entries of a pairing array.

    Parameters:
        pairing_seq: sequence whose gap entries are ``None``.
        offset: index in ``pairing_seq`` at which the scan starts.
        k: number of non-gap entries to collect; defaults to the
            notebook-wide ``NMER`` when omitted.

    Returns:
        A list with the ``k`` indexes (into ``pairing_seq``) of the first
        non-gap entries found at or after ``offset``.

    Raises:
        AlignmentEndException: if fewer than ``k`` non-gap entries remain.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # Stop scanning as soon as the k-mer is complete.
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [32]:
def gapmove(to_move, seq, offset):
    """Translate a move in read coordinates into a move in the aligned sequence.

    Walks through ``seq`` starting at ``offset`` until ``to_move`` non-gap
    entries have been passed. Every gap (``None``) met on the way adds one to
    the returned step, because gaps occupy a slot in the aligned sequence
    without consuming a read position.

    Parameters:
        to_move: number of non-gap entries to advance by ('move' from metrichor).
        seq: aligned sequence whose gap entries are ``None``.
        offset: non-negative index in ``seq`` at which the walk starts.

    Returns:
        The number of slots to add to ``offset``. If ``seq`` ends before the
        move is complete, the step accumulated so far is returned (instead of
        ``None``); ``offset`` plus that value is then at or past the end of
        ``seq``, so a following ``event_indexes`` call signals the end of the
        alignment.
    """
    move = to_move
    # Index-based walk avoids copying the tail of seq on every call.
    for pos in range(offset, len(seq)):
        if seq[pos] is None:
            move += 1
        else:
            to_move -= 1
            if to_move <= 0:
                return move
    # Sequence exhausted: still return an integer so callers can add it to their offset.
    return move

In [33]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Parameters:
        index: iterable of indexes into ``pairs``.
        pairs: sequence mapping each alignment index to a position in ``seq``,
            or to ``None`` for a gap.
        seq: the nucleotide sequence to read characters from.

    Returns:
        The concatenated characters of ``seq`` at the mapped positions. Gaps
        (``None`` positions) contribute the empty string, as documented,
        instead of raising a ``TypeError``.
    """
    seq_index = [pairs[x] for x in index]
    return "".join("" if pos is None else seq[pos] for pos in seq_index)


In [34]:
def is_consecutive_seq(seq):
    """Tell whether 'seq' is a run of consecutive, ascending numbers.

    In such a run every value exceeds its position by the same amount, so the
    set of (value - position) differences has at most one member. Empty and
    single-element sequences count as consecutive.
    """
    shifts = {value - position for position, value in enumerate(seq)}
    return len(shifts) <= 1

In [35]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer agrees completely with the reference.

    That is the case when:
        * the read positions are consecutive (no indels),
        * the reference positions are consecutive (no indels),
        * the nucleotides are identical (no substitutions).

    ``ev_index`` holds NMER indexes into the pairing arrays; ``pairs[0]`` and
    ``pairs[1]`` give the read and reference position for each index.
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][pos] for pos in ev_index]
    ref_positions = [pairs[1][pos] for pos in ev_index]

    # A gap (None) or a jump in the positions means an indel: first checked
    # on the read side, then on the reference side.
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # Equal nucleotides -> full match; anything else is a substitution.
    read_bases = [read[pos] for pos in read_positions]
    ref_bases = [ref[pos] for pos in ref_positions]
    return read_bases == ref_bases
    

In [36]:
# Count, over all aligned reads, how many events carry a k-mer that matches the
# reference exactly (true_events) versus how many events there are (total_events).
true_events = 0
total_events = 0

for i, read in enumerate(reads):
    # get_file_and_channel is not defined in this notebook (presumably from ../lib.ipynb).
    file_id, channel_id = get_file_and_channel(read.query_name)
    print(i, file_id, channel_id)
    # Transpose the aligned pairs: pairs[0] is used below as the read positions,
    # pairs[1] as the reference positions; gaps are None.
    pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    # Current offset into the pairing arrays; advanced by every event's move.
    i_seq = 0
    assert(pairs[0][0] == 0), "alignment is not null-indexed."
       
    ## map read to events
    # NOTE(review): the full event count is added up front, so events skipped by
    # the break below still count in the denominator -- confirm this is intended.
    total_events += len(events)
    for ev in events:
        ev_kmer = ev["kmer"]
        # Advance by the event's move, plus one slot for every gap passed on the way.
        i_seq += gapmove(ev["move"], pairs[0], i_seq)
        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            """not the whole read is aligned"""
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], file_obj["called_seq"])       
        # Sanity check: the k-mer rebuilt from the alignment must equal the event's own k-mer.
        assert(read_kmer == ev_kmer), (ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, file_obj["called_seq"], ref):
            true_events += 1
        


0 24 106
1 16 107
2 1 132
3 6 132
4 6 154
5 5 165
6 15 179
7 20 180
8 9 180
9 18 182
10 10 214
11 7 23
12 1 247
13 6 254
14 1 26
15 17 27
16 27 280
17 1 282
18 13 283
19 22 283
20 10 296
21 5 297
22 10 29
23 20 29
24 1 300
25 7 304
26 4 30
27 17 31
28 0 320
29 0 323
30 13 323
31 24 333
32 4 333
33 8 33
34 51 33
35 13 34
36 14 353
37 15 355
38 23 355
39 3 356
40 2 358
41 18 35
42 5 36
43 23 378
44 16 396
45 19 413
46 3 422
47 23 424
48 2 435
49 12 441
50 14 442
51 17 443
52 10 446
53 19 452
54 9 453
55 1 454
56 13 458
57 18 458
58 11 463
59 2 466
60 6 466
61 13 467
62 5 467
63 14 468
64 3 469
65 26 469
66 7 474
67 2 479
68 10 483
69 10 496
70 21 59
71 5 5
72 11 67
73 18 87
74 15 96
75 11 98
76 35 9
77 0 105
78 1 106
79 0 116
80 0 23
81 2 452
82 2 490
83 0 63
84 1 8
85 12 112
86 1 123
87 12 186
88 20 211
89 7 27
90 7 281
91 12 287
92 9 28
93 5 300
94 0 306
95 5 34
96 13 353
97 0 376
98 12 393
99 7 400
100 1 426
101 3 443
102 2 447
103 4 467
104 2 481
105 6 493
106 17 495
107 5 497
108 6 

In [37]:
# Fraction of all counted events whose k-mer matches the reference exactly.
print(true_events/total_events)

0.060403470052776326


In [38]:
# Absolute number of events whose k-mer matches the reference exactly.
true_events

2166596