In [1]:
%load_ext autoreload
%autoreload 2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
import rpy2
from pprint import pprint
import re
import numpy as np
import pickle
from collections import OrderedDict
from itertools import repeat, count, product
from skbio.alignment import StripedSmithWaterman, local_pairwise_align_ssw
import sys
sys.path.append("../../..")
from analysis_tools import KmerAligner
%run ../lib.ipynb
import pysam

In [2]:
# Load the per-read records. Later cells read the keys "channel", "file_id",
# "called_seq", "called_qual" and "events" from each record.
# NOTE: pickle.load can execute arbitrary code -- only open pickles you trust.
# The context manager closes the file handle, which the bare open() never did.
with open("file_data_phage.pickle", 'rb') as pickle_file:
    file_data = pickle.load(pickle_file)

In [3]:
# Read the reference as one string. Every line containing ">" (the FASTA
# header) is dropped, exactly like `grep -v ">"`, and the remaining lines are
# concatenated. Taking only the first sequence line would truncate the
# reference whenever the FASTA is line-wrapped; joining all lines gives the
# same result for a single-line FASTA.
with open("lambda_ref.fasta") as fasta_file:
    ref = "".join(line.strip() for line in fasta_file if ">" not in line)

In [5]:
# Export every record as a four-line FASTQ entry (name, sequence, separator,
# qualities). The read name encodes channel and file id so that an alignment
# can be traced back to its record later on.
with open("lambda_all_reads.fastq", 'w') as f:
    for file_obj in file_data:
        f.writelines([
            "@ch{0}_file{1}_template\n".format(file_obj["channel"], file_obj["file_id"]),
            file_obj["called_seq"] + "\n",
            "+\n",
            file_obj["called_qual"] + "\n",
        ])


In [6]:
# Build the BWA index for the reference FASTA; the `bwa mem` call below uses it.
!bwa index lambda_ref.fasta 

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.02 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.5a-r405
[main] CMD: bwa index lambda_ref.fasta
[main] Real time: 0.090 sec; CPU: 0.027 sec


In [7]:
# Align all exported reads against the lambda reference using 3 threads and
# store the alignments as SAM.
# NOTE(review): the output is called "mouse_all_reads" although reference and
# reads are lambda phage; the later cells rely on this exact file name.
!bwa mem -t 3 lambda_ref.fasta  lambda_all_reads.fastq > mouse_all_reads.sam

[M::main_mem] read 5302 sequences (30011153 bp)...
[M::main_mem] read 821 sequences (3836491 bp)...
[main] Version: 0.7.5a-r405
[main] CMD: bwa mem -t 3 lambda_ref.fasta lambda_all_reads.fastq
[main] Real time: 6.015 sec; CPU: 15.480 sec


In [8]:
# Convert the SAM text file (-S: input is SAM) into binary BAM (-b).
!samtools view -S -b mouse_all_reads.sam > mouse_all_reads.bam

[samopen] SAM header is present: 1 sequences.


In [9]:
# Sort the BAM; the second argument is an output prefix, the indexing step
# afterwards reads the result as mouse_all_reads.sorted.bam.
!samtools sort mouse_all_reads.bam mouse_all_reads.sorted

In [10]:
# Index the sorted BAM so that it can be opened and fetched from with pysam.
!samtools index mouse_all_reads.sorted.bam

In [11]:
# Open the sorted, indexed BAM produced by the samtools steps above.
samfile = pysam.AlignmentFile("mouse_all_reads.sorted.bam")
# NOTE(review): samreads is not referenced by any cell visible here -- confirm
# whether it is still needed.
samreads = {}

In [12]:
# Sanity check: number of read records in file_data.
print(len(file_data))

6123


In [13]:
# Instantiate the project's KmerAligner with an empty query string (the
# captured output shows it reporting "Query Length 0").
# NOTE(review): kmer_aligner is not used by any cell visible here -- confirm
# whether this cell is still required.
kmer_aligner = KmerAligner("")

Query Length 0


In [14]:
# Lookup table from "ch<channel>_file<file_id>" to the matching read record.
# Records sharing channel and file id collapse onto one key; the last one wins.
fmap = {
    "ch{0}_file{1}".format(record["channel"], record["file_id"]): record
    for record in file_data
}

In [15]:
def get_file(channel, file_id):
    """Return the read record stored in `fmap` for a channel / file id pair.

    Raises KeyError when no record was registered under that combination.
    """
    lookup_key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[lookup_key]

In [16]:
# Materialise all alignments from the BAM so they can be iterated repeatedly,
# then show how many there are.
reads = list(samfile.fetch())
len(reads)

4916

In [17]:
# k-mer length: number of non-gap alignment columns that make up one event
# (read by event_indexes and is_correct_kmer).
NMER = 5

In [18]:
class AlignmentEndException(Exception):
    """Signals that fewer than NMER non-gap columns remain in the alignment."""

def event_indexes(pairing_seq, offset):
    """Collect the next NMER non-gap columns of an alignment.

    Scans `pairing_seq` from column `offset` onwards and gathers the column
    numbers whose entry is not None (None marks a gap).

    Returns the list of NMER column numbers.
    Raises AlignmentEndException if the sequence ends before NMER non-gap
    columns were found.
    """
    columns = []
    cursor = offset
    end = len(pairing_seq)
    # stop as soon as the k-mer is complete or the alignment is exhausted
    while cursor < end and len(columns) < NMER:
        if pairing_seq[cursor] is not None:
            columns.append(cursor)
        cursor += 1
    if len(columns) != NMER:
        raise AlignmentEndException
    return columns
    
    

In [19]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in bases into a move counted in alignment columns.

    Starting at column `offset` of `seq`, walk forward until `to_move` non-gap
    entries have been passed; every gap (None) met on the way lengthens the
    move by one column.

    Args:
        to_move: number of bases to advance ('move' from metrichor).
        seq: aligned sequence; None entries are gaps.
        offset: column to start from (interpreted like a slice start).

    Returns:
        The number of columns to add to `offset`. If `seq` ends before
        `to_move` bases were passed, the number of columns left from `offset`
        is returned, so the new offset points at the end of the alignment
        (previously the function fell through and returned None, which made
        `offset += gapmove(...)` fail with a TypeError).
    """
    # Slicing a range applies list-slice rules to `offset` without copying
    # the (potentially long) alignment on every call.
    window = range(len(seq))[offset:]
    distance = to_move
    remaining = to_move
    for position in window:
        if seq[position] is None:
            distance += 1
        else:
            remaining -= 1
            if remaining <= 0:
                return distance
    return len(window)

In [20]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment columns into the nucleotides they refer to.

    Args:
        index: alignment columns to translate.
        pairs: maps an alignment column to a position in `seq`, or to None
            when the column is a gap.
        seq: nucleotide sequence the positions point into.

    Returns:
        The nucleotides joined into one string. Gap columns contribute ''
        (as the contract always stated; indexing `seq` with None used to
        raise a TypeError instead).
    """
    seq_index = [pairs[column] for column in index]
    return "".join(seq[position] for position in seq_index if position is not None)


In [21]:
def is_consecutive_seq(seq):
    """Return True if `seq` counts upwards in steps of one.

    A run of consecutive numbers keeps a constant distance between each value
    and its position, so at most one distinct distance may occur. Empty and
    single-element sequences therefore count as consecutive.
    """
    distances = [value - position for position, value in enumerate(seq)]
    return len(set(distances)) < 2

In [22]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer corresponds completely with the reference.

    This is the case if:
        * the read positions are consecutive (no indels)
        * the ref positions are consecutive (no indels)
        * the nucleotides are identical (no substitutions)

    Args:
        ev_index: the NMER alignment columns forming the k-mer.
        pairs: pairs[0] holds the read position and pairs[1] the reference
            position of every alignment column (None for a gap).
        read: read sequence, indexed by read position.
        ref: reference sequence, indexed by reference position.
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][column] for column in ev_index]
    ref_positions = [pairs[1][column] for column in ev_index]

    # a gap or a jump on either side means an indel (read first, then ref)
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # no indel left: the k-mer is correct unless there is a substitution
    read_bases = [read[position] for position in read_positions]
    ref_bases = [ref[position] for position in ref_positions]
    return read_bases == ref_bases
    

In [23]:
# Count, over all alignments, how many event k-mers agree exactly with the
# reference (true_events) and how many events there are in total.
true_events = 0
total_events = 0

for i, read in enumerate(reads):
    # get_file_and_channel is not defined in the visible cells -- presumably it
    # comes from ../lib.ipynb and parses the read name; verify there.
    file_id, channel_id = get_file_and_channel(read.query_name)
    # progress output: one line per alignment
    print(i, file_id, channel_id)
    # transpose the aligned pairs into two parallel lists; is_correct_kmer
    # treats pairs[0] as read positions and pairs[1] as reference positions
    pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    # current alignment column of the event walk
    i_seq = 0
    # the walk below starts at column 0 and indexes called_seq with the read
    # positions, so the first column has to be read position 0
    assert(pairs[0][0] == 0), "alignment is not null-indexed."
       
    ## map read to events
    # all events of the read are counted, including those that are never
    # reached when the inner loop breaks at the end of the alignment
    total_events += len(events)
    for ev in events:
        ev_kmer = ev["kmer"]
        # advance by the event's move, widened by the gaps that are crossed;
        # this call sits outside the try block, so only event_indexes can end
        # the walk early through AlignmentEndException
        i_seq += gapmove(ev["move"], pairs[0], i_seq)
        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            """not the whole read is aligned"""
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], file_obj["called_seq"])       
        # consistency check: the walk must reproduce the k-mer stored in the event
        assert(read_kmer == ev_kmer), (ev, read_kmer, ev_index)
        # NOTE(review): the strand of the alignment is not consulted anywhere
        # in this loop -- confirm that comparing called_seq directly against
        # ref is valid for reverse-strand alignments.
        if is_correct_kmer(ev_index, pairs, file_obj["called_seq"], ref):
            true_events += 1
        


0 1 300
1 6 466
2 24 333
3 10 483
4 20 180
5 6 204
6 7 511
7 1 106
8 2 479
9 16 431
10 1 252
11 26 266
12 11 423
13 15 96
14 10 9
15 4 30
16 4 395
17 10 195
18 9 180
19 0 237
20 1 454
21 1 438
22 14 442
23 7 85
24 2 447
25 9 147
26 12 280
27 0 58
28 12 186
29 4 284
30 2 490
31 0 429
32 1 58
33 5 53
34 10 112
35 12 44
36 1 70
37 7 400
38 2 244
39 13 107
40 6 56
41 21 122
42 14 439
43 7 297
44 15 59
45 14 351
46 13 107
47 0 429
48 10 375
49 11 451
50 9 47
51 7 364
52 1 19
53 10 470
54 14 386
55 14 252
56 5 377
57 1 488
58 21 122
59 16 463
60 6 79
61 1 70
62 26 128
63 13 283
64 26 254
65 0 59
66 9 154
67 7 41
68 5 251
69 1 300
70 4 360
71 0 320
72 14 351
73 14 439
74 4 377
75 8 226
76 0 105
77 14 252
78 7 297
79 24 92
80 15 355
81 28 20
82 2 276
83 11 451
84 23 355
85 14 413
86 12 375
87 2 444
88 23 461
89 1 488
90 38 9
91 5 135
92 1 113
93 15 96
94 17 105
95 26 13
96 2 479
97 2 358
98 9 85
99 30 439
100 6 439
101 1 454
102 8 462
103 26 211
104 4 91
105 4 211
106 0 211
107 4 138
108 24 41

In [24]:
# Fraction of all counted events whose k-mer matches the reference exactly.
# Raises ZeroDivisionError if no events were counted.
print(true_events/total_events)

0.0010192871613214903


In [26]:
# Absolute number of events whose k-mer matched the reference exactly.
true_events

36288