In [1]:
%load_ext autoreload
%autoreload 2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
import rpy2
from pprint import pprint
import re
import numpy as np
import pickle
from collections import OrderedDict
from itertools import repeat, count, product
from skbio.alignment import StripedSmithWaterman, local_pairwise_align_ssw
import sys
sys.path.append("../../..")
from analysis_tools import KmerAligner
%run ../lib.ipynb
import pysam

In [2]:
# Load the pre-extracted per-read records. Each record is used below as a
# dict with the keys "channel", "file_id", "called_seq", "called_qual" and
# "events".
# NOTE: unpickling executes arbitrary code embedded in the file -- only load
# pickles that were produced by a trusted pipeline.
with open("mouse_file_data.pickle", 'rb') as pickle_file:
    # The context manager closes the handle; the original left it open.
    file_data = pickle.load(pickle_file)

In [3]:
ref = !cat mouse_ref.fa | grep -v ">"
ref = ref[0]

In [4]:
# Export every basecalled read as one four-line FASTQ record
# (header, sequence, "+" separator, quality string). The read name encodes
# channel and file id as "ch<channel>_file<file_id>_template".
with open("mouse_all_reads.fastq", 'w') as f:
    for file_obj in file_data:
        f.writelines([
            "@ch{0}_file{1}_template\n".format(file_obj["channel"], file_obj["file_id"]),
            file_obj["called_seq"] + "\n",
            "+\n",
            file_obj["called_qual"] + "\n",
        ])


In [5]:
# Build the BWA index for the reference FASTA; `bwa mem` below aligns against it.
!bwa index mouse_ref.fa

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.5a-r405
[main] CMD: bwa index mouse_ref.fa
[main] Real time: 0.048 sec; CPU: 0.013 sec


In [6]:
# Align all exported reads against the reference with `bwa mem` (-t 3) and
# redirect the SAM output to mouse_all_reads.sam.
!bwa mem -t 3 mouse_ref.fa mouse_all_reads.fastq > mouse_all_reads.sam

[M::main_mem] read 73 sequences (407165 bp)...
[main] Version: 0.7.5a-r405
[main] CMD: bwa mem -t 3 mouse_ref.fa mouse_all_reads.fastq
[main] Real time: 0.091 sec; CPU: 0.204 sec


In [7]:
# Convert the SAM alignments to BAM (-S: input is SAM, -b: output BAM).
!samtools view -S -b mouse_all_reads.sam > mouse_all_reads.bam

[samopen] SAM header is present: 1 sequences.


In [8]:
# Sort the BAM. The second argument is an output prefix: the next cell indexes
# the resulting mouse_all_reads.sorted.bam.
# NOTE(review): this positional-prefix form looks like legacy samtools syntax;
# confirm it against the installed samtools version.
!samtools sort mouse_all_reads.bam mouse_all_reads.sorted

In [9]:
# Index the sorted BAM so that it can be opened and fetched from with pysam below.
!samtools index mouse_all_reads.sorted.bam

In [10]:
# Open the sorted, indexed BAM produced by the samtools cells above.
samfile = pysam.AlignmentFile("mouse_all_reads.sorted.bam")
# Starts empty; it is not populated by any cell visible here.
samreads = {}

In [11]:
# Sanity check: number of records loaded from the pickle.
print(len(file_data))

73


In [12]:
# Instantiate the project's KmerAligner with an empty query string.
# NOTE(review): `kmer_aligner` is not referenced by any cell visible here --
# confirm it is still needed.
kmer_aligner = KmerAligner("")

Query Length 0


In [13]:
# Lookup table from "ch<channel>_file<file_id>" to the corresponding record of
# file_data. If two records share a key, the later one wins.
fmap = {}
for f in file_data: 
    fmap["ch{0}_file{1}".format(f["channel"], f["file_id"])] = f

In [14]:
def get_file(channel, file_id):
    """Return the file_data record stored in `fmap` for a channel / file id.

    The lookup key has the form "ch<channel>_file<file_id>"; a KeyError is
    raised when no record was registered under that key.
    """
    lookup_key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[lookup_key]

In [15]:
# Materialise every alignment record yielded by samfile.fetch() so that the
# records can be counted and iterated repeatedly.
reads = list(samfile.fetch())
len(reads)

104

In [16]:
# k-mer length: number of aligned read positions that make up one event window
# (read by event_indexes and is_correct_kmer).
NMER = 5

In [17]:
class AlignmentEndException(Exception):
    """Raised when fewer than the requested number of non-gap entries remain."""


def event_indexes(pairing_seq, offset, k=None):
    """Return the indexes of the next `k` non-gap entries of `pairing_seq`.

    Scanning starts at index `offset`; entries equal to None are gaps and are
    skipped.

    Parameters
    ----------
    pairing_seq : sequence
        One side of an aligned-pairs listing; None marks a gap.
    offset : int
        Index into `pairing_seq` at which to start scanning.
    k : int, optional
        Number of non-gap entries to collect. Defaults to the module-level
        NMER, which keeps existing two-argument calls unchanged.

    Returns
    -------
    list of int
        Indexes into `pairing_seq` (not the values stored there), in
        ascending order, of exactly `k` non-gap entries.

    Raises
    ------
    AlignmentEndException
        If fewer than `k` non-gap entries exist at or after `offset`.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # The list length doubles as the counter; this also avoids shadowing
        # the `count` name imported from itertools.
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [18]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in bases into a step in the gapped sequence.

    `to_move` is the event's 'move' value (from metrichor, per the original
    note): the number of non-gap entries to advance. The returned step also
    counts every gap (None) passed on the way, so that `offset + step` is the
    new position inside `seq`.

    Parameters
    ----------
    to_move : int
        Number of non-gap entries to advance. With 0, only gaps directly at
        `offset` are skipped.
    seq : sequence
        Gapped sequence; None marks a gap.
    offset : int
        Current index into `seq`.

    Returns
    -------
    int
        Number of entries of `seq` to step over. If `seq` ends before
        `to_move` non-gap entries were seen, the number of remaining entries
        is returned, so `offset + step` lands on the end of `seq`. The
        original fell through and returned None there, which made the
        caller's `i_seq += gapmove(...)` fail with a TypeError.
    """
    remaining = seq[offset:]
    step = to_move
    for entry in remaining:
        if entry is None:
            # A gap does not consume a base but still occupies one slot.
            step += 1
        else:
            to_move -= 1
            if to_move <= 0:
                return step
    # Sequence exhausted: step to the end so the caller can detect it.
    return len(remaining)

In [19]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment indexes into the corresponding nucleotides.

    Parameters
    ----------
    index : iterable of int
        Indexes into `pairs`.
    pairs : sequence
        Maps each alignment index to a position in `seq`, or None for a gap.
    seq : str
        Sequence from which the nucleotides are taken.

    Returns
    -------
    str
        The concatenated nucleotides. Gaps contribute '' as the original
        docstring promised; previously a gap raised a TypeError instead.
    """
    seq_index = [pairs[x] for x in index]
    return "".join("" if pos is None else seq[pos] for pos in seq_index)


In [20]:
def is_consecutive_seq(seq):
    """Tell whether `seq` is a run of consecutive, ascending numbers.

    In such a run every element differs from its list position by the same
    amount, so at most one distinct (value - position) offset may occur.
    Empty and single-element sequences count as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [21]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer agrees completely with the reference.

    Parameters
    ----------
    ev_index : list of int
        Alignment indexes of the k-mer; must contain exactly NMER entries.
    pairs : sequence
        pairs[0] holds the read position and pairs[1] the reference position
        for every alignment index; None marks a gap.
    read, ref : sequence
        Read and reference sequence, indexed by those positions.

    Returns
    -------
    bool
        True only if the read positions are consecutive (no indel), the
        reference positions are consecutive (no indel) and the nucleotides
        are identical (no substitution).
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][slot] for slot in ev_index]
    ref_positions = [pairs[1][slot] for slot in ev_index]

    # A gap or a jump on either side means an indel (read side checked first).
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # Without indels, the k-mer is correct only when there is no substitution.
    read_bases = [read[pos] for pos in read_positions]
    ref_bases = [ref[pos] for pos in ref_positions]
    return read_bases == ref_bases
    

In [27]:
# Count, over all alignment records, how many events carry a k-mer that
# matches the reference exactly (true_events) out of all events (total_events).
true_events = 0
total_events = 0

for i, read in enumerate(reads):
    # Recover file and channel id from the read name (helper comes from lib.ipynb).
    file_id, channel_id = get_file_and_channel(read.query_name)
    print(i, file_id, channel_id)
    # Transpose the aligned pairs: pairs[0] = read positions, pairs[1] = reference
    # positions; None marks a gap on that side.
    pairs = [list(t) for t in zip(*read.get_aligned_pairs())]
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    # Current index into pairs[0] (alignment coordinates, gaps included).
    i_seq = 0
    assert(pairs[0][0] == 0), "alignment is not null-indexed."
       
    ## map read to events
    # All events of the file count towards the total, including those that are
    # never evaluated because the loop below stops at the end of the alignment.
    # NOTE(review): `reads` holds more alignment records (104) than there are
    # input files (73), so a file's events are counted once per record.
    total_events += len(events)
    for ev in events:
        ev_kmer = ev["kmer"]
        # Advance by the event's move, also stepping over gaps in the read track.
        i_seq += gapmove(ev["move"], pairs[0], i_seq)
        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            """not the whole read is aligned"""
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], file_obj["called_seq"])       
        # Consistency check: the k-mer rebuilt from the alignment must equal the
        # k-mer stored with the event.
        assert(read_kmer == ev_kmer), (ev, read_kmer, ev_index)
        # NOTE(review): called_seq is indexed with the alignment's query positions;
        # confirm this also holds for reverse-strand alignments.
        if is_correct_kmer(ev_index, pairs, file_obj["called_seq"], ref):
            true_events += 1
        


0 5 227
1 1 157
2 15 215
3 13 141
4 16 215
5 24 212
6 5 204
7 19 135
8 12 206
9 12 206
10 17 132
11 15 215
12 10 132
13 9 215
14 9 215
15 3 211
16 11 156
17 3 156
18 15 215
19 3 156
20 9 215
21 17 132
22 5 204
23 10 156
24 12 206
25 6 156
26 3 195
27 3 211
28 12 211
29 1 157
30 10 132
31 11 156
32 11 133
33 17 132
34 1 157
35 3 156
36 3 156
37 12 206
38 9 215
39 11 156
40 33 135
41 1 157
42 20 211
43 10 132
44 10 132
45 3 156
46 15 215
47 3 211
48 1 157
49 3 195
50 3 156
51 11 133
52 1 157
53 10 156
54 1 157
55 11 133
56 20 206
57 14 209
58 10 156
59 9 215
60 15 215
61 1 157
62 3 156
63 17 132
64 15 215
65 3 211
66 9 215
67 20 206
68 10 132
69 4 204
70 9 206
71 9 206
72 21 223
73 23 215
74 23 215
75 4 204
76 44 132
77 23 215
78 23 215
79 2 133
80 2 133
81 4 204
82 2 133
83 23 132
84 8 204
85 4 204
86 3 132
87 8 204
88 16 215
89 4 206
90 9 215
91 12 206
92 1 157
93 19 135
94 12 215
95 1 157
96 3 211
97 25 215
98 20 211
99 12 206
100 16 215
101 20 211
102 23 135
103 1 157


In [28]:
# Fraction of all counted events whose k-mer matches the reference exactly.
print(true_events/total_events)

0.0020408672360514753
