In [1]:
%load_ext autoreload
%autoreload 2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
import rpy2
from pprint import pprint
import re
import numpy as np
import pickle
from collections import OrderedDict
from itertools import repeat, count, product
from skbio.alignment import StripedSmithWaterman, local_pairwise_align_ssw
import sys
sys.path.append("../../..")
%run "../lib.ipynb"
import pysam

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the per-file records (a sequence of dict-like objects; later cells read
# their "channel", "file_id", "fastq" and "events" entries).
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
with open("mouse_file_data.pickle", 'rb') as pickle_file:
    file_data = pickle.load(pickle_file)

In [3]:
ref = !cat mouse_ref.fa | grep -v ">"
ref = ref[0]

In [4]:
# Write all reads into one FASTQ file, replacing each record's original header
# line with a "@ch<channel>_file<file_id>_2d" name.
with open("mouse_all_reads.fastq", 'w') as f:
    for file_obj in file_data:
        f.write("@ch{0}_file{1}_2d".format(file_obj["channel"], file_obj["file_id"]) + "\n")
        # Everything after the original header line (sequence, '+', qualities).
        record_body = "\n".join(file_obj["fastq"].split("\n")[1:])
        f.write(record_body)
        # Make sure the next record's header starts on its own line even when
        # the stored FASTQ string has no trailing newline.
        if record_body and not record_body.endswith("\n"):
            f.write("\n")


In [5]:
# Align mouse_all_reads.fastq against mouse_ref.fa with GraphMap and write the
# alignments to mouse_all_reads.sam (-t4: the log below reports 4 threads).
!../../../../tools/graphmap/graphmap -t4 -r mouse_ref.fa -d mouse_all_reads.fastq -o mouse_all_reads.sam

[Index 13:50:00] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 13:50:00] Index already exists. Loading from file.
[Index 13:50:02] Secondary index already exists. Loading from file.
[Index 13:50:03] Index loaded in 0.89 sec.
[Index 13:50:03] Memory consumption: [currentRSS = 515 MB, peakRSS = 515 MB]

[Run 13:50:03] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 13:50:03] Reference genome is assumed to be linear.
[Run 13:50:03] Only one alignment will be reported per mapped read.
[ProcessReads 13:50:03] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 13:50:03] Batch of 51 reads (0 MiB) loaded in 0.01 sec. (11359128 bases)
[ProcessReads 13:50:03] Memory consumption: [currentRSS = 516 MB, peakRSS = 516 MB]
[ProcessReads 13:50:03] Using 4 threads.
[ProcessReads 13:50:04] [CPU time: 3.33 sec, RSS: 534 MB] Read: 51/51 (100.00%) [m: 51, u: 0]                     

In [6]:
# prepare_sam is not defined in this notebook (presumably it comes from ../lib.ipynb).
# NOTE(review): the next cell opens mouse_all_reads.sorted.bam, so this call
# presumably converts and sorts mouse_all_reads.sam -- confirm against the helper.
prepare_sam("mouse_all_reads")

[samopen] SAM header is present: 1 sequences.


In [7]:
# Open the sorted BAM with the aligned reads; it is iterated further down via samfile.fetch().
samfile = pysam.AlignmentFile("mouse_all_reads.sorted.bam")
# NOTE(review): samreads is never read in the cells visible here -- confirm whether it is still needed.
samreads = {}

In [8]:
# Number of per-file records loaded from the pickle.
print(len(file_data))

51


In [9]:
# Lookup table from "ch<channel>_file<file_id>" to the corresponding file record.
# A later record with the same key replaces an earlier one.
fmap = dict()
for f in file_data:
    fmap.update({"ch{0}_file{1}".format(f["channel"], f["file_id"]): f})

In [10]:
def get_file(channel, file_id):
    """Return the file record stored in `fmap` for the given channel and file id.

    Raises KeyError when no record was registered under that combination.
    """
    lookup_key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[lookup_key]

In [11]:
# Materialise every alignment yielded by samfile.fetch() so the reads can be
# iterated more than once, then show how many there are.
reads = list(samfile.fetch())
len(reads)

51

In [12]:
# k-mer length: event_indexes collects this many non-gap alignment columns per
# event, and is_correct_kmer asserts that it receives exactly this many.
NMER = 5

In [13]:
class AlignmentEndException(Exception):
    """Raised when fewer than the required number of non-gap alignment
    columns remain, i.e. the end of the alignment has been reached."""

def event_indexes(pairing_seq, offset, k=None):
    """Collect the positions of the next `k` non-gap entries of `pairing_seq`.

    Parameters:
        pairing_seq: one track of the aligned pairs; a gap is stored as None.
        offset: position in `pairing_seq` at which the scan starts.
        k: number of non-gap entries to collect; defaults to the
            notebook-wide NMER when omitted.

    Returns:
        A list with the `k` positions (indexes into `pairing_seq`, in
        ascending order) of the first `k` non-gap entries at or after `offset`.

    Raises:
        AlignmentEndException: fewer than `k` non-gap entries remain.
    """
    if k is None:
        k = NMER
    kmer = []
    for i in range(offset, len(pairing_seq)):
        # Stop as soon as the k-mer is complete; len(kmer) doubles as the counter.
        if len(kmer) == k:
            break
        if pairing_seq[i] is not None:
            kmer.append(i)
    if len(kmer) != k:
        raise AlignmentEndException
    return kmer
    
    

In [14]:
def gapmove(to_move, seq, offset):
    """Translate a move counted in non-gap entries into a step in `seq`.

    Starting at `offset`, the scan consumes `to_move` non-gap entries of
    `seq`; every gap (None) met before the move is used up enlarges the step
    by one.

    Parameters:
        to_move: number of non-gap entries to advance by (the event's 'move').
        seq: one track of the aligned pairs; a gap is stored as None.
        offset: current position in `seq`.

    Returns:
        The number of positions to add to `offset`. When `seq` ends before
        the move is used up, the step accumulated so far is returned instead
        of None; for an `offset` within `seq` this puts `offset + step` at or
        past the end of `seq`, so the caller's next lookup finds nothing left
        rather than failing on `offset += None`.
    """
    move = to_move
    remaining = to_move
    for entry in seq[offset:]:
        if entry is None:
            # Gaps do not count towards the move but still occupy a position.
            move += 1
            continue
        remaining -= 1
        if remaining <= 0:
            return move
    # Ran off the end of the alignment before the move was consumed.
    return move

In [15]:
def get_nt_kmer(index, pairs, seq):
    """Convert alignment positions into the corresponding nucleotides.

    Parameters:
        index: positions into `pairs`.
        pairs: one track of the aligned pairs, holding positions into `seq`
            (None marks a gap).
        seq: the nucleotide sequence that `pairs` refers to.

    Returns:
        The nucleotides as one string; a gap contributes '' (instead of
        failing on `seq[None]`).
    """
    seq_index = (pairs[x] for x in index)
    return "".join(seq[pos] if pos is not None else "" for pos in seq_index)


In [16]:
def is_consecutive_seq(seq):
    """Tell whether the numbers in `seq` increase by exactly one per step.

    That is the case when value minus position is the same for every
    element; an empty or single-element sequence counts as consecutive.
    """
    offsets = {value - position for position, value in enumerate(seq)}
    return len(offsets) <= 1

In [17]:
def is_correct_kmer(ev_index, pairs, read, ref):
    """Check whether a k-mer corresponds completely with the reference.

    Parameters:
        ev_index: NMER positions into the aligned pairs.
        pairs: two parallel tracks; pairs[0] holds read positions, pairs[1]
            holds reference positions (None marks a gap).
        read: the read sequence.
        ref: the reference sequence.

    Returns True only if
        * the read positions are consecutive (no indels),
        * the reference positions are consecutive (no indels),
        * the nucleotides are identical (no substitutions).
    """
    assert len(ev_index) == NMER, "invalid event index"
    read_positions = [pairs[0][column] for column in ev_index]
    ref_positions = [pairs[1][column] for column in ev_index]

    # A gap or a jump in either track means an indel: first in the read,
    # then in the reference.
    for positions in (read_positions, ref_positions):
        if None in positions or not is_consecutive_seq(positions):
            return False

    # No indels left, so the k-mer is correct exactly when no base differs.
    read_bases = [read[position] for position in read_positions]
    ref_bases = [ref[position] for position in ref_positions]
    return read_bases == ref_bases
    

In [23]:
# Walk through the events of every aligned read and count how many event
# k-mers match the reference exactly (true_events) out of all events of all
# reads (total_events).
true_events, total_events = 0, 0

for i, read in enumerate(reads):
    print(read.query_name)
    file_id, channel_id = get_file_and_channel(read.query_name)
    print(i, file_id, channel_id)
    # Transpose the aligned pairs into two tracks:
    # pairs[0] = read positions, pairs[1] = reference positions.
    pairs = list(map(list, zip(*read.get_aligned_pairs())))
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    i_seq = 0
    assert pairs[0][0] == 0, "alignment is not null-indexed."

    # Map the read to its events. Every event of the read is counted, also
    # those that lie behind the aligned part.
    event_dict = events.to_dict("records")
    total_events = total_events + len(event_dict)
    called_seq = file_obj["fastq"].split("\n")[1]

    for ev in event_dict:
        ev_kmer = ev["kmer"]
        # Advance in the alignment by the event's move, compensating for gaps.
        i_seq = i_seq + gapmove(ev["move"], pairs[0], i_seq)

        try:
            ev_index = event_indexes(pairs[0], i_seq)
        except AlignmentEndException:
            # Not the whole read is aligned.
            break
        read_kmer = get_nt_kmer(ev_index, pairs[0], called_seq)
        # The k-mer taken from the alignment must be the one the event reports.
        assert read_kmer == ev_kmer, (ev, read_kmer, ev_index)
        if is_correct_kmer(ev_index, pairs, called_seq, ref):
            true_events = true_events + 1
        


ch135_file0_2d
0 0 135
ch135_file19_2d
1 19 135
ch156_file6_2d
2 6 156
ch157_file1_2d
3 1 157
ch204_file5_2d
4 5 204
ch206_file10_2d
5 10 206
ch209_file14_2d
6 14 209
ch211_file3_2d
7 3 211
ch215_file15_2d
8 15 215
ch215_file27_2d
9 27 215
ch215_file9_2d
10 9 215
ch132_file38_2d
11 38 132
ch141_file8_2d
12 8 141
ch157_file0_2d
13 0 157
ch201_file1_2d
14 1 201
ch203_file1_2d
15 1 203
ch211_file28_2d
16 28 211
ch223_file11_2d
17 11 223
ch227_file5_2d
18 5 227
ch132_file35_2d
19 35 132
ch142_file18_2d
20 18 142
ch211_file20_2d
21 20 211
ch132_file26_2d
22 26 132
ch156_file11_2d
23 11 156
ch133_file11_2d
24 11 133
ch203_file2_2d
25 2 203
ch132_file10_2d
26 10 132
ch211_file12_2d
27 12 211
ch212_file27_2d
28 27 212
ch135_file23_2d
29 23 135
ch132_file17_2d
30 17 132
ch141_file12_2d
31 12 141
ch206_file20_2d
32 20 206
ch142_file3_2d
33 3 142
ch132_file36_2d
34 36 132
ch211_file17_2d
35 17 211
ch132_file44_2d
36 44 132
ch223_file6_2d
37 6 223
ch133_file2_2d
38 2 133
ch149_file3_2d
39 3 149
ch

In [24]:
# Fraction of events whose k-mer matches the reference exactly. total_events
# counts every event of every read, including events behind the aligned part.
print(true_events/total_events)

0.16406780330426327


In [25]:
# Absolute number of events whose k-mer matched the reference exactly.
true_events

66466

### Do the event k-mers correspond to the 2D read?

In [21]:
# Rebuild each called sequence from its event k-mers and verify that the
# result is identical to the sequence stored in the FASTQ record.
for i, read in enumerate(list(reads)):
    print(read.query_name)
    file_id, channel_id = get_file_and_channel(read.query_name)
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    called_seq = file_obj["fastq"].split("\n")[1]
    event_dict = events.to_dict("records")

    # The first event contributes its whole k-mer ...
    pieces = [event_dict[0]["kmer"]]
    for ev in event_dict[1:]:
        # ... every later event only the last 'move' nucleotides of its k-mer.
        # A move of 0 adds nothing and must be skipped, because [-0:] would
        # select the complete k-mer.
        if ev["move"] != 0:
            pieces.append(ev["kmer"][-ev["move"]:])
    seq = "".join(pieces)

    print(called_seq[:80])
    print(seq[:80])
    assert called_seq == seq
        

ch135_file0_2d
CGATTGGGGGCCAACCAGTGAACAGGATTTATTATCATTGGCCAAGCCTCGACGGAATGGATATTCTCAAATGCCTGTAA
CGATTGGGGGCCAACCAGTGAACAGGATTTATTATCATTGGCCAAGCCTCGACGGAATGGATATTCTCAAATGCCTGTAA
ch135_file19_2d
GGAGTTGGGCGCACGATGACGCGTAAGAGTAGAAGTCGTATTAGTCTCATGAGCAAGCCATCTCATGGCGTGCTAATCAT
GGAGTTGGGCGCACGATGACGCGTAAGAGTAGAAGTCGTATTAGTCTCATGAGCAAGCCATCTCATGGCGTGCTAATCAT
ch156_file6_2d
ATCCTACCCGAATTGGGGGCTCAACCCGTGAACACCCATTTATTATCATTGCCACTGAGCCTCCATCTCATACTTCTCAA
ATCCTACCCGAATTGGGGGCTCAACCCGTGAACACCCATTTATTATCATTGCCACTGAGCCTCCATCTCATACTTCTCAA
ch157_file1_2d
ACTTGCCTGACCTGAGACAAATGAAGATCTTCTTCTCAGATGCCGGAGGAGTAAACATCACAGCGCACGACGAGCTGACT
ACTTGCCTGACCTGAGACAAATGAAGATCTTCTTCTCAGATGCCGGAGGAGTAAACATCACAGCGCACGACGAGCTGACT
ch204_file5_2d
CAACAAGGGTGAATTGACTTAGTACTGCTAGAACATCCACGTTTATGCTTGGTCTCAACTAGCCTCCATCTCATACTTCC
CAACAAGGGTGAATTGACTTAGTACTGCTAGAACATCCACGTTTATGCTTGGTCTCAACTAGCCTCCATCTCATACTTCC
ch206_file10_2d
TCATCTTCTTAGCGTCGGCGGCACGGGAGTCACAGTGGATCGAGTGGGATTTCTTCTTATGGTCGTCATCTTCGGGCTTG
TCATCTTCTTAGCGTCG

_Yes, it's true!!!_