In [1]:
%load_ext autoreload
%autoreload 2
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri
import rpy2
from pprint import pprint
import re
import numpy as np
import pickle
from collections import OrderedDict
from itertools import repeat, count, product
from skbio.alignment import StripedSmithWaterman, local_pairwise_align_ssw
import sys
sys.path.append("../../..")
from analysis_tools import KmerAligner
%run ../lib.ipynb
import pysam

In [2]:
file_data = pickle.load(open("mouse_file_data.pickle", 'rb'))

In [3]:
with open("mouse_all_reads.fastq", 'w') as f: 
    for file_obj in file_data:
        f.write("@ch{0}_file{1}_template\n".format(file_obj["channel"], file_obj["file_id"]))
        f.write(file_obj["called_seq"] + "\n")
        f.write("+\n")
        f.write(file_obj["called_qual"] + "\n")


In [4]:
!bwa index mouse_ref.fa

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.01 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.5a-r405
[main] CMD: bwa index mouse_ref.fa
[main] Real time: 0.063 sec; CPU: 0.014 sec


In [5]:
!bwa mem -t 3 mouse_ref.fa mouse_all_reads.fastq > mouse_all_reads.sam

[M::main_mem] read 73 sequences (407165 bp)...
[main] Version: 0.7.5a-r405
[main] CMD: bwa mem -t 3 mouse_ref.fa mouse_all_reads.fastq
[main] Real time: 0.088 sec; CPU: 0.203 sec


In [6]:
!samtools view -S -b mouse_all_reads.sam > mouse_all_reads.bam

[samopen] SAM header is present: 1 sequences.


In [7]:
!samtools sort mouse_all_reads.bam mouse_all_reads.sorted

In [8]:
!samtools index mouse_all_reads.sorted.bam

In [9]:
samfile = pysam.AlignmentFile("mouse_all_reads.sorted.bam")
samreads = {}

In [10]:
print(len(file_data))

73


In [11]:
kmer_aligner = KmerAligner("")

Query Length 0


In [12]:
fmap = {}
for f in file_data: 
    fmap["ch{0}_file{1}".format(f["channel"], f["file_id"])] = f

In [13]:
def get_file(channel, file_id):
    return fmap["ch{0}_file{1}".format(channel, file_id)]

In [14]:
reads  = [x for x in samfile.fetch()]
len(reads)

104

In [15]:
NMER = 5

In [47]:
def next_gapped_kmer(pairing_seq):
    """ get the next entries from the pairing array
    such that k non-gap characters are contained"""
    count = 0
    kmer = []
    for i in pairing_seq: 
        if count == NMER: break
        if i is not None: count += 1
        kmer.append(i)
    assert(count == NMER), "sequence out of bounds"
    return kmer
    
    

In [48]:
def gapmove(move, seq):
    """move by 'move' (from metrichor) in the aligned sequence. 
    additionally increase index to compensate for each gap
    """
    for i in seq: 
        if i is None: 
            move += 1
        else:
            return move

In [49]:
def get_nt_kmer(id_kmer, seq): 
    """convert sequence indexes into the corresponding nucleotides. 
    gaps are converted into '' 
    """
    return list(map(lambda x: "" if x is None else seq[x], id_kmer))

In [50]:

for read in reads[0:]:
    print(file_id, channel_id)
    file_id, channel_id = get_file_and_channel(read.query_name)
    pairs = read.aligned_pairs
#     print(pairs)
    pairs2 = [list(t) for t in zip(*pairs)]
    file_obj = get_file(channel_id, file_id)
    events = file_obj["events"]
    i_seq = 0
    for ev in events:
        ev_kmer = ev["kmer"]
        i_seq += gapmove(ev["move"], pairs2[0][i_seq:])
        read_kmer_ids = next_gapped_kmer(pairs2[0][i_seq:])
        read_kmer = get_nt_kmer(read_kmer_ids, file_obj["called_seq"])
        print(ev_kmer, read_kmer_ids, read_kmer)
        assert("".join(read_kmer) == ev_kmer), (ev, read_kmer, read_kmer_ids)
        
    break


5 227
TAGCC [0, 1, 2, 3, 4] ['T', 'A', 'G', 'C', 'C']
AGCCT [1, 2, 3, 4, 5] ['A', 'G', 'C', 'C', 'T']
GCCTG [2, 3, 4, 5, 6] ['G', 'C', 'C', 'T', 'G']
CCTGT [3, 4, 5, 6, 7] ['C', 'C', 'T', 'G', 'T']
CTGTG [4, 5, 6, 7, 8] ['C', 'T', 'G', 'T', 'G']
TGTGG [5, 6, 7, 8, 9] ['T', 'G', 'T', 'G', 'G']
GTGGC [6, 7, 8, 9, 10] ['G', 'T', 'G', 'G', 'C']
TGGCG [7, 8, 9, 10, 11] ['T', 'G', 'G', 'C', 'G']
GGCGA [8, 9, 10, 11, 12] ['G', 'G', 'C', 'G', 'A']
GCGAA [9, 10, 11, 12, 13] ['G', 'C', 'G', 'A', 'A']
CGAAT [10, 11, 12, 13, 14] ['C', 'G', 'A', 'A', 'T']
GAATG [11, 12, 13, 14, 15] ['G', 'A', 'A', 'T', 'G']
AATGG [12, 13, 14, 15, 16] ['A', 'A', 'T', 'G', 'G']
ATGGG [13, 14, 15, 16, 17] ['A', 'T', 'G', 'G', 'G']
TGGGC [14, 15, 16, 17, 18] ['T', 'G', 'G', 'G', 'C']
GGGCT [15, 16, 17, 18, 19] ['G', 'G', 'G', 'C', 'T']
GGCTT [16, 17, 18, 19, 20] ['G', 'G', 'C', 'T', 'T']
GCTTT [17, 18, 19, 20, 21] ['G', 'C', 'T', 'T', 'T']
CTTTT [18, 19, 20, 21, 22] ['C', 'T', 'T', 'T', 'T']
TTTTG [19, 20, 21, 22, 23] 

In [59]:
print(len(samreads))

37


In [36]:
for file_obj in file_data: 
    file_obj["samread"] = samreads["{0}_{1}".format(file_obj["file_id"], file_obj["channel"])]

KeyError: '3_149'