In [246]:
import ghmm
import pickle
import numpy as np
from itertools import product
from pprint import pprint
import pysam
import os
import pandas
import re
import editdistance
import sys
from multiprocessing import Pool
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"

In [247]:
NMERS = 3
NSTATES = 4**NMERS

In [248]:
args = {
    "events" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.template.pickle",
    "out_basename" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_calling",
    "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
    "hmm_params": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_hmm_params.pickle",
    "ncores": 4
}

In [249]:
HMM_PARAMS = pickle.load(open(args["hmm_params"], 'rb'))
HMM_PARAMS = sorted(zip(*[HMM_PARAMS["kmers"], HMM_PARAMS["means"], HMM_PARAMS["stdv"]]))

# Train Model 

In [250]:
def mk_transmat(nmers):
    """make a transition matrix assuming move=1"""
    all_kmers = [x[0] for x in HMM_PARAMS]
    n_components = len(all_kmers)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(all_kmers):
        for i, to_kmer in enumerate(all_kmers):
            p = 1/4. if from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)] else 0.
            transmat[j, i] = p          
            
    return transmat.tolist()

In [251]:
def mk_transmat0(nmers):
    """make a transition matrix assuming move=0 or move=1"""
    all_kmers = [x[0] for x in HMM_PARAMS]
    n_components = len(all_kmers)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(all_kmers):
        for i, to_kmer in enumerate(all_kmers):
            p = 0
            if from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                """move=1"""
                p = (9/10.) * (1/4.) 
            elif from_kmer == to_kmer:
                """move=0"""
                p = (1/10.) * 1
            transmat[j, i] = p          
            
    return transmat.tolist()

In [252]:
def mk_transmat2(nmers):
    """make a transition matrix assuming move=0 or move=1 or move=2"""
    all_kmers = [x[0] for x in HMM_PARAMS]
    n_components = len(all_kmers)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(all_kmers):
        for i, to_kmer in enumerate(all_kmers):
            p = 0
            if from_kmer[-(NMERS-2):] == to_kmer[:(NMERS-2)]:
                """move=2"""
                p = (3/10.) * (1/16.)
            elif from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                """move=1"""
                p = (7/10.) * (1/4.) 
            elif from_kmer == to_kmer:
                """move=0"""
                p = (0/10.) * 1
            transmat[j, i] = p          
            
    return transmat.tolist()

In [253]:
# example code for a continuous HMM with gaussian emissions


F = ghmm.Float()  # emission domain of this model

A = mk_transmat2(NMERS)
B = [[x[1], x[2]] for x in HMM_PARAMS]   # parameters of emission distributions in pairs of (mu, sigma)
pi = [1/float(NSTATES)] * NSTATES   # initial probabilities per state

# generate model from parameters
model = ghmm.HMMFromMatrices(F,ghmm.GaussianDistribution(F), A, B, pi)

In [254]:
s = str(model)
print(s)

GaussianEmissionHMM(N=64)
  state 0 (initial=0.02, mu=70.17, sigma=3.84)
    Transitions: ->0 (0.02), ->1 (0.02), ->2 (0.02), ->3 (0.02), ->4 (0.02), ->5 (0.02), ->6 (0.02), ->7 (0.02), ->8 (0.02), ->9 (0.02), ->10 (0.02), ->11 (0.02), ->12 (0.02), ->13 (0.02), ->14 (0.02), ->15 (0.02)
  state 1 (initial=0.02, mu=63.14, sigma=3.47)
    Transitions: ->4 (0.17), ->5 (0.17), ->6 (0.17), ->7 (0.17), ->16 (0.02), ->17 (0.02), ->18 (0.02), ->19 (0.02), ->20 (0.02), ->21 (0.02), ->22 (0.02), ->23 (0.02), ->24 (0.02), ->25 (0.02), ->26 (0.02), ->27 (0.02), ->28 (0.02), ->29 (0.02), ->30 (0.02), ->31 (0.02)

  ...

  state 62 (initial=0.02, mu=53.52, sigma=2.85)
    Transitions: ->32 (0.02), ->33 (0.02), ->34 (0.02), ->35 (0.02), ->36 (0.02), ->37 (0.02), ->38 (0.02), ->39 (0.02), ->40 (0.02), ->41 (0.02), ->42 (0.02), ->43 (0.02), ->44 (0.02), ->45 (0.02), ->46 (0.02), ->47 (0.02), ->56 (0.17), ->57 (0.17), ->58 (0.17), ->59 (0.17)
  state 63 (initial=0.02, mu=49.86, sigma=2.78)
    Transition

In [255]:
def result_to_seq(result):
    states = result[0]
    all_kmers = [x[0] for x in HMM_PARAMS]
    kmers = [all_kmers[x] for x in states]
    seq = [kmer[NMERS/2] for kmer in kmers]
    return "".join(seq)

In [256]:
def predict(means):
    seq = ghmm.EmissionSequence(F, means)
    result = model.viterbi(seq)
    return result_to_seq(result)

In [257]:
predict([44.2, 44.3, 56, 58.2, 56.2, 58.1, 58.2, 60, 30.2])

u'TTTGTGTGT'

# Validate Model 

In [258]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline


In [259]:
assert os.path.isfile(args["events"])

In [260]:
ref_file = args["ref"]
test = !cat {ref_file} | grep ">"
print(test)
ref = !cat {ref_file} | grep -v ">"
ref = ref[0]
print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [261]:
file_data = pickle.load(open(args["events"], 'rb'))
file_data = [f for f in file_data if f is not None]

In [262]:
prepare_filemap(file_data)

### Make Alignment

In [263]:
fastq_file = "{0}.fastq".format(args["out_basename"])
mk_fastq(fastq_file, file_data)

In [264]:
sam_file = "{0}.sam".format(args["out_basename"])
graphmap(ref_file, fastq_file, sam_file, args["ncores"])

[Index 14:48:41] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 14:48:41] Index already exists. Loading from file.
[Index 14:48:43] Secondary index already exists. Loading from file.
[Index 14:48:45] Index loaded in 0.57 sec.
[Index 14:48:45] Memory consumption: [currentRSS = 513 MB, peakRSS = 513 MB]

[Run 14:48:45] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 14:48:45] Reference genome is assumed to be linear.
[Run 14:48:45] Only one alignment will be reported per mapped read.
[ProcessReads 14:48:45] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 14:48:45] Batch of 81 reads (0 MiB) loaded in 0.00 sec. (7746680 bases)
[ProcessReads 14:48:45] Memory consumption: [currentRSS = 514 MB, peakRSS = 514 MB]
[ProcessReads 14:48:45] Using 4 threads.
[ProcessReads 14:48:46] [CPU time: 3.46 sec, RSS: 549 MB] Read: 81/81 (100.00%) [m: 80, u: 1]                      

In [265]:
bam_file = prepare_sam(args["out_basename"])

[samopen] SAM header is present: 1 sequences.


In [266]:
samfile = pysam.AlignmentFile(bam_file)
reads  = [x for x in samfile.fetch()]
len(reads)

80

In [267]:
def basecall_read(params):
    read_name, ref_pos = params
    file_id, channel_id = get_file_and_channel(read_name)
    print(file_id, channel_id, read_name)
    sys.stdout.flush()
    
    file_obj = get_file(channel_id, file_id)
      
    events = [x["mean"] for x in file_obj["events"].to_dict("records")]
    metrichor_seq = file_obj["fastq"].split("\n")[1]
    ref_seq = "".join([ref[x] for x in ref_pos])
    called_seq = predict(events)
    
    stats = {
        "d_metrichor": int(editdistance.eval(ref_seq, metrichor_seq)),
        "d_caller": int(editdistance.eval(ref_seq, called_seq)), 
        "length": len(ref_seq),
    }

    return (read_name, called_seq, stats)

In [268]:
p = Pool(args["ncores"])
## one can not access a read in parallel (deadlock for whatever reason)
## therfore prepare input parameters outside of map
input_params = [(read.query_name, read.get_reference_positions()) for read in reads]
try:
    results = p.map(basecall_read, input_params)
except KeyboardInterrupt:
    p.terminate()

(10, 132, 'ch132_file10_read')
(17, 132, 'ch132_file17_read')
(16, 215, 'ch215_file16_read')
(9, 215, 'ch215_file9_read')
(35, 132, 'ch132_file35_read')
(20, 211, 'ch211_file20_read')
(12, 206, 'ch206_file12_read')
(1, 203, 'ch203_file1_read')
(11, 133, 'ch133_file11_read')
(25, 215, 'ch215_file25_read')
(11, 156, 'ch156_file11_read')
(28, 211, 'ch211_file28_read')
(1, 157, 'ch157_file1_read')
(5, 227, 'ch227_file5_read')
(10, 156, 'ch156_file10_read')
(0, 135, 'ch135_file0_read')
(12, 211, 'ch211_file12_read')
(3, 211, 'ch211_file3_read')
(19, 135, 'ch135_file19_read')
(21, 212, 'ch212_file21_read')
(12, 141, 'ch141_file12_read')
(14, 215, 'ch215_file14_read')
(18, 142, 'ch142_file18_read')
(8, 141, 'ch141_file8_read')
(3, 195, 'ch195_file3_read')
(13, 141, 'ch141_file13_read')
(15, 215, 'ch215_file15_read')
(10, 215, 'ch215_file10_read')
(24, 212, 'ch212_file24_read')
(6, 156, 'ch156_file6_read')
(27, 212, 'ch212_file27_read')
(3, 156, 'ch156_file3_read')
(0, 157, 'ch157_file0_read')

### Stats

In [269]:
headers, seqs, stats = zip(*results)

In [270]:
stats = pandas.DataFrame(list(stats))

In [271]:
stats = stats.sum(0)

In [272]:
stats

d_caller       222110
d_metrichor    172867
length         387166
dtype: int64

In [273]:
print("Relative Performance: {0:5.3f}%".format(stats["d_metrichor"] * 100 /float(stats["d_caller"])))

Relative Performance: 77.829%


In [274]:
fasta_file_called = "{0}.called.fa".format(args["out_basename"])
with open(fasta_file_called, 'w') as f: 
    for header, seq in zip(headers, seqs): 
        f.write(">" + header + "\n")
        f.write(seq + "\n")

In [275]:
sam_file_called = "{0}.called.sam".format(args["out_basename"])
graphmap(ref_file, fasta_file_called, sam_file_called, args["ncores"])

[Index 14:49:17] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 14:49:17] Index already exists. Loading from file.
[Index 14:49:18] Secondary index already exists. Loading from file.
[Index 14:49:20] Index loaded in 0.55 sec.
[Index 14:49:20] Memory consumption: [currentRSS = 513 MB, peakRSS = 513 MB]

[Run 14:49:20] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 14:49:20] Reference genome is assumed to be linear.
[Run 14:49:20] Only one alignment will be reported per mapped read.
[ProcessReads 14:49:20] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 14:49:20] Batch of 80 reads (0 MiB) loaded in 0.00 sec. (13427848 bases)
[ProcessReads 14:49:20] Memory consumption: [currentRSS = 513 MB, peakRSS = 513 MB]
[ProcessReads 14:49:20] Using 4 threads.
[ProcessReads 14:49:22] [CPU time: 3.20 sec, RSS: 531 MB] Read: 80/80 (100.00%) [m: 67, u: 13]                    

In [276]:
prepare_sam("{0}.called".format(args["out_basename"]))

[samopen] SAM header is present: 1 sequences.


'/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_calling.called.sorted.bam'