In [23]:
import ghmm
import pickle
import numpy as np
from itertools import product
import pysam
import os
import pandas
import re
import editdistance
from skbio.alignment import StripedSmithWaterman

In [42]:
# Length (in bases) of the k-mers used throughout this notebook.
NMERS = 3
# Number of distinct k-mers of that length over a 4-letter alphabet;
# used below as the number of HMM states.
NSTATES = pow(4, NMERS)

In [43]:
# Load the pre-computed per-k-mer emission parameters.
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# pickles from a trusted source.
# The `with` block guarantees the file handle is closed again.
with open("/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/hmm_params.pickle", 'rb') as _params_file:
    _hmm_params_raw = pickle.load(_params_file)

# One (kmer, mean, stdv) tuple per entry, sorted (k-mer first) so that the
# position of an entry in this list can be used as its state index.
HMM_PARAMS = sorted(zip(_hmm_params_raw["kmers"], _hmm_params_raw["means"], _hmm_params_raw["stdv"]))

# Train Model 

In [44]:
def mk_transmat(nmers):
    """Make a transition matrix assuming single base steps.

    A transition from k-mer ``a`` to k-mer ``b`` is allowed when the last
    ``nmers - 1`` bases of ``a`` equal the first ``nmers - 1`` bases of ``b``,
    i.e. when ``b`` is ``a`` shifted by exactly one base. Every allowed
    transition gets probability 1/4; all others get 0.

    The k-mers (and thereby the state order) are taken from the module-level
    ``HMM_PARAMS`` list, whose entries start with the k-mer string.

    Parameters
    ----------
    nmers : int
        Length of the k-mers; must be at least 1.

    Returns
    -------
    list of list of float
        Square matrix where ``matrix[j][i]`` is the probability of moving
        from state ``j`` to state ``i``.

    Raises
    ------
    ValueError
        If ``nmers`` is smaller than 1.
    """
    if nmers < 1:
        raise ValueError("nmers must be >= 1, got {0!r}".format(nmers))

    all_kmers = [x[0] for x in HMM_PARAMS]
    n_components = len(all_kmers)
    # number of bases shared by two consecutive k-mers after a single-base step
    overlap = nmers - 1

    # start from zeros so that only the allowed transitions have to be written
    transmat = np.zeros((n_components, n_components))
    for j, from_kmer in enumerate(all_kmers):
        # slice from the front so that overlap == 0 yields '' (not the whole k-mer)
        suffix = from_kmer[len(from_kmer) - overlap:]
        for i, to_kmer in enumerate(all_kmers):
            if suffix == to_kmer[:overlap]:
                transmat[j, i] = 1 / 4.

    return transmat.tolist()

In [45]:
# Continuous HMM with one gaussian emission distribution per state.

# emission domain of this model
F = ghmm.Float()

# state-to-state transition probabilities
A = mk_transmat(NMERS)

# emission parameters: one [mu, sigma] pair per HMM_PARAMS entry
B = [list(entry[1:3]) for entry in HMM_PARAMS]

# uniform initial probability for every state
pi = [1.0 / NSTATES for _ in range(NSTATES)]

# assemble the model from the matrices defined above
model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

In [46]:
# Print the model's textual description to inspect its states, emission
# parameters and transitions.
s = str(model)
print(s)

GaussianEmissionHMM(N=64)
  state 0 (initial=0.02, mu=68.52, sigma=3.37)
    Transitions: ->0 (0.25), ->1 (0.25), ->2 (0.25), ->3 (0.25)
  state 1 (initial=0.02, mu=62.11, sigma=2.79)
    Transitions: ->4 (0.25), ->5 (0.25), ->6 (0.25), ->7 (0.25)

  ...

  state 62 (initial=0.02, mu=55.12, sigma=2.72)
    Transitions: ->56 (0.25), ->57 (0.25), ->58 (0.25), ->59 (0.25)
  state 63 (initial=0.02, mu=50.28, sigma=2.71)
    Transitions: ->60 (0.25), ->61 (0.25), ->62 (0.25), ->63 (0.25)



In [47]:
def result_to_seq(result):
    """Translate a decoding result into a base sequence.

    Each state index is mapped to its k-mer (via the order of the module-level
    ``HMM_PARAMS`` list) and the middle base of that k-mer is emitted, so the
    returned string has one character per decoded state.

    Parameters
    ----------
    result : sequence
        ``result[0]`` must be an iterable of state indices. ``predict`` passes
        the return value of ``model.viterbi`` here (presumably a
        ``(state_path, log_likelihood)`` pair -- confirm against ghmm).

    Returns
    -------
    str
        Concatenation of the middle bases of the decoded k-mers.
    """
    states = result[0]
    all_kmers = [x[0] for x in HMM_PARAMS]
    # floor division keeps the index an int under both Python 2 and Python 3
    # (plain `/` yields a float in Python 3, which cannot index a string)
    middle = NMERS // 2
    return "".join(all_kmers[state][middle] for state in states)

In [48]:
def predict(means):
    """Decode a list of event means into a base sequence.

    Wraps ``means`` in a ghmm emission sequence over the domain ``F``, runs
    ``model.viterbi`` on it and converts the result with ``result_to_seq``.
    """
    observations = ghmm.EmissionSequence(F, means)
    return result_to_seq(model.viterbi(observations))

In [49]:
# Smoke test: decode a short, hand-written list of event means.
predict([44.2, 44.3, 56, 58.2, 56.2, 58.1, 58.2, 60, 30.2])

u'TTTGTGTGT'

# Validate Model 

In [32]:
# Input files for the validation run, keyed by role.
args = dict(
    events="/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.pickle",
    alignment="/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment.sorted.bam",
    ref="/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
)

In [33]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline


In [34]:
# Fail early -- naming the offending path -- if any input file is missing.
# The reference FASTA is checked as well, since args["ref"] is read further
# down in the notebook.
for _name in ("events", "alignment", "ref"):
    assert os.path.isfile(args[_name]), "missing {0} file: {1}".format(_name, args[_name])

In [35]:
# Read the reference FASTA in pure Python instead of shelling out to cat/grep.
ref_file = args["ref"]
with open(ref_file) as _fasta:
    _fasta_lines = [line.rstrip("\r\n") for line in _fasta]

# header line(s): every line containing '>' (same selection as `grep ">"`)
test = [line for line in _fasta_lines if ">" in line]
print(test)

# Join ALL sequence lines, so a line-wrapped FASTA is not silently truncated to
# its first line. This assumes the file holds a single record; with several
# records their sequences would be concatenated.
ref = "".join(line for line in _fasta_lines if ">" not in line)
print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [36]:
# Load the pickled per-file event records.
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# pickles from a trusted source.
# The `with` block guarantees the file handle is closed again.
with open(args["events"], 'rb') as _events_file:
    file_data = pickle.load(_events_file)
# drop entries that are None
file_data = [f for f in file_data if f is not None]

In [37]:
# Index the loaded records by a "ch<channel>_file<file_id>" key for fast lookup.
# If two records share a key, the later one wins.
fmap = dict(
    ("ch{0}_file{1}".format(record["channel"], record["file_id"]), record)
    for record in file_data
)

In [38]:
def get_file(channel, file_id):
    """Return the record stored in ``fmap`` for the given channel and file id.

    Raises ``KeyError`` if ``fmap`` has no entry for that combination.
    """
    key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[key]

In [39]:
def get_file_and_channel(filename):
    """Extract the file id and channel id from a read/file name.

    The name must contain a fragment of the form ``ch<digits>_file<digits>_``,
    e.g. ``"ch135_file0_2d"``.

    Parameters
    ----------
    filename : str
        Name to parse.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)`` -- note the order: file id first.

    Raises
    ------
    ValueError
        If ``filename`` does not contain the expected fragment.
    """
    match = re.search(r'ch(\d+)_file(\d+)_', filename)
    if match is None:
        # without this guard the failure would be an opaque AttributeError on None
        raise ValueError(
            "cannot extract channel/file id from {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in match.groups())
    return file_id, channel_id

In [40]:
# Open the alignment file and collect every record yielded by fetch() in a list.
samfile = pysam.AlignmentFile(args["alignment"])
reads = list(samfile.fetch())
# number of reads (displayed as the cell output)
len(reads)

51

In [50]:
# Compare, per aligned read, the reference bases against (a) the sequence stored
# in the record's fastq entry and (b) the sequence decoded by predict().
# NOTE(review): total_events and result are initialised here but never updated
# in this cell.
total_events = 0

result = []
# NOTE: reads[:1] restricts the loop to the first read only.
for i, read in enumerate(reads[:1]):
    # the read name encodes channel and file id; use them to find the event record
    file_id, channel_id = get_file_and_channel(read.query_name)
    print(i, file_id, channel_id, read.query_name)
    file_obj = get_file(channel_id, file_id)

    # event means, with missing (None) values dropped
    events = [x["template.mean"] for x in file_obj["events"]]
    events = [x for x in events if x is not None]
    # second line of the fastq text, i.e. the sequence line
    # (presumably the Metrichor base call, judging by the name -- confirm)
    metrichor_seq = file_obj["fastq"].split("\n")[1]
    # reference bases at the positions reported by get_reference_positions()
    ref_seq = "".join([ref[x] for x in read.get_reference_positions()])
    called_seq = predict(events)
    # edit distance reference vs. fastq sequence
    print(editdistance.eval(ref_seq, metrichor_seq))
    # edit distance reference vs. HMM-decoded sequence
    print(editdistance.eval(ref_seq, called_seq))
    # length of the reference stretch, for scale
    print(len(ref_seq))

    
    

(0, 0, 135, 'ch135_file0_2d')
1768
3259
6112
