In [2]:
import ghmm
import pickle
import numpy as np
from itertools import product
import pysam
import os
import pandas
import re
import editdistance
from Bio import pairwise2

In [3]:
# Length of the k-mers that the HMM states stand for.
NMERS = 3
# One state for every possible k-mer over a four-letter alphabet.
NSTATES = pow(4, NMERS)

In [4]:
# Load the pickled per-k-mer emission parameters. The context manager guarantees
# that the file handle is closed again once the data has been read.
# NOTE: pickle.load can execute arbitrary code -- only point this at trusted files.
with open("/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/hmm_params.pickle", 'rb') as params_file:
    hmm_params_raw = pickle.load(params_file)

# Combine the parallel "kmers" / "means" / "stdv" sequences into
# (kmer, mean, stdv) tuples and sort them, so the position of a tuple in the
# list can serve as the index of the corresponding HMM state.
HMM_PARAMS = sorted(zip(hmm_params_raw["kmers"], hmm_params_raw["means"], hmm_params_raw["stdv"]))

# Build Model from Pre-computed Parameters

In [5]:
def mk_transmat(nmers):
    """Make a transition matrix assuming single base steps.

    A transition from one k-mer to another is allowed when the last
    ``nmers - 1`` bases of the source equal the first ``nmers - 1`` bases of
    the target, i.e. when the window has moved by exactly one base. Every
    allowed transition gets probability 1/4, all other entries are 0.

    Parameters
    ----------
    nmers : int
        Length of the k-mers stored in the module-level ``HMM_PARAMS``.

    Returns
    -------
    list of list of float
        Square matrix as nested lists; entry ``[j][i]`` is the probability of
        going from k-mer ``j`` to k-mer ``i``, with indices following the
        order of ``HMM_PARAMS``.
    """
    all_kmers = [x[0] for x in HMM_PARAMS]
    n_components = len(all_kmers)
    # number of bases two consecutive k-mers share after a single-base step
    overlap = nmers - 1
    transmat = np.zeros((n_components, n_components))
    for j, from_kmer in enumerate(all_kmers):
        # explicit start index instead of [-overlap:], which would return the
        # whole k-mer for overlap == 0
        suffix = from_kmer[len(from_kmer) - overlap:]
        for i, to_kmer in enumerate(all_kmers):
            if suffix == to_kmer[:overlap]:
                transmat[j, i] = 1 / 4.

    return transmat.tolist()

In [6]:
# Assemble a continuous HMM with one Gaussian emission distribution per k-mer state.

F = ghmm.Float()  # emission domain of this model

# transition matrix between the k-mer states
A = mk_transmat(NMERS)
# one [mean, stdv] pair per state, in the same order as HMM_PARAMS
B = [list(entry[1:3]) for entry in HMM_PARAMS]
# uniform initial distribution: every state gets the same probability
pi = [1.0 / NSTATES] * NSTATES

# build the model from the matrices above
model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

In [7]:
# Print the model's string representation to inspect its states and transitions.
s = str(model)
print(s)

GaussianEmissionHMM(N=64)
  state 0 (initial=0.02, mu=70.17, sigma=3.84)
    Transitions: ->0 (0.25), ->1 (0.25), ->2 (0.25), ->3 (0.25)
  state 1 (initial=0.02, mu=63.14, sigma=3.47)
    Transitions: ->4 (0.25), ->5 (0.25), ->6 (0.25), ->7 (0.25)

  ...

  state 62 (initial=0.02, mu=53.52, sigma=2.85)
    Transitions: ->56 (0.25), ->57 (0.25), ->58 (0.25), ->59 (0.25)
  state 63 (initial=0.02, mu=49.86, sigma=2.78)
    Transitions: ->60 (0.25), ->61 (0.25), ->62 (0.25), ->63 (0.25)



In [8]:
def result_to_seq(result):
    """Translate a decoding result into a base sequence.

    Parameters
    ----------
    result : sequence
        Only ``result[0]`` is used; it must be an iterable of state indices
        into the module-level ``HMM_PARAMS``.

    Returns
    -------
    str
        One base per state: the base at position ``NMERS // 2`` (the centre)
        of the k-mer that belongs to the state.
    """
    states = result[0]
    all_kmers = [x[0] for x in HMM_PARAMS]
    # floor division keeps the index an integer even where `/` is true division
    center = NMERS // 2
    return "".join(all_kmers[state][center] for state in states)

In [9]:
def predict(means):
    """Call bases for a series of event means.

    The values are wrapped in a ghmm emission sequence, decoded with the
    model's Viterbi algorithm and the resulting state path is translated
    into a string of bases via ``result_to_seq``.
    """
    decoded = model.viterbi(ghmm.EmissionSequence(F, means))
    return result_to_seq(decoded)

In [10]:
# Smoke test: decode a short, hand-written series of event means.
predict([44.2, 44.3, 56, 58.2, 56.2, 58.1, 58.2, 60, 30.2])

u'TTTGTGTGT'

# Validate Model 

In [11]:
# Input files for the validation run, keyed by role.
args = dict([
    # pickled event data
    ("events", "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.pickle"),
    # sorted BAM alignment
    ("alignment", "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_alignment.sorted.bam"),
    # reference sequence (FASTA)
    ("ref", "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa"),
])

In [12]:
# Show the working directory; relative paths used in later cells resolve against it.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline


In [13]:
# Fail early -- and say which file is missing -- if a configured input is absent.
# The reference is checked as well because later cells read it too.
for input_name in ("events", "alignment", "ref"):
    assert os.path.isfile(args[input_name]), \
        "{0} file not found: {1}".format(input_name, args[input_name])

In [14]:
ref_file = args["ref"]
test = !cat {ref_file} | grep ">"
print(test)
ref = !cat {ref_file} | grep -v ">"
ref = ref[0]
print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [15]:
# Load the pickled event records; the context manager closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code -- only point this at trusted files.
with open(args["events"], 'rb') as events_file:
    file_data_raw = pickle.load(events_file)
# drop the None placeholders so that every remaining entry is a usable record
file_data = [f for f in file_data_raw if f is not None]

In [16]:
# Index the event records by a "ch<channel>_file<file_id>" key for direct lookup.
fmap = {
    "ch{0}_file{1}".format(entry["channel"], entry["file_id"]): entry
    for entry in file_data
}

In [17]:
def get_file(channel, file_id):
    """Return the event record stored in ``fmap`` for a channel / file id pair.

    Raises ``KeyError`` when no record exists for that combination.
    """
    key = "ch{0}_file{1}".format(channel, file_id)
    return fmap[key]

In [18]:
def get_file_and_channel(filename):
    """Extract the file id and channel id from a read / file name.

    The name must contain a ``ch<digits>_file<digits>_`` fragment, e.g.
    ``ch157_file1_2d``.

    Parameters
    ----------
    filename : str
        Name to parse.

    Returns
    -------
    tuple of (int, int)
        ``(file_id, channel_id)`` -- note that the file id comes first.

    Raises
    ------
    ValueError
        If the name does not contain the expected fragment.
    """
    result = re.search(r'ch(\d+)_file(\d+)_', filename)
    if result is None:
        # without this check the failure would be an AttributeError on None
        raise ValueError(
            "cannot extract channel and file id from {0!r}".format(filename))
    channel_id, file_id = (int(group) for group in result.groups())
    return file_id, channel_id

In [19]:
# Open the alignment file and pull every fetched read into a list.
samfile = pysam.AlignmentFile(args["alignment"])
reads = list(samfile.fetch())
len(reads)

51

In [22]:
# NOTE: total_events is initialised here but never updated inside this cell.
total_events = 0

# (read name, HMM base call) for every aligned read
called_seqs = []
for i, read in enumerate(reads):
    # find the event record that belongs to this aligned read
    file_id, channel_id = get_file_and_channel(read.query_name)
    print(i, file_id, channel_id, read.query_name)
    file_obj = get_file(channel_id, file_id)

    # template event means, skipping events that carry no value
    events = [
        event["template.mean"]
        for event in file_obj["events"]
        if event["template.mean"] is not None
    ]
    # second line of the stored FASTQ text
    metrichor_seq = file_obj["fastq"].split("\n")[1]
    # reference bases at the positions this read is aligned to
    ref_seq = "".join(ref[pos] for pos in read.get_reference_positions())
    called_seq = predict(events)

    # per read: edit distance of the stored call, edit distance of the HMM
    # call, and the length of the aligned reference stretch
    print(editdistance.eval(ref_seq, metrichor_seq))
    print(editdistance.eval(ref_seq, called_seq))
    print(len(ref_seq))
    called_seqs.append((read.query_name, called_seq))
    
    

(0, 1, 157, 'ch157_file1_2d')
1376
3405
6266
(1, 6, 156, 'ch156_file6_2d')
1722
3126
6166
(2, 0, 135, 'ch135_file0_2d')
1768
3239
6112
(3, 10, 206, 'ch206_file10_2d')
2021
3155
6129
(4, 5, 204, 'ch204_file5_2d')
1801
3167
6135
(5, 19, 135, 'ch135_file19_2d')
2072
3302
6162
(6, 14, 209, 'ch209_file14_2d')
2156
3187
6170
(7, 15, 215, 'ch215_file15_2d')
1681
3231
6226
(8, 27, 215, 'ch215_file27_2d')
1866
3032
6161
(9, 3, 211, 'ch211_file3_2d')
1884
3282
6307
(10, 9, 215, 'ch215_file9_2d')
1971
3310
5720
(11, 1, 203, 'ch203_file1_2d')
3560
3024
5681
(12, 0, 157, 'ch157_file0_2d')
3726
3262
6122
(13, 1, 201, 'ch201_file1_2d')
3944
3395
6100
(14, 8, 141, 'ch141_file8_2d')
3790
3137
6092
(15, 38, 132, 'ch132_file38_2d')
3682
3493
5929
(16, 28, 211, 'ch211_file28_2d')
3852
3340
6225
(17, 5, 227, 'ch227_file5_2d')
3624
3680
5941
(18, 11, 223, 'ch223_file11_2d')
3741
3418
5976
(19, 35, 132, 'ch132_file35_2d')
1895
3219
5574
(20, 18, 142, 'ch142_file18_2d')
1981
3431
6059
(21, 20, 211, 'ch211_fil

In [23]:
# Dump the HMM base calls as FASTA: a ">name" header line followed by the sequence.
with open("basecall_test.fa", 'w') as f:
    for read_name, bases in called_seqs:
        f.write(">" + read_name + "\n" + bases + "\n")

In [24]:
!/home/ibis/gregor.sturm/nanopore/tools/graphmap/graphmap \
-r ./../../../david_eccles_bc_ideas/mouse_ref.fa -d basecall_test.fa -o basecall_test.sam -t4

[Index 15:01:25] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 15:01:25] Index already exists. Loading from file.
[Index 15:01:27] Secondary index already exists. Loading from file.
[Index 15:01:28] Index loaded in 0.21 sec.
[Index 15:01:28] Memory consumption: [currentRSS = 513 MB, peakRSS = 513 MB]

[Run 15:01:28] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 15:01:28] Reference genome is assumed to be linear.
[Run 15:01:28] Only one alignment will be reported per mapped read.
[ProcessReads 15:01:28] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 15:01:28] Batch of 51 reads (0 MiB) loaded in 0.00 sec. (28845000 bases)
[ProcessReads 15:01:28] Memory consumption: [currentRSS = 514 MB, peakRSS = 514 MB]
[ProcessReads 15:01:28] Using 4 threads.
[ProcessReads 15:01:29] [CPU time: 2.00 sec, RSS: 524 MB] Read: 51/51 (100.00%) [m: 50, u: 1]                     

In [25]:
def prepare_sam(basename):
    """Convert, sort and index an alignment by shelling out to samtools.

    ``basename`` is the path of the SAM file without its ``.sam`` extension.
    The three commands turn ``<basename>.sam`` into ``<basename>.bam``, sort
    it with ``<basename>.sorted`` as the output name and index
    ``<basename>.sorted.bam``.

    NOTE(review): ``samtools sort <in.bam> <out.prefix>`` looks like the
    legacy calling convention of old samtools releases -- confirm it matches
    the installed version. ``basename`` is interpolated into the shell
    commands unquoted.
    """
    # SAM (text) -> BAM (binary)
    !samtools view -S -b {basename}.sam > {basename}.bam
    # sort the BAM; the second argument is the output name
    !samtools sort {basename}.bam {basename}.sorted
    # index the sorted BAM
    !samtools index {basename}.sorted.bam

In [27]:
# Convert, sort and index basecall_test.sam, the alignment written by the graphmap step.
prepare_sam("./basecall_test")

[samopen] SAM header is present: 1 sequences.
