In [2]:
import ghmm
from collections import OrderedDict
import cPickle as pickle
import numpy as np
from itertools import product as iterproduct, chain
from pprint import pprint
import pysam
import os
import pandas
from copy import deepcopy
import re
import editdistance
import sys
import random
from nbwrapper import getargs
from multiprocessing import Pool
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_validation.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/alignment_lib.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/analysis_lib.ipynb"

In [3]:
args = getargs()



In [4]:
args = {
    "events" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_events.template.pickle",
    "raw" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_raw.pickle",
    "out_basename" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_calling",
    "ref": "/home/ibis/gregor.sturm/nanopore/david_eccles_bc_ideas/mouse_ref.fa",
    "hmm_params": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/david_hmm_params_raw_3mer.pickle",
    "ncores": 60,
    "nmers": 3,
}

# args = {
#     "events" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_events.template.pickle",
#     "out_basename" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_calling",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "hmm_params": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_hmm_params_{0}mer.pickle".format(NMERS),
#     "ncores": 62,
#     "nmers": 3,
#     "multivariate": True
# }

In [205]:
NMERS = int(args["nmers"])
NSTATES = 4**NMERS
args["ncores"] = int(args["ncores"])
MEAN_LENGTH = 175

In [206]:
HMM_PARAMS = pickle.load(open(args["hmm_params"], 'rb'))
HMM_PARAMS = OrderedDict(HMM_PARAMS)
ALL_KMERS = ["".join(x) for x in iterproduct("ACGT", repeat=NMERS)]
assert HMM_PARAMS.keys() == ALL_KMERS

# Train Model 

In [207]:
def mk_transmat1(nmers):
    """make a transition matrix assuming move=1"""
    n_components = len(ALL_KMERS)
    transmat = np.empty((n_components, n_components))
    p_move = float(1)/MEAN_LENGTH
    p_stay = 1-p_move
    for j, from_kmer in enumerate(ALL_KMERS):
        for i, to_kmer in enumerate(ALL_KMERS):
            if from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                p = p_move * 1/4.
            elif from_kmer == to_kmer: 
                p = p_stay
            else: 
                p = 0
            transmat[j, i] = p          
            
    return transmat.tolist()

In [208]:
def mk_transmat2(nmers):
    """make a transition matrix assuming move=1"""
    n_components = len(ALL_KMERS)
    transmat = np.empty((n_components, n_components))
    p_move = float(1)/MEAN_LENGTH
    p_stay = 1-p_move
    for j, from_kmer in enumerate(ALL_KMERS):
        for i, to_kmer in enumerate(ALL_KMERS):
            if from_kmer[-(NMERS-2):] == to_kmer[:(NMERS-2)]:
                p = p_move * 1/3. * 1/16. 
            elif from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                p = p_move *2/3. * 1/4.
            elif from_kmer == to_kmer: 
                p = p_stay
            else: 
                p = 0
            transmat[j, i] = p          
            
    return transmat.tolist()

In [209]:
mk_transmat = mk_transmat1

In [210]:
F = ghmm.Float()  # emission domain of this model

In [211]:
def mk_model_simple(): 
    """ simple model, only taking the means into account. """
    A = mk_transmat(NMERS)
    B = [ [float(df[['mean']].mean()), float(df[['mean']].std())] #mu1, stdv
            for df in HMM_PARAMS.values()]   # parameters of emission distributions in pairs of (mu, sigma)
    pi = [1/float(NSTATES)] * NSTATES   # initial probabilities per state

    # generate model from parameters
    model = ghmm.HMMFromMatrices(F,ghmm.GaussianDistribution(F), A, B, pi)
    return model

In [212]:
def mk_model():
    return mk_model_simple()

In [213]:
model = mk_model()
s = str(model)
print(s)

GaussianEmissionHMM(N=64)
  state 0 (initial=0.02, mu=302.24, sigma=17.15)
    Transitions: ->0 (0.00), ->1 (0.00), ->2 (0.00), ->3 (0.00)
  state 1 (initial=0.02, mu=272.18, sigma=15.59)
    Transitions: ->1 (0.99), ->4 (0.00), ->5 (0.00), ->6 (0.00), ->7 (0.00)

  ...

  state 62 (initial=0.02, mu=230.94, sigma=12.95)
    Transitions: ->56 (0.00), ->57 (0.00), ->58 (0.00), ->59 (0.00), ->62 (0.99)
  state 63 (initial=0.02, mu=215.48, sigma=12.62)
    Transitions: ->60 (0.00), ->61 (0.00), ->62 (0.00), ->63 (0.00)



In [214]:
def collapse_seq(seq):
    if len(seq) == 0: 
        return []
    collapsed = [seq[0]] + [seq[i] for i in range(1, len(seq)) if seq[i-1] != seq[i]]
    return collapsed

In [215]:
def result_to_seq(result):
    states = result[0]
    kmers = [ALL_KMERS[x] for x in states]
    seq = [kmer[NMERS/2] for kmer in kmers]
    seq = collapse_seq(seq)
    return "".join(seq)

In [216]:
def predict(raw):
    """mixed is a set of tuples (event_mean, event_stdv)"""
    seq = ghmm.EmissionSequence(F, raw.tolist())
    result = model.viterbi(seq)
    return result_to_seq(result)

In [217]:
s = model.sampleSingle(1000)
s = np.array([x for x in s])

In [218]:
predict(s)

'TCTGATA'

# Validate Model 

In [18]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [19]:
assert os.path.isfile(args["events"])

In [20]:
ref_file = args["ref"]
test = !cat {ref_file} | grep ">"
print(test)
ref = !cat {ref_file} | grep -v ">"
ref = ref[0]
print(ref[:100])

['>mmusMT_PCR1']
GTTAATGTAGCTTAATAACAAAGCAAAGCACTGAAAATGCTTAGATGGATAATTGTATCCCATAAACACAAAGGTTTGGTCCTGGCCTTATAATTAATTA


In [21]:
file_data = pickle.load(open(args["events"], 'rb'))
file_data = [f for f in file_data if f is not None]

In [22]:
raw_data = pickle.load(open(args["raw"], 'rb'))

In [23]:
prepare_filemap(file_data)

In [232]:
def basecall_read(file_obj):
    try:
        read_min = file_obj["events"]["template.start"].iloc[0]
        read_max = file_obj["events"]["template.end"].iloc[-1]
    except KeyError: 
        read_min = file_obj["events"]["start"].iloc[0]
        read_max = file_obj["events"]["end"].iloc[-1]
    raw = raw_data[file_obj["channel"]]
    for i in range(read_min, read_max): 
        if np.isnan(raw[i]): 
            ## raw data now available
            return (file_obj["channel"], file_obj["file_id"], None)
        if raw[i] < 150: raw[i] = 150
        if raw[i] > 450: raw[i] = 450
#     raw_new = np.copy(raw)
    for i in range(read_min, read_max-50):
        raw[i] = np.mean(raw[i:i+50])      
    called_seq = predict(raw[read_min:read_max])
    return (file_obj["channel"], file_obj["file_id"], called_seq)

In [233]:
# for file_obj in file_data:
#     print(file_obj["file_id"], file_obj["channel"])
#     print(basecall_read(file_obj))

In [234]:
p = Pool(args["ncores"])

In [235]:
print("Prediction: ")
results = []
try:
    for i, res in enumerate(p.imap_unordered(basecall_read, file_data), 1):
        results.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i/float(len(file_data))))
    p.close()
    p.join()
except KeyboardInterrupt:
    p.terminate()

Prediction: 
done 100.000000%

In [236]:
for ch, fid, seq in results: 
    fo = get_file(ch, fid)
    if seq is not None:
        print(len(seq), len(fo["fastq"].split("\n")[1]))

(2163, 921)
(2519, 1168)
(1640, 754)
(2479, 1116)
(3099, 1380)
(2768, 1246)
(3812, 1614)
(4638, 2519)
(5079, 2359)
(5330, 2104)
(5835, 2658)
(6431, 2931)
(6455, 2681)
(4948, 1972)
(7998, 3517)
(7598, 3187)
(6690, 3219)
(7115, 3171)
(9207, 4050)
(7670, 3102)
(10717, 4866)
(13207, 6292)
(11731, 5322)
(14835, 6709)
(12993, 6570)
(7726, 6009)
(14301, 6770)
(13690, 6117)
(14021, 6642)
(15391, 6831)
(15483, 6636)
(15239, 6867)
(12493, 5700)
(14978, 6827)
(14585, 6255)
(13123, 6329)
(16776, 7605)
(10285, 4556)
(14692, 6831)
(14963, 6393)
(14664, 6696)
(16996, 7379)
(15426, 6586)
(10803, 4725)
(15870, 6578)
(15222, 7058)
(16740, 7380)
(15248, 6602)
(14707, 6734)
(11410, 4496)
(17080, 7450)
(15709, 6907)
(15622, 6895)
(17930, 7818)
(16453, 6878)
(16625, 7596)
(16865, 6842)
(12902, 5256)
(17568, 7621)
(13836, 6256)
(15035, 6870)
(16591, 7077)
(15927, 6757)
(16003, 6672)
(13997, 5829)
(18144, 7754)
(17253, 7251)
(17052, 6917)
(24971, 10862)
(18177, 7233)
(19500, 7132)
(33654, 14176)
(18516, 7086)

### Stats

In [237]:
types = ["metrichor", "called", "random"]
fasta_files = {t: "{0}.{1}.fa".format(args["out_basename"], t) for t in types}

In [238]:
## metrichor fasta
with open(fasta_files["metrichor"], 'w') as f: 
    for file_obj in file_data: 
        f.write(">ch{0}_file{1}_metrichor".format(file_obj["channel"], file_obj["file_id"])+ "\n")
        f.write(file_obj["fastq"].split("\n")[1] + "\n")

In [239]:
## called fasta/random fasta
with open(fasta_files["called"], 'w') as f: 
    with open(fasta_files["random"], 'w') as fr:
        for channel, file_id, seq in results: 
            if seq is not None:
                f.write(">ch{0}_file{1}_called".format(file_obj["channel"], file_obj["file_id"])+ "\n")
                fr.write(">ch{0}_file{1}_random".format(file_obj["channel"], file_obj["file_id"])+ "\n")
                f.write(seq + "\n")
                fr.write("".join([random.choice("ACGT") for _ in range(len(seq))]))

In [240]:
for t in types: 
    sam_file = "{0}.{1}.sam".format(args["out_basename"], t)
    graphmap(ref_file, fasta_files[t], sam_file, args["ncores"])
    prepare_sam("{0}.{1}".format(args["out_basename"], t))

[Index 13:54:27] Running in fast and sensitive mode. Two indexes will be used (double memory consumption).
[Index 13:54:27] Index already exists. Loading from file.
[Index 13:54:27] Secondary index already exists. Loading from file.
[Index 13:54:27] Index loaded in 0.47 sec.
[Index 13:54:27] Memory consumption: [currentRSS = 515 MB, peakRSS = 6172 MB]

[Run 13:54:27] Automatically setting the maximum allowed number of regions: max. 500, attempt to reduce after 100
[Run 13:54:27] Reference genome is assumed to be linear.
[Run 13:54:27] Only one alignment will be reported per mapped read.
[ProcessReads 13:54:27] Reads will be loaded in batches of up to 200 MB in size.
[ProcessReads 13:54:27] Batch of 81 reads (0 MiB) loaded in 0.00 sec. (30909608 bases)
[ProcessReads 13:54:27] Memory consumption: [currentRSS = 515 MB, peakRSS = 6172 MB]
[ProcessReads 13:54:27] Using 60 threads.
[ProcessReads 13:54:28] [CPU time: 12.35 sec, RSS: 528 MB] Read: 81/81 (100.00%) [m: 80, u: 1]                 

In [241]:
def mk_stat(t):
    samfile = pysam.AlignmentFile("{0}.{1}.sorted.bam".format(args["out_basename"], t))
    sst = samstats(samfile, ref)
    return pandas.DataFrame(sst.print_summary())

In [242]:
p = Pool(args["ncores"])

In [243]:
try:
    stats = p.map(mk_stat, types)
    p.close()
except KeyboardInterrupt:
    p.terminate()

In [244]:
print(types)
side_by_side(*stats)

['metrichor', 'called', 'random']


Unnamed: 0,0,1,2,3
0,mapped_reads/total_reads,80,81,98.765432%
1,significant_reads/total_reads,68,81,83.950617%
2,mapped_nts/total_nts,387166,454059,85.267774%
3,editdistance/alignment_length,201132,442591,45.444214%
4,alignment_score/alignment_length,319854,442591,72.268528%
5,SNPs/mapped_nts,78814,387166,20.356643%
6,ins/mapped_nts,66821,387166,17.259005%
7,del/mapped_nts,55425,387166,14.315565%

Unnamed: 0,0,1,2,3
0,mapped_reads/total_reads,34,73,46.575342%
1,significant_reads/total_reads,2,73,2.739726%
2,mapped_nts/total_nts,284728,371986,76.542666%
3,editdistance/alignment_length,181581,300289,60.468748%
4,alignment_score/alignment_length,-70760,300289,-23.563967%
5,SNPs/mapped_nts,78762,284728,27.662190%
6,ins/mapped_nts,87237,284728,30.638715%
7,del/mapped_nts,15561,284728,5.465216%

Unnamed: 0,0,1,2,3
0,mapped_reads/total_reads,70,73,95.890411%
1,significant_reads/total_reads,0,73,0.000000%
2,mapped_nts/total_nts,674541,887570,75.998625%
3,editdistance/alignment_length,444666,712420,62.416271%
4,alignment_score/alignment_length,-227223,712420,-31.894529%
5,SNPs/mapped_nts,193758,674541,28.724421%
6,ins/mapped_nts,212976,674541,31.573470%
7,del/mapped_nts,37879,674541,5.615522%


In [None]:
for t, df in zip(types, stats):
    with open("{0}.stats.{1}.html".format(args["out_basename"], t), 'w') as f:
        f.write(df.to_html())

In [None]:
# def score_consensus(t):
#     consensus = mk_consensus("{0}.{1}.sorted.bam".format(args["out_basename"], t), ref_file)
#     return(consensus)
#     consensus = consensus.split("\n")[1].to_upper()
#     score = needle(ref, consensus)
#     return (consensus, score)

In [None]:
# p = Pool(args["ncores"])
# try:
#     consensus = p.map(score_consensus, types)
#     p.close()
# except KeyboardInterrupt:
#     p.terminate()

In [None]:
# consensus

In [None]:
# mk_consensus("{0}.{1}.sorted.bam".format(args["out_basename"], "metrichor"), ref_file)