In [168]:
import ghmm
from collections import OrderedDict
import cPickle as pickle
import numpy as np
from itertools import product as iterproduct, chain
from pprint import pprint
import pysam
import os
import pandas
from copy import deepcopy
import re
import editdistance
import sys
import math
import random
from nbwrapper import getargs
from multiprocessing import Pool
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/alignment_validation.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/alignment_lib.ipynb"
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/analysis_lib.ipynb"

In [169]:
args = getargs()

In [170]:
NMERS = 6

In [171]:
args = {
    "events" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.events.template.pickle",
    "out_basename" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.called",
    "ref": "/home/ibis/gregor.sturm/nanopore/NanoporeData/PublicData/LomanLab_MAP-006/ecoli_mg1655.fa",
    "hmm_params": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1.model.pickle",
    "corr_model": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/context_prediction/models/model-test2.pickle",
    "ncores": 62,
    "nmers": NMERS,
    "multivariate": False
}
# args = {
#     "events" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/wouter_lambda006_100.events.template.pickle",
#     "out_basename" : "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.called",
#     "ref": "/home/ibis/gregor.sturm/nanopore/own/notebooks/03_pipeline/lambda_ref.fasta",
#     "hmm_params": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1.model.pickle",
#     "corr_model": "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/context_prediction/models/model-test2.pickle",
#     "ncores": 62,
#     "nmers": NMERS,
#     "multivariate": False
# }


In [172]:
NMERS = int(args["nmers"])
NSTATES = 4**NMERS
MULTIVARIATE = bool(int(args["multivariate"]))
args["ncores"] = int(args["ncores"])

In [173]:
HMM_PARAMS = pickle.load(open(args["hmm_params"], 'rb'))
HMM_PARAMS = HMM_PARAMS["/opt/chimaera/model/r7.3_e6_70bps_6mer/template_median68pA.model"]
ALL_KMERS = ["".join(x) for x in iterproduct("ACGT", repeat=NMERS)]
assert HMM_PARAMS["kmer"].tolist() == ALL_KMERS

# Train Model 

In [174]:
def mk_transmat1(nmers):
    """make a transition matrix assuming move=1"""
    n_components = len(ALL_KMERS)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(ALL_KMERS):
        for i, to_kmer in enumerate(ALL_KMERS):
            p = 1/4. if from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)] else 0.
            transmat[j, i] = p          
            
    return transmat.tolist()

In [175]:
def mk_transmat0(nmers):
    """make a transition matrix assuming move=0 or move=1"""
    n_components = len(ALL_KMERS)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(ALL_KMERS):
        for i, to_kmer in enumerate(ALL_KMERS):
            p = 0
            if from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                """move=1"""
                p = (9/10.) * (1/4.) 
            elif from_kmer == to_kmer:
                """move=0"""
                p = (1/10.) * 1
            transmat[j, i] = p          
            
    return transmat.tolist()

In [176]:
def mk_transmat2(nmers):
    """make a transition matrix assuming move=0 or move=1 or move=2"""
    n_components = len(ALL_KMERS)
    transmat = np.empty((n_components, n_components))
    for j, from_kmer in enumerate(ALL_KMERS):
        for i, to_kmer in enumerate(ALL_KMERS):
            p = 0
            if from_kmer[-(NMERS-2):] == to_kmer[:(NMERS-2)]:
                """move=2"""
                p = (2/50.) * (1/16.)
            elif from_kmer[-(NMERS-1):] == to_kmer[:(NMERS-1)]:
                """move=1"""
                p = (47/50.) * (1/4.) 
            elif from_kmer == to_kmer:
                """move=0"""
                p = (1/50.) * 1
            transmat[j, i] = p          
            
    return transmat.tolist()

In [177]:
mk_transmat = mk_transmat2

In [178]:
def mk_model_simple(): 
    """ simple model, only taking the means into account. """
    A = mk_transmat(NMERS)
    B = HMM_PARAMS[["level_mean", "level_stdv"]].values.tolist() #mu, std of each state
    pi = [1/float(NSTATES)] * NSTATES   # initial probabilities per state
    # generate model from parameters
    model = ghmm.HMMFromMatrices(F,ghmm.GaussianDistribution(F), A, B, pi)
    return model

In [179]:
F = ghmm.Float()  # emission domain of this model
def mk_model():
    if MULTIVARIATE: 
        return mk_model_multivariate()
    else: 
        return mk_model_simple()

In [180]:
model = mk_model()
s = str(model)
print(s)

GaussianEmissionHMM(N=4096)
  state 0 (initial=0.00, mu=62.78, sigma=0.84)
    Transitions: ->0 (0.00), ->1 (0.00), ->2 (0.00), ->3 (0.00), ->4 (0.00), ->5 (0.00), ->6 (0.00), ->7 (0.00), ->8 (0.00), ->9 (0.00), ->10 (0.00), ->11 (0.00), ->12 (0.00), ->13 (0.00), ->14 (0.00), ->15 (0.00)
  state 1 (initial=0.00, mu=58.02, sigma=0.66)
    Transitions: ->1 (0.02), ->4 (0.23), ->5 (0.23), ->6 (0.23), ->7 (0.23), ->16 (0.00), ->17 (0.00), ->18 (0.00), ->19 (0.00), ->20 (0.00), ->21 (0.00), ->22 (0.00), ->23 (0.00), ->24 (0.00), ->25 (0.00), ->26 (0.00), ->27 (0.00), ->28 (0.00), ->29 (0.00), ->30 (0.00), ->31 (0.00)

  ...

  state 4094 (initial=0.00, mu=45.36, sigma=0.64)
    Transitions: ->4064 (0.00), ->4065 (0.00), ->4066 (0.00), ->4067 (0.00), ->4068 (0.00), ->4069 (0.00), ->4070 (0.00), ->4071 (0.00), ->4072 (0.00), ->4073 (0.00), ->4074 (0.00), ->4075 (0.00), ->4076 (0.00), ->4077 (0.00), ->4078 (0.00), ->4079 (0.00), ->4088 (0.23), ->4089 (0.23), ->4090 (0.23), ->4091 (0.23), ->409

In [181]:
def result_to_seq(result):
    states = result[0]
    kmers = [ALL_KMERS[x] for x in states]
    seq = [kmer[0] for kmer in kmers] + [kmers[-1][1:]]
    return "".join(seq)

In [182]:
def predict(events):
    """mixed is a set of tuples (event_mean, event_stdv)"""
    emissions = [x[0] for x in events]
    seq = ghmm.EmissionSequence(F, emissions)
    result = model.viterbi(seq)
    return result_to_seq(result)

In [183]:
s = model.sampleSingle(10)
s = [x for x in s]
seq = zip([s[i] for i in range(0, len(s), 2)], [s[i] for i in range(1, len(s), 2)])

In [184]:
predict(seq)

'TAGTGGCGCC'

In [185]:
model

<ghmm.GaussianEmissionHMM at 0x7f2f00435210>

## multistep-prediction

In [186]:
from sklearn import ensemble
import joblib
import mltools

In [187]:
corr_model = joblib.load(args["corr_model"])

In [188]:
OFFSET = 20

In [189]:
def normalize_col(features, col): 
    col_data = [f[col] for f in features]
    col_data = mltools.normalize(col_data)
    for i, f in enumerate(features): 
        f[col] = col_data[i]
    return features

In [190]:
def correct_events(events, seq): 
    corr_range = (OFFSET, len(seq)-OFFSET-NMERS)
#     print (">> started feature generation")
    features = []
    for i in range(*corr_range):
        event = events[i]
        feat = []
        feat.append(event[0])
        feat.append(event[1])
        feat.extend(mltools.seq2binvec(seq[i-OFFSET:i+NMERS+OFFSET]))
        features.append(feat)

    features = normalize_col(features, 0)
    features = normalize_col(features, 1)
    
    #print([x for x in enumerate(features[0])])
        
#     print (">> started correction prediction")
    correction = corr_model.predict(features)
    for i, j  in enumerate(range(*corr_range)):
        mean, stdv = events[j]
        tmp_mean = mean - .5 * correction[i]
        ratio = tmp_mean/mean
        tmp_stdv = ratio * stdv
        events[j] = tmp_mean, tmp_stdv
#     print (">> correction applied")
    return events

In [191]:
def predict_iterative(events, n_steps=3): 
    for _ in range(n_steps):
#         print (">> predict {0}".format(_))
        seq = predict(events)
        if _ < n_steps-1: 
#             print (">> correct {0}".format(_))
#             print(seq)
#             print(means)
            events = correct_events(events, seq)
#             print(means)
    return seq

In [192]:
file_data = pickle.load(open(args["events"], 'rb'))
file_data = [f for f in file_data if f is not None]

In [193]:
file_obj = correct_read(file_data[7], col="mean")
events = [(x["mean"], x["stdv"]) for x in file_obj["events"].to_dict("records")]

In [194]:
seq = predict(events)

In [195]:
seq[18:40]

'CTTCTTGTTCAGTTTCTGAGCT'

In [196]:
events[18:30]

[(55.60145482006586, 0.6603919956543661),
 (59.63927379425131, 2.321354572072122),
 (50.03479198061651, 0.48440114874166157),
 (48.50134036335742, 0.562943769826082),
 (41.90360182508564, 1.2487938620804517),
 (46.840335259801854, 0.9982156075538023),
 (59.13531558945736, 0.8090309460540313),
 (62.69417833392122, 1.7257578113042558),
 (49.48192750307938, 0.8809297388753783),
 (44.32932399549752, 0.7315835445520823),
 (43.20622155658872, 0.5149720773474338),
 (45.87098708017269, 0.7046783476310581)]

In [197]:
corr_range = (OFFSET, len(seq)-OFFSET-NMERS)
#     print (">> started feature generation")
features = []
for i in range(*corr_range):
    event = events[i]
    feat = []
    feat.append(event[0])
    feat.append(event[1])
    feat.extend(mltools.seq2binvec(seq[i-OFFSET:i+NMERS+OFFSET]))
    features.append(feat)

features = normalize_col(features, 0)
features = normalize_col(features, 1)

#print([x for x in enumerate(features[0])])

In [198]:
[x for x in enumerate(features[0])]

[(0, 0.3020380852778277),
 (1, 0.06079100131230351),
 (2, 0),
 (3, 1),
 (4, 0),
 (5, 0),
 (6, 1),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 1),
 (13, 0),
 (14, 0),
 (15, 0),
 (16, 0),
 (17, 1),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 1),
 (22, 1),
 (23, 0),
 (24, 0),
 (25, 0),
 (26, 0),
 (27, 0),
 (28, 0),
 (29, 1),
 (30, 0),
 (31, 0),
 (32, 1),
 (33, 0),
 (34, 0),
 (35, 0),
 (36, 0),
 (37, 1),
 (38, 0),
 (39, 1),
 (40, 0),
 (41, 0),
 (42, 0),
 (43, 0),
 (44, 0),
 (45, 1),
 (46, 0),
 (47, 0),
 (48, 1),
 (49, 0),
 (50, 0),
 (51, 0),
 (52, 0),
 (53, 1),
 (54, 1),
 (55, 0),
 (56, 0),
 (57, 0),
 (58, 0),
 (59, 0),
 (60, 1),
 (61, 0),
 (62, 0),
 (63, 0),
 (64, 0),
 (65, 1),
 (66, 1),
 (67, 0),
 (68, 0),
 (69, 0),
 (70, 1),
 (71, 0),
 (72, 0),
 (73, 0),
 (74, 0),
 (75, 1),
 (76, 0),
 (77, 0),
 (78, 0),
 (79, 0),
 (80, 0),
 (81, 1),
 (82, 0),
 (83, 0),
 (84, 0),
 (85, 1),
 (86, 0),
 (87, 1),
 (88, 0),
 (89, 0),
 (90, 0),
 (91, 0),
 (92, 0),
 (93, 1),
 (94, 0),
 (95, 0),
 (96, 0),
 (97,

In [199]:
#     print (">> started correction prediction")
correction = corr_model.predict(features)

In [200]:
OFFSET

20

In [201]:
correction[:10]

array([ 0.7993523 ,  0.37097458,  0.2274876 , -0.63482867,  0.36194889,
        0.74804053,  1.0275012 ,  0.15076036, -0.02243579, -0.12817443])

In [202]:
events[OFFSET:OFFSET+10]

[(50.03479198061651, 0.48440114874166157),
 (48.50134036335742, 0.562943769826082),
 (41.90360182508564, 1.2487938620804517),
 (46.840335259801854, 0.9982156075538023),
 (59.13531558945736, 0.8090309460540313),
 (62.69417833392122, 1.7257578113042558),
 (49.48192750307938, 0.8809297388753783),
 (44.32932399549752, 0.7315835445520823),
 (43.20622155658872, 0.5149720773474338),
 (45.87098708017269, 0.7046783476310581)]

In [203]:
for i, j  in enumerate(range(*corr_range)):
    mean, stdv = events[j]
    tmp_mean = mean - .5 * correction[i]
    ratio = tmp_mean/mean
    tmp_stdv = ratio * stdv
    events[j] = tmp_mean, tmp_stdv
#     print (">> correction applied")

In [204]:
events[OFFSET:OFFSET+10]

[(49.635115830321332, 0.48053176948247267),
 (48.315853071308801, 0.56079086199593942),
 (41.78985802401013, 1.2454041162245775),
 (47.157749594072861, 1.004980032717995),
 (58.954341142318526, 0.80655503252045335),
 (62.320158069883199, 1.7154623036605985),
 (48.968176904224258, 0.87178340598710613),
 (44.253943814971343, 0.73033951701708277),
 (43.217439452614848, 0.51510578270309593),
 (45.935074292751452, 0.70566286690868674)]

# Validate Model 

In [205]:
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [206]:
assert os.path.isfile(args["events"])

In [207]:
ref = load_ref(args["ref"])

['>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome']
AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAAT


In [208]:
file_data = pickle.load(open(args["events"], 'rb'))
file_data = [f for f in file_data if f is not None]

In [209]:
prepare_filemap(file_data)

In [210]:
def basecall_read(file_obj):
    file_obj = correct_read(file_obj, col="mean")
    events = [(x["mean"], x["stdv"]) for x in file_obj["events"].to_dict("records")]
    called_seq = predict_iterative(events)
    return (file_obj["channel"], file_obj["file_id"], called_seq)

In [211]:
# """ train with baum-welch """
# for i, file_obj in enumerate(file_data): 
#     sys.stdout.write('\rdone {0:%}'.format(i/float(len(file_data))))
#     train_read(file_obj)

In [212]:
p = Pool(args["ncores"])

In [None]:
#basecall_read(file_data[3])

In [None]:
print("Prediction: ")
results = []
try:
    for i, res in enumerate(p.imap_unordered(basecall_read, file_data), 1):
        results.append(res)
        sys.stdout.write('\rdone {0:%}'.format(i/float(len(file_data))))
    p.close()
    p.join()
except KeyboardInterrupt:
    p.terminate()

### Stats

In [None]:
types = ["metrichor", "called", "random"]
fasta_files = {t: "{0}.{1}.fa".format(args["out_basename"], t) for t in types}

In [None]:
## metrichor fasta
with open(fasta_files["metrichor"], 'w') as f: 
    for file_obj in file_data: 
        f.write(">ch{0}_file{1}_metrichor".format(file_obj["channel"], file_obj["file_id"])+ "\n")
        f.write(file_obj["fastq"].split("\n")[1] + "\n")

In [None]:
## called fasta/random fasta
with open(fasta_files["called"], 'w') as f: 
    with open(fasta_files["random"], 'w') as fr:
        for channel, file_id, seq in results: 
            f.write(">ch{0}_file{1}_called".format(channel, file_id)+ "\n")
            fr.write(">ch{0}_file{1}_random".format(channel, file_id)+ "\n")
            f.write(seq + "\n")
            fr.write("".join([random.choice("ACGT") for _ in range(len(seq))]))

In [None]:
for t in types: 
    sam_file = "{0}.{1}.sam".format(args["out_basename"], t)
    graphmap(args["ref"], fasta_files[t], sam_file, args["ncores"])
    prepare_sam("{0}.{1}".format(args["out_basename"], t))

In [None]:
def mk_stat(t):
    samfile = "{0}.{1}.sorted.bam".format(args["out_basename"], t)
    sst = samstats(samfile, ref, ncores=args["ncores"])
    return pandas.DataFrame(sst.print_summary())

In [None]:
stats = map(mk_stat, types)
print(types)
side_by_side(*stats)

In [None]:
# for t, df in zip(types, stats):
#     with open("{0}.stats.{1}.html".format(args["out_basename"], t), 'w') as f:
#         f.write(df.to_html())

In [None]:
# def score_consensus(t):
#     consensus = mk_consensus("{0}.{1}.sorted.bam".format(args["out_basename"], t), ref_file)
#     return(consensus)
#     consensus = consensus.split("\n")[1].to_upper()
#     score = needle(ref, consensus)
#     return (consensus, score)

In [None]:
# p = Pool(args["ncores"])
# try:
#     consensus = p.map(score_consensus, types)
#     p.close()
# except KeyboardInterrupt:
#     p.terminate()

In [None]:
# consensus

In [None]:
# mk_consensus("{0}.{1}.sorted.bam".format(args["out_basename"], "metrichor"), ref_file)