In [1]:
import pickle
from collections import OrderedDict
import scipy.stats
from pylab import * 
import seaborn as sns
from IPython.display import display
import pandas
from skbio.sequence import DNA
%matplotlib inline
%run "/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/lib/analysis_lib.ipynb"

In [2]:
# Shell escape: print the working directory the notebook is running in.
!pwd

/home/ibis/gregor.sturm/nanopore/own/notebooks


In [3]:
# k-mer length; passed to mk_kmer_dict() when the k-mer dictionary is built.
NMER = 6

In [4]:
# Locations of the notebook's input ("true_events", a pickle that is loaded)
# and output ("model_out", a pickle that is written at the end).
args = dict(
    true_events="/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1_100.alignment_true_events.pickle",
    model_out="/home/ibis/gregor.sturm/nanopore/own/notebooks/05_MAP006-basecaller/loman006-1.model.computed.pickle",
)

In [5]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point args["true_events"] at files from a trusted source.
# The context manager closes the file handle as soon as loading is done
# (a bare open() inside the call leaves it open until garbage collection).
with open(args["true_events"], 'rb') as true_events_file:
    true_events = pickle.load(true_events_file)

### make dict with all kmers 

In [6]:
# mk_kmer_dict / mk_event_map are not defined in this notebook; presumably they
# are provided by the analysis_lib.ipynb loaded with %run -- TODO confirm.
# all_kmers is iterated later to get one model row per k-mer.
all_kmers = mk_kmer_dict(NMER)
# ev_map is consumed by make_feature_map as a mapping kmer -> list of events.
ev_map = mk_event_map(true_events, all_kmers)

# Analysis

## compare features

In [7]:
def sorted_boxplot(ordered_dict, ylim=None):
    """
    Draw one box per key of a mapping, ordered by ascending mean value.

    Args:
        ordered_dict: mapping label -> sequence of numeric values. Labels
            with an empty sequence are ranked as if their mean were 0.
        ylim: optional y-axis limits, passed on to ax.set_ylim() when given.

    Note:
        Calls sns.set_context(), which changes the plotting context for
        subsequent figures as well.
    """
    def _rank(item):
        # item is a (label, values) pair; an empty value list ranks as 0.
        values = item[1]
        if len(values) == 0:
            return 0
        return np.mean(values)

    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1})
    by_mean = OrderedDict(sorted(ordered_dict.items(), key=_rank))
    labels = list(by_mean.keys())
    samples = list(by_mean.values())

    fig, ax = subplots(figsize=(15, 5))
    boxplot(samples)
    ax.set_xticklabels(labels, rotation=90, fontsize=8)
    if ylim is not None:
        ax.set_ylim(ylim)
    fig.show()

In [8]:
def make_feature_map(process_event, ev_map):
    """
    Build, for every kmer, the list of feature values of its events.

    Args:
        process_event: function(strand, event) with
            strand in ["template", "complement"] and
            event containing the event features. It returns the feature
            value; None or NaN marks a missing value.
        ev_map: dict kmer -> list of correct events

    Returns:
        dict kmer -> list_with_features. Only the template-strand values
        are stored, and missing values (None/NaN) are left out.
    """
    strands = ("template", "complement")
    fmap = {kmer: [] for kmer in ev_map.keys()}
    for kmer, events in ev_map.items():
        per_strand = {strand: [] for strand in strands}
        for ev in events:
            for strand in strands:
                feature = process_event(strand, ev)
                # None has to be tested first: np.isnan(None) raises TypeError.
                if feature is None or np.isnan(feature):
                    continue
                per_strand[strand].append(feature)

        fmap[kmer].extend(per_strand["template"])
        # Complement features are collected but currently not stored.
        # To store them, file them under the reverse complement of the kmer:
        #   kmer_c = str(DNA(kmer).reverse_complement())
        #   fmap[kmer_c].extend(per_strand["complement"])
    return fmap

### mean pA

In [9]:
def feature_pa(t, ev):
    """Look up the "<t>.mean" entry of event `ev`, where `t` is the strand name."""
    key = "{}.mean".format(t)
    return ev[key]

# kmer -> list of "<strand>.mean" values of its events, as gathered by
# make_feature_map (which keeps the template-strand values).
pa_map = make_feature_map(feature_pa, ev_map)

In [10]:
# sorted_boxplot(pa_map)

### mean stdev

In [11]:
def feature_stdv(t, ev):
    """Look up the "<t>.stdv" entry of event `ev`, where `t` is the strand name."""
    key = "{}.stdv".format(t)
    return ev[key]

# kmer -> list of "<strand>.stdv" values of its events, as gathered by
# make_feature_map (which keeps the template-strand values).
stdv_map = make_feature_map(feature_stdv, ev_map)

In [12]:
# sorted_boxplot(stdv_map)

### mean length

In [13]:
def length_feature(t, ev):
    """
    Return the length of event `ev` on strand `t`: "<t>.end" minus "<t>.start".

    Returns NaN when the two values cannot be subtracted (for example when
    one of them is None). A missing "<t>.start"/"<t>.end" key still raises
    KeyError, because the lookup happens outside the try block.
    """
    start, end = ev["{0}.start".format(t)], ev["{0}.end".format(t)]
    try:
        return end - start
    except TypeError:
        # NaN (not None) is the missing-value marker: results are filtered
        # with np.isnan, which rejects None with a TypeError.
        return float("nan")

# kmer -> list of event lengths ("<strand>.end" - "<strand>.start"), as
# gathered by make_feature_map (which keeps the template-strand values).
len_map = make_feature_map(length_feature, ev_map)

In [14]:
# mean([mean(x) for x in len_map.values()])

In [15]:
# sorted_boxplot(len_map, ylim=(0,600))

## Outputs for the HMM

In [16]:
# One row per k-mer: mean and standard deviation of the collected event means
# ("level_*") and of the collected event stdvs ("sd_*"), plus a constant weight.
model_rows = [
    {
        "kmer": kmer,
        "level_mean": mean(pa_map[kmer]),
        "level_stdv": std(pa_map[kmer]),
        "sd_mean": mean(stdv_map[kmer]),
        "sd_stdv": std(stdv_map[kmer]),
        "weight": 1.
    }
    for kmer in all_kmers
]
model = pandas.DataFrame(model_rows)



In [17]:
# Mapping model path -> model table; this is the object that gets pickled.
# NOTE(review): the key looks like the path of the model file this table is
# meant to stand in for -- confirm against the consumer of the pickle.
models = {
    "/opt/chimaera/model/r7.3_e6_70bps_6mer/template_median68pA.model": model,
}

In [18]:
# Write the models to args["model_out"]. The context manager guarantees the
# file is flushed and closed once the dump is complete.
# protocol=2 is kept as-is; presumably so that a Python 2 consumer can read
# the file -- TODO confirm.
with open(args["model_out"], 'wb') as model_out_file:
    pickle.dump(models, model_out_file, protocol=2)