In [1]:
import cPickle
import glob
import os
import subprocess
import tempfile

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
from scipy.linalg import svd
#import seaborn as sns

#import fampy as fpy
import cardipspy as cpy

%matplotlib inline

In [2]:
# Both result trees live under the same root and share the analysis name; only
# the middle path component ('output' vs. 'private_output') differs.
_jaccard_root = '/home/paola/Family1070/private_output/Jaccard'
_analysis_name = 'jaccard_analysis_25_state'

outdir = os.path.join(_jaccard_root, 'output', _analysis_name)
private_outdir = os.path.join(_jaccard_root, 'private_output', _analysis_name)

cpy.makedir(outdir)
cpy.makedir(private_outdir)

In [3]:
# Roadmap 25 state data.

# Per-epigenome 25-state segmentation BED files, in sorted order.
roadmap_beds = sorted(cpy.roadmap_25_state_beds)

# State annotation table, indexed by its first column.
state_annot = pd.read_table(cpy.roadmap_25_state_annotation, index_col=0)

# The epigenome ID legend sits in the same directory as the state annotation.
# It has no header row; the first column becomes the index and the remainder
# is squeezed into a Series.
fn = os.path.join(os.path.dirname(cpy.roadmap_25_state_annotation),
                  'EIDlegend.txt')
roadmap_ids = pd.read_table(fn, index_col=0, header=None, squeeze=True)

In [4]:
# This defines a subset of lines for plotting.
# Each entry pairs an epigenome ID with its display label so the mapping can be
# checked at a glance. Labels are reproduced exactly, including trailing
# spaces.
# NOTE(review): the Roadmap metadata, as far as I recall, lists E003 as H1
# cells (H9 is E008) -- confirm that the 'H9 hESC' label is intended.
_subset_pairs = [
    ('E003', 'H9 hESC'),
    ('E011', 'hESC-Endoderm'),
    ('E012', 'hESC-Ectoderm'),
    ('E013', 'hESC-Mesoderm'),
    ('E020', 'hiPSC'),
    ('E032', 'Blood B cells'),
    ('E034', 'Blood T cells'),
    ('E055', 'Fibroblasts  '),
    ('E063', 'Adipocytes'),
    ('E065', 'Aorta'),
    ('E066', 'Liver'),
    ('E071', 'Hippocampus '),
    ('E083', 'Fetal Heart'),
    ('E082', 'Fetal Brain'),
    ('E096', 'Lung'),
    ('E104', 'Right Atrium'),
    ('E105', 'Right Ventricle'),
    ('E108', 'Skeletal Muscle'),
    ('E116', 'Lymphoblastoids'),
    ('E117', 'HeLa cells'),
]
lines, names = map(list, zip(*_subset_pairs))

# Series of epigenome IDs keyed by display label.
subset = pd.Series(lines, index=names)

In [5]:
def jaccard_similarity(peaks_bed, hmm_bed, outdir):
    """Write per-state bedtools jaccard statistics for one segmentation file.

    For every state name in the global ``state_annot.name``, the intervals of
    ``hmm_bed`` carrying that name are written to a temporary BED file and
    compared against ``peaks_bed`` with ``bedtools jaccard``. The header and
    value lines printed by bedtools become one row per state in
    ``<outdir>/<eid>_jac.tsv``, where ``eid`` is the first four characters of
    the ``hmm_bed`` file name.

    Parameters
    ----------
    peaks_bed : pybedtools.BedTool
        Peak set to compare; only its ``fn`` path is used.
    hmm_bed : str
        Path to a chromatin state segmentation BED file.
    outdir : str
        Directory that receives the ``<eid>_jac.tsv`` table.

    Raises
    ------
    subprocess.CalledProcessError
        If bedtools exits with a non-zero status.

    Notes
    -----
    This function is pushed to ipyparallel engines, so it relies only on names
    expected in the engine namespace (``os``, ``subprocess``, ``tempfile``,
    ``pd``, ``pbt``, ``state_annot``).
    """
    eid = os.path.split(hmm_bed)[1][0:4]
    out = os.path.join(outdir, '{}_jac.tsv'.format(eid))
    bt = pbt.BedTool(hmm_bed)
    bt_jacs = []
    for state in state_annot.name:
        # Only a unique path is needed; close the handle right away so file
        # descriptors do not accumulate across states.
        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.close()
        try:
            # The filter is consumed by saveas() within this iteration, so the
            # lambda always sees the current value of `state`.
            bt.filter(lambda x: x.name == state).saveas(tf.name)
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact. universal_newlines makes the output
            # text rather than bytes.
            res = subprocess.check_output(
                ['/frazer01/software/bedtools-2.25.0/bin/bedtools', 'jaccard',
                 '-a', peaks_bed.fn, '-b', tf.name],
                universal_newlines=True)
        finally:
            # Remove the temporary file even when filtering or bedtools fails.
            os.remove(tf.name)
        # Exactly two lines are expected: column names, then their values.
        names, values = [x.split('\t') for x in res.strip().split('\n')]
        bt_jacs.append(pd.Series(values, index=names).astype(float))
    pd.DataFrame(bt_jacs, index=state_annot.name).to_csv(out, sep='\t')

In [6]:
from ipyparallel import Client

# Connect to the running ipyparallel cluster using the default profile, then
# take a DirectView spanning every engine that is currently registered.
parallel_client = Client()
dview = parallel_client[:]
print('Cluster has {} engines.'.format(len(parallel_client.ids)))

Cluster has 20 engines.


In [7]:
# Import on the engines every module that jaccard_similarity() needs, since it
# runs there. The modules are bound under their full names; the short aliases
# are set up separately with %px.
with dview.sync_imports():
    import glob
    import os
    import subprocess
    import tempfile
    import pandas
    import cardipspy
    import pybedtools

importing glob on engine(s)
importing os on engine(s)
importing subprocess on engine(s)
importing tempfile on engine(s)
importing pandas on engine(s)
importing cardipspy on engine(s)
importing pybedtools on engine(s)


In [8]:
# Give the engines the same short aliases (pd, cpy, pbt) that this notebook
# uses, so functions pushed from here resolve those names remotely.
%px pd = pandas
%px cpy = cardipspy
%px pbt = pybedtools

In [9]:
# Every engine gets the state annotation table and the worker function ...
dview.push({'state_annot': state_annot,
            'jaccard_similarity': jaccard_similarity})
# ... and its own slice of the segmentation files, stored under the same name
# `roadmap_beds` in each engine's namespace.
dview.scatter('roadmap_beds', roadmap_beds)

<AsyncResult: scatter>

In [10]:
def make_jaccard_table(dy):
    """Combine individual jaccard results into a table.

    Reads every ``E*_jac.tsv`` file in ``dy`` and takes its ``jaccard``
    column, using the file's first column as the index.

    Parameters
    ----------
    dy : str
        Directory containing the per-epigenome ``E*_jac.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        One row per file, labelled with the first four characters of the file
        name and sorted by that label; one column per index entry of the
        input files.
    """
    fns = glob.glob(os.path.join(dy, 'E*_jac.tsv'))
    res = [pd.read_table(fn, index_col=0)['jaccard'] for fn in fns]
    eids = [os.path.split(fn)[1][0:4] for fn in fns]
    # glob returns files in arbitrary order, so sort the rows by label.
    # sort_index() replaces the removed DataFrame.ix indexer.
    return pd.DataFrame(res, index=eids).sort_index()

In [11]:
def do_analysis(name, bed):
    """Run the jaccard analysis of one peak set against all segmentations.

    The peak BED file is sorted and sent to the engines, where each engine
    runs ``jaccard_similarity`` on its ``roadmap_beds`` entries and writes
    per-epigenome tables into ``<outdir>/<name>``. The tables are then merged
    with ``make_jaccard_table`` and written as ``jaccard.tsv``. A column-wise
    standardised copy (mean subtracted, divided by the standard deviation) is
    written as ``jaccard_z_score.tsv``.

    Parameters
    ----------
    name : str
        Name of the sub-directory of the global ``outdir`` that receives the
        results.
    bed : str
        Path to the peak BED file.

    Notes
    -----
    Relies on the notebook globals ``outdir`` and ``dview``, and on the engines
    already holding ``jaccard_similarity`` and ``roadmap_beds``.
    """
    bt = pbt.BedTool(bed)
    bt = bt.sort()

    toutdir = os.path.join(outdir, name)
    cpy.makedir(toutdir)

    # The engine-side code below reads `bt` and `outdir` from each engine's
    # namespace. Block so both are in place before the work starts.
    dview.push(dict(bt=bt, outdir=toutdir), block=True)

    # Same code that `%px` would send, issued through the API so this function
    # is plain Python. block=True ensures every engine has finished (and any
    # remote error has been raised) before the result files are read.
    dview.execute(
        '[jaccard_similarity(bt, x, outdir) for x in roadmap_beds]',
        block=True)

    res = make_jaccard_table(toutdir)
    res.to_csv(os.path.join(toutdir, 'jaccard.tsv'), sep='\t')
    # Standardise each column.
    res = res - res.mean()
    res = res / res.std()
    res.to_csv(os.path.join(toutdir, 'jaccard_z_score.tsv'), sep='\t')

In [12]:
# NKX25 peak set: load the collapsed peak calls, sort them, and hand the sorted
# BedTool to every engine under the name `bt`.
name = 'NKX25'
bt = pbt.BedTool(
    '/home/paola/Family1070/private_output/PeakCalling/NKX25/meta_macs2_callPeak_peaks.q001.narrowPeak.collapse.bed'
).sort()
dview.push({'bt': bt})

# Results for this peak set go into a sub-directory named after it.
toutdir = os.path.join(outdir, name)

In [13]:
cpy.makedir(toutdir)
# On the engines, `outdir` is this peak set's result directory.
dview.push({'outdir': toutdir})

# Compute the per-epigenome tables on the engines. `bt` and `outdir` inside the
# lambda resolve in each engine's namespace; map_sync waits for completion.
res = dview.map_sync(lambda bed: jaccard_similarity(bt, bed, outdir),
                     roadmap_beds)

# Merge the per-epigenome tables and save the raw jaccard values.
res = make_jaccard_table(toutdir)
res.to_csv(os.path.join(toutdir, 'jaccard.tsv'), sep='\t')

# Standardise each column (subtract its mean, then divide by its standard
# deviation) and save the z-scores.
res = res.sub(res.mean())
res = res.div(res.std())
res.to_csv(os.path.join(toutdir, 'jaccard_z_score.tsv'), sep='\t')