In [1]:
import sys
# Prepend local development checkouts to the module search path.
# Entries are listed highest-priority first; this is the same final order
# that results from inserting each one at position 0 in turn.
sys.path[:0] = [
    '/net/proteome/home/nezar/local/devel/bbifile/pykent/',
    '/net/proteome/home/nezar/local/devel/tools/',
    '/net/proteome/home/nezar/local/devel/tools/genomebrowser/',
    '/net/proteome/home/nezar/local/devel/tools/functional/lib/',
]
%matplotlib inline

In [5]:
import bioframe

In [12]:
from collections import OrderedDict

import matplotlib as mpl
import seaborn as sns; sns.set_style('white')
import matplotlib.pyplot as plt
import numpy as np
import pandas
import h5py
import requests

from bioframe.tools import bedtools, tsv
import bioframe
# import piling
# from kent import bbi

# Look up chromosome sizes for the hg19 assembly through bioframe.
# NOTE(review): `chromsizes` is not referenced by any later cell visible in
# this notebook -- confirm whether it is still needed.
chromsizes = bioframe.fetch_chromsizes('hg19')

In [13]:
def get_peak_files(db, centername, factor, prefix, suffix, timeout=60):
    """Map short dataset names to download URLs for matching ENCODE files.

    Downloads ``files.txt`` from
    ``http://hgdownload.cse.ucsc.edu/goldenPath/<db>/encodeDCC/<centername>``
    and keeps every listed file whose name starts with ``prefix``, ends with
    ``suffix`` and contains ``factor``.

    Parameters
    ----------
    db : str
        Assembly directory on the download server, e.g. ``'hg19'``.
    centername : str
        Sub-directory of ``encodeDCC`` holding the files and ``files.txt``.
    factor : str
        Substring that must occur in the file name (case-sensitive).
    prefix : str
        Required start of the file name; removed from the front of the key.
    suffix : str
        Required end of the file name; removed from the back of the key.
        It is stripped verbatim, so include the leading ``'.'`` if the keys
        should not end with a dot.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled server cannot hang
        the notebook indefinitely.

    Returns
    -------
    collections.OrderedDict
        ``{short_name: url}`` in the order the files appear in ``files.txt``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status for ``files.txt``.
    """
    base_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/' + db + '/encodeDCC/' + centername
    response = requests.get(base_url + '/files.txt', timeout=timeout)
    # Fail loudly instead of parsing an HTML error page into an empty result.
    response.raise_for_status()

    peak_urls = OrderedDict()
    for line in response.text.splitlines():
        # The file name is the first tab-separated field of each listing row.
        filename = line.split('\t')[0]
        if not (filename.startswith(prefix)
                and filename.endswith(suffix)
                and factor in filename):
            continue
        # Trim prefix/suffix only at the ends; str.replace would also delete
        # any interior occurrence of either string.
        name = filename[len(prefix):len(filename) - len(suffix)]
        peak_urls[name] = base_url + '/' + filename
    return peak_urls

# Name -> URL table of every matching narrowPeak file in the
# wgEncodeAwgTfbsUniform directory for hg19.
# NOTE(review): the suffix has no leading '.', so every label keeps a
# trailing '.' (e.g. 'BroadDnd41CtcfUniPk.') -- confirm this is intended.
urls = pandas.Series(
    get_peak_files(
        db='hg19',
        centername='wgEncodeAwgTfbsUniform',
        factor='Ctcf',
        prefix='wgEncodeAwgTfbs',
        suffix='narrowPeak.gz',
    )
)

In [14]:
# Display the label -> URL Series (one row per matching peak file).
urls

BroadDnd41CtcfUniPk.                        http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadGm12878CtcfUniPk.                      http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadH1hescCtcfUniPk.                       http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHelas3CtcfUniPk.                       http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHepg2CtcfUniPk.                        http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHmecCtcfUniPk.                         http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHsmmCtcfUniPk.                         http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHsmmtCtcfUniPk.                        http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadHuvecCtcfUniPk.                        http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadK562CtcfUniPk.                         http://hgdownload.cse.ucsc.edu/goldenPath/hg19...
BroadNhaCtcfUniPk.                          http://hgdownloa

In [15]:
# Read every peak file into its own table, in the same order as `urls`.
datasets = [
    bioframe.read_table(peak_url, schema='narrowPeak')
    for peak_url in urls.values
]

# Replace each table's `name` column with the short dataset label so that
# rows stay attributable to their source after the tables are combined.
for name, dataset in zip(urls.index, datasets):
    dataset['name'] = name

In [24]:
# Stack all per-dataset tables row-wise into one frame with a fresh
# 0..N-1 index (row-wise is the default concat axis).
combined = pandas.concat(datasets, ignore_index=True)

In [18]:
# Total number of peak rows across all concatenated datasets.
len(combined)

4662435

In [26]:
# Order the peaks by chromosome name, then by start coordinate.
# NOTE(review): presumably required because the clustering step expects
# position-sorted input -- confirm against the bedtools documentation.
combined = combined.sort_values(by=['chrom', 'start'])
combined.head(20)

Unnamed: 0,chrom,start,end,name,score,strand,fc,-log10p,-log10q,relSummit
3443041,chr1,16110,16390,UwHcpeCtcfUniPk.,170,.,26.70888,-1,5.069213,140
977073,chr1,16118,16362,HaibK562CtcfcPcr1xUniPk.,110,.,17.292793,-1,4.964811,122
2212315,chr1,16128,16358,UtaMcf7CtcfVehUniPk.,200,.,31.357125,-1,2.570631,115
2011602,chr1,16149,16339,UtaMcf7CtcfSerumstimUniPk.,227,.,35.560401,-1,4.976226,95
1961618,chr1,16150,16350,UtaMcf7CtcfEstroUniPk.,209,.,32.742278,-1,1.854809,100
1890580,chr1,16155,16339,UtaK562CtcfUniPk.,171,.,26.766516,-1,5.104183,92
1752000,chr1,16165,16325,UtaHelas3CtcfUniPk.,139,.,21.832632,-1,4.550656,80
205350,chr1,91156,91580,BroadHelas3CtcfUniPk.,183,.,28.712648,-1,5.033589,212
3753955,chr1,91187,91563,UwHffmycCtcfUniPk.,145,.,22.782451,-1,4.408393,188
3191393,chr1,91198,91548,UwHacCtcfUniPk.,115,.,18.03993,-1,4.114358,175


### Cluster the peaks

In [27]:
# Distance handed to the clustering call as `d`.
# NOTE(review): presumably the maximum gap between peaks (in bases) that
# still places them in the same cluster -- confirm against bedtools docs.
merge_dist = 5000

# `tsv(combined)` yields a file-like handle whose `.name` is passed as the
# clustering input; presumably a temporary TSV dump of the frame.
with tsv(combined) as peaks_file:
    out = bedtools.cluster(i=peaks_file.name, d=merge_dist)

# The result has the input columns followed by one extra column holding
# the cluster label.
out.columns = combined.columns.tolist() + ['cluster_id']

# Group peaks by cluster; `grouped` is reused later to build the summary.
grouped = out.groupby('cluster_id')

# Annotate every peak with the full extent of its cluster.
# Aggregations are named by string ('min' / 'max') rather than passed as
# NumPy callables: recent pandas releases emit a FutureWarning for
# np.min / np.max here, while the string names select the built-in
# groupby reductions and give the same result.
out['min_start'] = grouped['start'].transform('min')
out['max_end'] = grouped['end'].transform('max')


In [22]:
# Preview the clustered peaks: each row carries its cluster_id together
# with the cluster-wide min_start / max_end.
out.head(10)

Unnamed: 0,chrom,start,end,name,score,strand,fc,-log10p,-log10q,relSummit,cluster_id,min_start,max_end
0,chr1,16110,16390,UwHcpeCtcfUniPk.,170,.,26.70888,-1,5.069213,140,1,16110,16390
1,chr1,16118,16362,HaibK562CtcfcPcr1xUniPk.,110,.,17.292793,-1,4.964811,122,1,16110,16390
2,chr1,16128,16358,UtaMcf7CtcfVehUniPk.,200,.,31.357125,-1,2.570631,115,1,16110,16390
3,chr1,16149,16339,UtaMcf7CtcfSerumstimUniPk.,227,.,35.560401,-1,4.976226,95,1,16110,16390
4,chr1,16150,16350,UtaMcf7CtcfEstroUniPk.,209,.,32.742278,-1,1.854809,100,1,16110,16390
5,chr1,16155,16339,UtaK562CtcfUniPk.,171,.,26.766516,-1,5.104183,92,1,16110,16390
6,chr1,16165,16325,UtaHelas3CtcfUniPk.,139,.,21.832632,-1,4.550656,80,1,16110,16390
7,chr1,91156,91580,BroadHelas3CtcfUniPk.,183,.,28.712648,-1,5.033589,212,2,91156,91580
8,chr1,91187,91563,UwHffmycCtcfUniPk.,145,.,22.782451,-1,4.408393,188,2,91156,91580
9,chr1,91198,91548,UwHacCtcfUniPk.,115,.,18.03993,-1,4.114358,175,2,91156,91580


In [21]:
# One row per cluster: chromosome, overall extent, number of member peaks
# and summary statistics of the `fc` column. The table is re-indexed from
# zero and then ranked by mean `fc`, highest first.
summary = (
    pandas.DataFrame(
        {
            'chrom': grouped['chrom'].first(),
            'start': grouped['start'].min(),
            'end': grouped['end'].max(),
            'cluster_id': grouped['cluster_id'].first(),
            'cluster_size': grouped.size(),
            'mean_fc': grouped['fc'].mean(),
            'median_fc': grouped['fc'].median(),
            'min_fc': grouped['fc'].min(),
            'max_fc': grouped['fc'].max(),
        },
        # Pin the column order explicitly.
        columns=[
            'chrom', 'start', 'end',
            'cluster_id', 'cluster_size',
            'mean_fc', 'median_fc', 'min_fc', 'max_fc',
        ],
    )
    .reset_index(drop=True)
    .sort_values('mean_fc', ascending=False)
)
summary.head()

Unnamed: 0,chrom,start,end,cluster_id,cluster_size,mean_fc,median_fc,min_fc,max_fc
19786,chr12,58299051,58299575,19787,99,421.795083,429.485396,64.739853,657.644949
38062,chr17,78549087,78549594,38063,100,407.981867,409.537269,60.714637,625.438059
19627,chr12,54773353,54773923,19628,102,406.092712,420.840471,25.300614,674.760266
92652,chr9,130879847,130880377,92653,100,398.930859,413.943237,53.019034,617.837592
4397,chr1,114887365,114889566,4398,102,391.848698,422.145632,14.918692,688.923138


In [None]:
# Persist both tables as gzip-compressed, tab-separated text without the
# pandas index: first the per-peak table with cluster annotations, then
# the per-cluster summary.
out.to_csv(
    '/net/levsha/share/nezar/ctcf_sites/ctcf.hg19.allEncodeUniformNarrowPeak.clustered.txt.gz',
    sep='\t',
    index=False,
    compression='gzip',
)
summary.to_csv(
    '/net/levsha/share/nezar/ctcf_sites/ctcf.hg19.allEncodeUniformNarrowPeak.clusters.txt.gz',
    sep='\t',
    index=False,
    compression='gzip',
)

In [None]:
# Histogram of cluster widths (end minus start), 100 bins.
summary['end'].sub(summary['start']).hist(bins=100)

In [None]:
# Overlay the per-cluster mean and max `fc` distributions on the same axes;
# half transparency keeps both histograms visible.
summary['mean_fc'].hist(bins=100, alpha=0.5)
summary['max_fc'].hist(bins=100, alpha=0.5)

In [None]:
#!jupyter-nbconvert --to=html --stdout cluster-ctcfs-mm9.ipynb > /net/proteome/home/nezar/public_html/notebooks/cluster-ctcfs-mm9.html