In [1]:
import pathlib
import pandas as pd
import json

# ENCODE eCLIP data

[eCLIP protocol](https://www.encodeproject.org/documents/dde0b669-0909-4f8b-946d-3cb9f35a6c52/@@download/attachment/eCLIP_analysisSOP_v1.P.pdf)

In [2]:
# Base directory under which the ENCODE files used in this notebook are read
# (the metadata table is loaded from a sub-path of it).
encode_eclip_path = pathlib.Path('/Volumes/biodb/encode')

In [6]:
# Read the tab-separated metadata table that sits next to the peak files.
metadata_path = encode_eclip_path / 'encode_hg38_clip_peaks/metadata.tsv'
encode_metadata = pd.read_table(metadata_path)

In [8]:
# Preview the first rows of the metadata table to check which columns it has.
encode_metadata.head()

Unnamed: 0,File accession,File format,Output type,Experiment accession,Assay,Biosample term id,Biosample term name,Biosample type,Biosample life stage,Biosample sex,...,dbxrefs,File download URL,Assembly,Platform,Controlled by,File Status,Audit WARNING,Audit INTERNAL_ACTION,Audit NOT_COMPLIANT,Audit ERROR
0,ENCFF743GIA,bed narrowPeak,peaks,ENCSR867DSZ,eCLIP,EFO:0002067,K562,immortalized cell line,adult,female,...,,https://www.encodeproject.org/files/ENCFF743GI...,hg19,,,released,,,,
1,ENCFF392QAQ,bed narrowPeak,peaks,ENCSR867DSZ,eCLIP,EFO:0002067,K562,immortalized cell line,adult,female,...,,https://www.encodeproject.org/files/ENCFF392QA...,hg19,,,released,,,,
2,ENCFF177GMP,bed narrowPeak,peaks,ENCSR867DSZ,eCLIP,EFO:0002067,K562,immortalized cell line,adult,female,...,,https://www.encodeproject.org/files/ENCFF177GM...,GRCh38,,,released,,,,
3,ENCFF669BCV,bed narrowPeak,peaks,ENCSR867DSZ,eCLIP,EFO:0002067,K562,immortalized cell line,adult,female,...,,https://www.encodeproject.org/files/ENCFF669BC...,GRCh38,,,released,,,,
4,ENCFF774EDY,bed narrowPeak,peaks,ENCSR513NDD,eCLIP,EFO:0001187,HepG2,immortalized cell line,child,male,...,,https://www.encodeproject.org/files/ENCFF774ED...,hg19,,,released,,,,


In [9]:
# Count the .bed files present in the HepG2 "combined" directory.
hepg2_combined_dir = pathlib.Path('/Volumes/biodb/encode/encode_hg38_clip_peaks/HepG2/combined')
hepg2_bed_files = list(hepg2_combined_dir.glob('*.bed'))
len(hepg2_bed_files)

70

In [10]:
# Metadata skeleton written alongside every exported peak file. The export
# loop copies it and overwrites 'id' and 'summary' for each input file.
template = dict(
    description='encode eClip data.',
    experiment='eClip',
    id='eClip_all',
    methods='',
    references='Nostrand E, et al. bioRxiv 179648; doi: https://doi.org/10.1101/179648.',
    summary='all combined eClip sites from K562 and HepG2.',
)

In [22]:
from itertools import chain

In [23]:
!ls /Volumes/biodb/encode/encode_hg38_clip_peaks/

[34mHepG2[m[m        [34mK562[m[m         [31mmetadata.tsv[m[m


In [42]:
# Directories holding the combined peak files, one per cell line.
encode_HepG2_path = pathlib.Path('/Volumes/biodb/encode/encode_hg38_clip_peaks/HepG2/combined')
encode_K562_path = pathlib.Path('/Volumes/biodb/encode/encode_hg38_clip_peaks/K562/combined')
# Column labels assigned to the header-less peak files when they are read.
col_names = ["chrom","start","end","name","score","strand","signal_value","-log10(pValue)","-log10(qvalue)","peak"]

# Directory that receives one .bed and one .json file per input file.
dest = pathlib.Path('/Volumes/prj/dorina2/regulators/h_sapiens/hg38/')


def export_eclip_peaks(f, template, dest, col_names):
    """Convert one peak file into a 6-column BED file plus a JSON metadata file.

    Parameters
    ----------
    f : pathlib.Path
        Input peak file. Its stem is split on '_'; the first two fields are
        used as target and cell line in the summary text (the stem is expected
        to look like ``<target>_<cell line>_combined``).
    template : dict
        Metadata skeleton. It is copied, never modified.
    dest : pathlib.Path
        Existing directory the two output files are written to.
    col_names : list of str
        Column labels passed to ``pandas.read_table``.

    Returns
    -------
    (name, x, tmp_template)
        The extension-less output file name (``pathlib.Path``), the peak table
        as read and annotated, and the metadata dict that was written.

    Raises
    ------
    ValueError
        If the file stem has fewer than two '_'-separated fields. The check
        happens before the file is read.
    """
    stem = f.stem
    fields = stem.split('_')
    if len(fields) < 2:
        # Without an underscore, rfind() returns -1 and the slice below would
        # silently drop the last character of the stem instead of failing.
        raise ValueError(
            "cannot derive target and cell line from file name {!r}".format(f.name))

    tmp_template = template.copy()
    # Everything before the last '_' (i.e. without the trailing field) forms the id.
    tmp_template['id'] = 'eClip_' + stem[:stem.rfind('_')] + 'Encode'
    name = pathlib.Path('eClip_' + stem.replace('combined', '') + 'hg38')

    x = pd.read_table(f, names=col_names)
    x['.'] = '.'                       # constant placeholder for the 5th output column
    x['name'] = tmp_template['id']     # every row is labelled with the regulator id

    tmp_template['summary'] = "{} peaks for targets of {} in {} cells.".format(
        x.shape[0], fields[0], fields[1])

    # The extension is appended as text: Path.with_suffix() would replace
    # everything after the last '.' in the name, truncating names that
    # themselves contain a dot.
    x[["chrom", "start", "end", "name", ".", "strand"]].to_csv(
        dest / (name.name + '.bed'), index=False, header=False, sep="\t")

    with open(dest / (name.name + '.json'), 'w') as fout:
        # indent=1 yields the same layout the former indent=True produced.
        json.dump([tmp_template], fout, indent=1)

    return name, x, tmp_template


# `name`, `x` and `tmp_template` are rebound at notebook level on every
# iteration so that later cells can inspect the last processed file.
for f in chain(encode_HepG2_path.glob('*.bed'), encode_K562_path.glob('*.bed')):
    name, x, tmp_template = export_eclip_peaks(f, template, dest, col_names)

In [36]:
# Load an existing regulator metadata file to compare against the generated
# JSON. The handle and result get their own names so that this cell does not
# overwrite `f` or the peak table `x` produced by the export loop.
with open('/Volumes/prj/dorina2/regulators/m_musculus/mm10/HITSCLIP_Ago2Leung2011b_mm9.json') as fin:
    reference_metadata = json.load(fin)

In [41]:
# Show the metadata record left behind by the last iteration of the export loop.
tmp_template

{'description': 'encode eClip data.',
 'experiment': 'eClip',
 'id': 'eClip_IGF2BP2_K562Encode',
 'methods': '',
 'references': 'Nostrand E, et al. bioRxiv 179648; doi: https://doi.org/10.1101/179648.',
 'summary': '211519 peaks for targets of IGF2BP2 in K562 cells.'}

In [43]:
# Show the regulator id stored in the most recent metadata record.
tmp_template['id']

'eClip_IGF2BP2_K562Encode'

In [44]:
# Inspect the `name` column of `x`; the export loop sets this column to the
# regulator id for every row.
x['name']

0         eClip_IGF2BP2_K562Encode
1         eClip_IGF2BP2_K562Encode
2         eClip_IGF2BP2_K562Encode
3         eClip_IGF2BP2_K562Encode
4         eClip_IGF2BP2_K562Encode
5         eClip_IGF2BP2_K562Encode
6         eClip_IGF2BP2_K562Encode
7         eClip_IGF2BP2_K562Encode
8         eClip_IGF2BP2_K562Encode
9         eClip_IGF2BP2_K562Encode
10        eClip_IGF2BP2_K562Encode
11        eClip_IGF2BP2_K562Encode
12        eClip_IGF2BP2_K562Encode
13        eClip_IGF2BP2_K562Encode
14        eClip_IGF2BP2_K562Encode
15        eClip_IGF2BP2_K562Encode
16        eClip_IGF2BP2_K562Encode
17        eClip_IGF2BP2_K562Encode
18        eClip_IGF2BP2_K562Encode
19        eClip_IGF2BP2_K562Encode
20        eClip_IGF2BP2_K562Encode
21        eClip_IGF2BP2_K562Encode
22        eClip_IGF2BP2_K562Encode
23        eClip_IGF2BP2_K562Encode
24        eClip_IGF2BP2_K562Encode
25        eClip_IGF2BP2_K562Encode
26        eClip_IGF2BP2_K562Encode
27        eClip_IGF2BP2_K562Encode
28        eClip_IGF2