In [75]:
import requests, json
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import tqdm
import os
import pickle
import pandas as pd

In [106]:
def construct_url():
    # url = "https://www.encodeproject.org/search/?type=Experiment&control_type!=*&status=released&perturbed=false&assay_title=TF+ChIP-seq&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&files.run_type=paired-ended&assembly=GRCh38&files.read_length=100&files.read_length=101"
    #url = "https://www.encodeproject.org/search/?type=Experiment&control_type!=*&status=released&perturbed=false&assay_title=TF+ChIP-seq&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assembly=GRCh38"
    # CTCF
    url = "https://www.encodeproject.org/search/?type=Experiment&control_type!=*&assay_term_name=ChIP-seq&status=released&assay_title=TF+ChIP-seq&target.label=CTCF&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens"
    url += "&format=json&limit=all"
    return(url)

In [107]:
# Ping the ENCODE API and return data in JSON format
def fetch(url):
  headers = {'accept': 'application/json'}
  response = requests.get(url, headers=headers)
  data = response.json()
  return(data)

In [173]:
def get_exp_data(exp):
    url = "https://www.encodeproject.org/experiments/"+exp+"/?format=json"
    data = fetch(url)
    toReturn = []
    
    assay = data["assay_term_name"]
    try:
        biosample = data["biosample_ontology"]["term_name"]
        synonyms = ",".join(data["biosample_ontology"]["synonyms"])
    except:
        biosample = ""
        synonyms = ""
    try:
        target = data["target"]["label"]
    except:
        target = "Control"

    if target == "Control":
        controls = [data['control_type']]
    else:
        # get control / input experiment
        controls = []
        for ctrl in data["possible_controls"]:
            controls.append(ctrl["accession"])
        
    for x in data["files"]:
        try:
            acc = x["accession"]
        except:
            continue
        file_format = x["file_format"]
        if file_format == "fastq":
            output_type = x['output_type']
            if output_type == "reads":
                biorep = x['biological_replicates'][0]
                techrep = x["technical_replicates"][0]
                try:
                    url = x['cloud_metadata']['url']
                except:
                    continue
                read_length = x["read_length"]
                md5sum = x['md5sum']
                run_type = x['run_type']
                read_count = x['read_count']
                if run_type == "paired-ended":
                    read = x["paired_end"]
                else:
                    read = "1"

                status = x["status"]
                if status == "released":
                    toReturn.append([exp, acc, target, biosample, synonyms, biorep, techrep, read, read_length, run_type, read_count, url, md5sum, ",".join(controls)])
    return(toReturn)

In [131]:
exp_data = fetch(construct_url())

In [132]:
experiments = [x["accession"] for x in exp_data['@graph']]

In [133]:
print("There are {} experiments".format(len(experiments)))

There are 457 experiments


In [174]:
get_exp_data("ENCSR822CEA")

[['ENCSR822CEA',
  'ENCFF002EHI',
  'CTCF',
  'neural cell',
  '',
  2,
  '2_1',
  '2',
  100,
  'paired-ended',
  39365107,
  'https://encode-public.s3.amazonaws.com/2014/07/04/0099b724-157b-489f-bbb9-19ba18ed6fcd/ENCFF002EHI.fastq.gz',
  'a860b1f41197d39c9c1040dde4847de9',
  'ENCSR634PYN'],
 ['ENCSR822CEA',
  'ENCFF002EHK',
  'CTCF',
  'neural cell',
  '',
  2,
  '2_1',
  '1',
  100,
  'paired-ended',
  39365107,
  'https://encode-public.s3.amazonaws.com/2014/07/04/215128d7-eec6-4ae2-ae99-4e78ff26ae6a/ENCFF002EHK.fastq.gz',
  '8324487c52162841b4309b3b605fe979',
  'ENCSR634PYN'],
 ['ENCSR822CEA',
  'ENCFF002EHL',
  'CTCF',
  'neural cell',
  '',
  1,
  '1_1',
  '1',
  100,
  'paired-ended',
  39946046,
  'https://encode-public.s3.amazonaws.com/2014/07/04/3eabb9e1-b041-4d7e-8905-a21c0728ab88/ENCFF002EHL.fastq.gz',
  '5a1e600633ce756b1a5fb2a621d568fb',
  'ENCSR634PYN'],
 ['ENCSR822CEA',
  'ENCFF002EHM',
  'CTCF',
  'neural cell',
  '',
  1,
  '1_1',
  '2',
  100,
  'paired-ended',
  399

In [175]:
metadata = []
for i in tqdm.trange(len(experiments)):
    metadata.append(get_exp_data(experiments[i]))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 457/457 [04:40<00:00,  1.63it/s]


In [176]:
def flatten(xss):
    return [x for xs in xss for x in xs]
    
df = pd.DataFrame(flatten(metadata), columns=["exp", "acc", "target", "biosample", "synonyms", "biorep", "techrep", "read", "read_length", "run_type", "read_count", "url", "md5sum", "controls"])

In [177]:
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls
0,ENCSR724YTA,ENCFF821OKY,CTCF,dorsolateral prefrontal cortex,,1,1_1,1,76,single-ended,5853546,https://encode-public.s3.amazonaws.com/2021/01...,bcbc7cfc4c05866d1316b11930298a74,ENCSR739QEI
1,ENCSR724YTA,ENCFF401WPF,CTCF,dorsolateral prefrontal cortex,,1,1_1,1,76,single-ended,5862012,https://encode-public.s3.amazonaws.com/2021/01...,f17049f0f4ba3cc05ab52658cad5d882,ENCSR739QEI
2,ENCSR724YTA,ENCFF305JSV,CTCF,dorsolateral prefrontal cortex,,1,1_1,1,76,single-ended,6025522,https://encode-public.s3.amazonaws.com/2021/01...,aa76f7e858e268a037193f0384ea9ced,ENCSR739QEI
3,ENCSR724YTA,ENCFF283GPJ,CTCF,dorsolateral prefrontal cortex,,1,1_1,1,76,single-ended,5868826,https://encode-public.s3.amazonaws.com/2021/01...,d024064500e402793bbe7447400752d4,ENCSR739QEI
4,ENCSR724YTA,ENCFF701YZU,CTCF,dorsolateral prefrontal cortex,,1,1_2,1,76,single-ended,4911514,https://encode-public.s3.amazonaws.com/2021/01...,c3ab8b90213e263c46dc149f63fd3ce0,ENCSR739QEI


In [178]:
df.to_csv("Metadata.txt", sep="\t", header=True, index=False)

In [181]:
!cat Metadata.txt |  head -n16

exp	acc	target	biosample	synonyms	biorep	techrep	read	read_length	run_type	read_count	url	md5sum	controls
ENCSR724YTA	ENCFF821OKY	CTCF	dorsolateral prefrontal cortex		1	1_1	1	76	single-ended	5853546	https://encode-public.s3.amazonaws.com/2021/01/28/d3d24b8a-afc7-4c2a-9def-928e1305f309/ENCFF821OKY.fastq.gz	bcbc7cfc4c05866d1316b11930298a74	ENCSR739QEI
ENCSR724YTA	ENCFF401WPF	CTCF	dorsolateral prefrontal cortex		1	1_1	1	76	single-ended	5862012	https://encode-public.s3.amazonaws.com/2021/01/28/05e53134-52d0-4ce6-b2d5-8cc41819d666/ENCFF401WPF.fastq.gz	f17049f0f4ba3cc05ab52658cad5d882	ENCSR739QEI
ENCSR724YTA	ENCFF305JSV	CTCF	dorsolateral prefrontal cortex		1	1_1	1	76	single-ended	6025522	https://encode-public.s3.amazonaws.com/2021/01/28/1d75b0a3-a567-4751-96ee-4392a85276bd/ENCFF305JSV.fastq.gz	aa76f7e858e268a037193f0384ea9ced	ENCSR739QEI
ENCSR724YTA	ENCFF283GPJ	CTCF	dorsolateral prefrontal cortex		1	1_1	1	76	single-ended	5868826	https://encode-public.s3.amazonaws.com/2021/01/28/f159f59c-a583

In [180]:
experiments_pe = df[df["run_type"] == "paired-ended"].exp.unique().tolist()
print(len(experiments_pe))

105


In [168]:
for exp in experiments_pe:
    print(exp, len(df[df["exp"] == exp]))

ENCSR987GXT 4
ENCSR240PRQ 3
ENCSR430YRJ 3
ENCSR243INX 4
ENCSR634OAQ 4
ENCSR066BZZ 4
ENCSR541AMF 4
ENCSR829HTO 2
ENCSR776AEL 2
ENCSR265PFQ 2
ENCSR756KRS 4
ENCSR822CEA 6
ENCSR450FRI 2
ENCSR559KAB 2
ENCSR225OKX 2
ENCSR071XWO 2
ENCSR770IWO 2
ENCSR960MDF 2
ENCSR430TEE 2
ENCSR206ETG 4
ENCSR484DDO 2
ENCSR582MTM 2
ENCSR146BGM 4
ENCSR003SZZ 2
ENCSR558HTE 2
ENCSR661NXJ 2
ENCSR720USO 2
ENCSR482PMN 2
ENCSR549WAU 2
ENCSR038FOS 2
ENCSR773JBP 2
ENCSR494TNM 2
ENCSR203QEB 4
ENCSR493APD 2
ENCSR822PJT 2
ENCSR727HME 2
ENCSR857PBV 4
ENCSR515LRI 4
ENCSR102CSD 2
ENCSR692ILH 2
ENCSR234HEM 2
ENCSR252XWG 2
ENCSR668BTN 4
ENCSR028YEV 4
ENCSR812FNU 4
ENCSR304XUZ 2
ENCSR460LGH 8
ENCSR817HTJ 4
ENCSR450BLH 2
ENCSR964BKO 2
ENCSR222SQE 2
ENCSR469POZ 2
ENCSR173AIR 4
ENCSR005LPI 2
ENCSR391ZKN 2
ENCSR548DDS 2
ENCSR655ECZ 2
ENCSR434XLP 2
ENCSR699BEK 2
ENCSR697YIN 2
ENCSR998NQG 2
ENCSR791AYW 2
ENCSR606TNN 2
ENCSR175FLL 2
ENCSR955BIB 2
ENCSR972LYL 2
ENCSR419ANE 2
ENCSR847XGE 4
ENCSR911GFJ 2
ENCSR408XTO 2
ENCSR492ZIW 2
ENCSR0

In [20]:
control_experiments = list(set(flatten([_.split(",") for _ in df.controls.tolist()])))

In [22]:
len(control_experiments)

442

In [23]:
control_metadata = []
for i in tqdm.trange(len(control_experiments)):
    control_metadata.append(get_exp_data(control_experiments[i]))

 63%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 280/442 [02:08<01:14,  2.18it/s]


TypeError: list indices must be integers or slices, not str

In [28]:
# get_exp_data(control_experiments[i])
test = get_exp_data(control_experiments[i])

KeyError: 'control_type'