In [33]:
import requests, json
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import tqdm
import os
import pickle
import pandas as pd

In [34]:
def construct_url():
    url = "https://www.encodeproject.org/search/?type=Experiment&control_type!=*&status=released&perturbed=false&assay_title=TF+ChIP-seq&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&files.run_type=paired-ended&assembly=GRCh38&files.read_length=100&files.read_length=101"
    url += "&format=json&limit=all"
    return(url)

In [35]:
# Ping the ENCODE API and return data in JSON format
def fetch(url):
  headers = {'accept': 'application/json'}
  response = requests.get(url, headers=headers)
  data = response.json()
  return(data)

In [50]:
def get_exp_data(exp):
    url = "https://www.encodeproject.org/experiments/"+exp+"/?format=json"
    data = fetch(url)
    toReturn = []
    
    assay = data["assay_term_name"]
    try:
        biosample = data["biosample_ontology"]["term_name"]
        synonyms = ",".join(data["biosample_ontology"]["synonyms"])
    except:
        biosample = ""
        synonyms = ""
    try:
        target = data["target"]["label"]
    except:
        target = "Control"

    if target == "Control":
        controls = [data['control_type']]
    else:
        # get control / input experiment
        controls = []
        for ctrl in data["possible_controls"]:
            controls.append(ctrl["accession"])
        
    for x in data["files"]:
        try:
            acc = x["accession"]
        except:
            continue
        file_format = x["file_format"]
        if file_format == "fastq":
            output_type = x['output_type']
            if output_type == "reads":
                biorep = x['biological_replicates'][0]
                techrep = x["technical_replicates"][0]
                url = x['cloud_metadata']['url']
                read = x["paired_end"]
                read_length = (x["read_length"])
                md5sum = x['md5sum']
                toReturn.append([exp, acc, target, biosample, synonyms, biorep, techrep, read, read_length, url, md5sum, ",".join(controls)])
    return(toReturn)

In [37]:
exp_data = fetch(construct_url())

In [58]:
experiments = [x["accession"] for x in exp_data['@graph']]

In [59]:
print("There are {} experiments".format(len(experiments)))

There are 1059 experiments


In [60]:
metadata = []
for i in tqdm.trange(len(experiments)):
    metadata.append(get_exp_data(experiments[i]))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1059/1059 [11:37<00:00,  1.52it/s]


In [61]:
def flatten(xss):
    return [x for xs in xss for x in xs]
    
df = pd.DataFrame(flatten(metadata), columns=["exp", "acc", "target", "biosample", "synonyms", "biorep", "techrep", "read", "read_length", "url", "md5sum", "controls"])

In [62]:
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,url,md5sum,controls
0,ENCSR897MYK,ENCFF002EME,SREBF1,A549,"A-549,A549 cell",2,2_1,1,100,https://encode-public.s3.amazonaws.com/2014/07...,0f55f096e4b439f274bc897110991d5e,"ENCSR610YRT,ENCSR871XML"
1,ENCSR897MYK,ENCFF002EMH,SREBF1,A549,"A-549,A549 cell",2,2_1,2,100,https://encode-public.s3.amazonaws.com/2014/07...,7fc471b2bf42d45125e07262994c5334,"ENCSR610YRT,ENCSR871XML"
2,ENCSR897MYK,ENCFF002EMI,SREBF1,A549,"A-549,A549 cell",3,3_1,1,100,https://encode-public.s3.amazonaws.com/2014/07...,65d1138ebd666e754b59418f065d46ee,"ENCSR610YRT,ENCSR871XML"
3,ENCSR897MYK,ENCFF002EMK,SREBF1,A549,"A-549,A549 cell",3,3_1,2,100,https://encode-public.s3.amazonaws.com/2014/07...,acae3e7be31703203b1bf6187e5fb193,"ENCSR610YRT,ENCSR871XML"
4,ENCSR077TKJ,ENCFF450YDH,ZNF707,K562,"K562 cell,K-562,K-562 cell",1,1_1,1,100,https://encode-public.s3.amazonaws.com/2021/09...,44f61fc5a212bf4fcc2f8930a787e4fe,ENCSR328FYU


In [63]:
df.to_csv("Metadata.txt", sep="\t", header=True, index=False)

In [64]:
!cat Metadata.txt | head

exp	acc	target	biosample	synonyms	biorep	techrep	read	read_length	url	md5sum	controls
ENCSR897MYK	ENCFF002EME	SREBF1	A549	A-549,A549 cell	2	2_1	1	100	https://encode-public.s3.amazonaws.com/2014/07/06/ad73704c-43ef-4e73-95a6-5eb2763d65a6/ENCFF002EME.fastq.gz	0f55f096e4b439f274bc897110991d5e	ENCSR610YRT,ENCSR871XML
ENCSR897MYK	ENCFF002EMH	SREBF1	A549	A-549,A549 cell	2	2_1	2	100	https://encode-public.s3.amazonaws.com/2014/07/06/b915f24c-ecf8-43eb-92c5-675079ee0e89/ENCFF002EMH.fastq.gz	7fc471b2bf42d45125e07262994c5334	ENCSR610YRT,ENCSR871XML
ENCSR897MYK	ENCFF002EMI	SREBF1	A549	A-549,A549 cell	3	3_1	1	100	https://encode-public.s3.amazonaws.com/2014/07/06/bfe0cd37-ca2e-4d31-8f2d-1ed9e4bab64c/ENCFF002EMI.fastq.gz	65d1138ebd666e754b59418f065d46ee	ENCSR610YRT,ENCSR871XML
ENCSR897MYK	ENCFF002EMK	SREBF1	A549	A-549,A549 cell	3	3_1	2	100	https://encode-public.s3.amazonaws.com/2014/07/06/acf4e8e9-1589-43b6-a1f0-8388e9f6887f/ENCFF002EMK.fastq.gz	acae3e7be31703203b1bf6187e5fb193	ENCSR610YRT,ENCSR871XM

In [20]:
control_experiments = list(set(flatten([_.split(",") for _ in df.controls.tolist()])))

In [22]:
len(control_experiments)

442

In [23]:
control_metadata = []
for i in tqdm.trange(len(control_experiments)):
    control_metadata.append(get_exp_data(control_experiments[i]))

 63%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                         | 280/442 [02:08<01:14,  2.18it/s]


TypeError: list indices must be integers or slices, not str

In [28]:
# get_exp_data(control_experiments[i])
test = get_exp_data(control_experiments[i])

KeyError: 'control_type'