In [33]:
import requests, json
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import tqdm
import os
import pickle
import pandas as pd

In [2]:
def construct_url():
    """Build the ENCODE search URL used to list the experiments of interest.

    The query string asks the ``/search/`` endpoint for ``Experiment``
    objects with status ``released``, ``perturbed=false``, assay title
    ``TF ChIP-seq``, organism ``Homo sapiens``, paired-ended runs, assembly
    ``GRCh38`` and a read length of 100 or 101, returned as JSON with no
    result limit.

    Returns
    -------
    str
        The fully assembled search URL.
    """
    endpoint = "https://www.encodeproject.org/search/"
    query_terms = [
        "type=Experiment",
        # presumably excludes control experiments -- confirm against ENCODE docs
        "control_type!=*",
        "status=released",
        "perturbed=false",
        "assay_title=TF+ChIP-seq",
        "replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens",
        "files.run_type=paired-ended",
        "assembly=GRCh38",
        "files.read_length=100",
        "files.read_length=101",
        # Response format and paging options.
        "format=json",
        "limit=all",
    ]
    return endpoint + "?" + "&".join(query_terms)

In [3]:
# Ping the ENCODE API and return data in JSON format
def fetch(url, timeout=300):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection blocks forever, which would freeze a loop that calls
        this function once per experiment. The default is deliberately
        generous so that slow, large search responses are not cut off.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the response body.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    ValueError
        If the response body is not valid JSON (raised by ``response.json()``).

    Notes
    -----
    The HTTP status code is not inspected; the caller receives the JSON body
    the server sent regardless of status.
    """
    headers = {'accept': 'application/json'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.json()

In [53]:
def get_exp_data(exp):
    """Collect one metadata row per raw-reads FASTQ file of an experiment.

    The experiment record is downloaded with ``fetch`` from
    ``https://www.encodeproject.org/experiments/<exp>/?format=json``.

    Parameters
    ----------
    exp : str
        Experiment accession, interpolated into the request URL.

    Returns
    -------
    list of list
        One row for every entry of ``files`` whose ``file_format`` is
        ``"fastq"`` and whose ``output_type`` is ``"reads"``. Each row is
        ``[exp, file accession, target label, biosample term name,
        comma-joined biosample synonyms, first biological replicate,
        first technical replicate, paired_end, read_length, cloud URL,
        comma-joined control accessions]``.

    Raises
    ------
    KeyError
        If the experiment record, or a qualifying file record, lacks one of
        the fields read below.
    IndexError
        If a qualifying file has an empty replicate list.
    """
    record = fetch("https://www.encodeproject.org/experiments/" + exp + "/?format=json")

    ontology = record["biosample_ontology"]
    biosample = ontology["term_name"]
    synonyms = ",".join(ontology["synonyms"])
    target = record["target"]["label"]

    # Control / input experiments: identical for every file of this
    # experiment, so the joined string is built once, outside the loop.
    controls = ",".join(ctrl["accession"] for ctrl in record["possible_controls"])

    rows = []
    for file_info in record["files"]:
        # Entries without an accession (or that are not mappings at all) are
        # skipped on purpose. Only those two expected failures are caught so
        # that unrelated errors and Ctrl-C are not silently swallowed.
        try:
            acc = file_info["accession"]
        except (KeyError, TypeError):
            continue

        # 'output_type' is only looked up for FASTQ files (short-circuit).
        if file_info["file_format"] != "fastq" or file_info["output_type"] != "reads":
            continue

        biorep = file_info["biological_replicates"][0]
        techrep = file_info["technical_replicates"][0]
        file_url = file_info["cloud_metadata"]["url"]
        read = file_info["paired_end"]
        read_length = file_info["read_length"]
        rows.append([exp, acc, target, biosample, synonyms, biorep, techrep,
                     read, read_length, file_url, controls])
    return rows

In [54]:
# Run the experiment search once and keep the decoded JSON response.
search_url = construct_url()
exp_data = fetch(search_url)

In [55]:
# Accession of every experiment listed under '@graph' in the search response.
search_hits = exp_data["@graph"]
experiments = [hit["accession"] for hit in search_hits]

In [56]:
# Report how many experiments the search returned.
n_experiments = len(experiments)
print("There are {} experiments".format(n_experiments))

There are 1059 experiments


In [60]:
# Fetch per-file metadata for a preview subset only: one request is made per
# experiment, so the full list is slow to crawl. The limit is explicit (rather
# than a `break` buried in the loop) and the progress bar total now matches the
# work actually done. 11 keeps index 10 available for the preview cell below.
# Use `experiments` instead of the slice to process everything.
PREVIEW_LIMIT = 11

metadata = []
for exp_accession in tqdm.tqdm(experiments[:PREVIEW_LIMIT]):
    metadata.append(get_exp_data(exp_accession))

  1%|▉                                                                                                 | 10/1059 [00:06<11:13,  1.56it/s]


In [73]:
# Preview the per-file rows collected for the experiment at index 10.
# Column order mirrors the row layout produced by get_exp_data.
column_names = [
    "exp",
    "acc",
    "target",
    "biosample",
    "synonyms",
    "biorep",
    "techrep",
    "read",
    "read_length",
    "url",
    "controls",
]
pd.DataFrame(metadata[10], columns=column_names)

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,url,controls
0,ENCSR036THH,ENCFF461AXC,BACH2,SK-N-SH,SKNSH,1,1_1,1,100,https://encode-public.s3.amazonaws.com/2020/09...,"ENCSR456LEZ,ENCSR072QAK"
1,ENCSR036THH,ENCFF299DAG,BACH2,SK-N-SH,SKNSH,2,2_1,1,100,https://encode-public.s3.amazonaws.com/2020/09...,"ENCSR456LEZ,ENCSR072QAK"
2,ENCSR036THH,ENCFF161VMO,BACH2,SK-N-SH,SKNSH,1,1_1,2,100,https://encode-public.s3.amazonaws.com/2020/09...,"ENCSR456LEZ,ENCSR072QAK"
3,ENCSR036THH,ENCFF520BKX,BACH2,SK-N-SH,SKNSH,2,2_1,2,100,https://encode-public.s3.amazonaws.com/2020/09...,"ENCSR456LEZ,ENCSR072QAK"
