In [187]:
# Ping the ENCODE API and return data in JSON format
def fetch(url, timeout=60):
  """Send a GET request for JSON to `url` and return the decoded response body.

  Parameters
  ----------
  url : str
      Address to request.
  timeout : float or tuple, optional
      Forwarded to `requests.get`; how long to wait on the server before
      giving up. Defaults to 60 seconds.

  Returns
  -------
  The object produced by `response.json()`.

  Raises
  ------
  requests.exceptions.RequestException
      If the request fails or times out.
  ValueError
      If the response body cannot be decoded as JSON.

  Notes
  -----
  The HTTP status code is not inspected; whatever body comes back is decoded.
  """
  headers = {'accept': 'application/json'}
  # Without an explicit timeout `requests` waits indefinitely on a stalled
  # connection, which would freeze a long crawl on a single bad request.
  response = requests.get(url, headers=headers, timeout=timeout)
  data = response.json()
  return(data)

In [188]:
def get_exp_data(exp):
    """Collect one metadata row per released FASTQ reads file of an experiment.

    Parameters
    ----------
    exp : str
        Experiment accession; it is spliced into the experiment URL.

    Returns
    -------
    list of list
        One row per qualifying file, ordered as
        [exp, acc, target, biosample, synonyms, biorep, techrep, read,
        read_length, run_type, read_count, url, md5sum, controls].
        `target` is "Control" when the record has no target label. In that
        case `controls` is the record's `control_type`; otherwise it is the
        comma-joined accessions listed under `possible_controls`.

    Raises
    ------
    KeyError
        If the record lacks a required experiment-level key, or a released
        FASTQ reads file lacks one of the per-file keys read below.
    """
    data = fetch("https://www.encodeproject.org/experiments/" + exp + "/?format=json")

    # The assay name itself is not used, but a response without it is rejected
    # up front rather than failing later on some other key.
    if "assay_term_name" not in data:
        raise KeyError("assay_term_name")

    # Biosample name and synonyms are looked up independently so that a
    # missing synonym list does not also blank out the biosample name.
    ontology = data.get("biosample_ontology")
    if not isinstance(ontology, dict):
        ontology = {}
    biosample = ontology.get("term_name", "")
    synonyms = ",".join(ontology.get("synonyms") or [])

    try:
        target = data["target"]["label"]
    except (KeyError, TypeError):
        # No usable target label: the record is treated as a control.
        target = "Control"

    if target == "Control":
        controls = [data["control_type"]]
    else:
        # get control / input experiment
        controls = [ctrl["accession"] for ctrl in data["possible_controls"]]
    controls_field = ",".join(controls)

    rows = []
    for entry in data["files"]:
        try:
            acc = entry["accession"]
        except (KeyError, TypeError):
            # Entries without an accession (or not given as objects) are skipped.
            continue
        if entry["file_format"] != "fastq" or entry["output_type"] != "reads":
            continue
        # Filter on status before touching the remaining fields, so that an
        # unreleased file with incomplete metadata cannot abort the experiment.
        if entry.get("status") != "released":
            continue

        biorep = entry["biological_replicates"][0]
        techrep = entry["technical_replicates"][0]
        try:
            file_url = entry["cloud_metadata"]["url"]
        except (KeyError, TypeError):
            # Files without a download URL are skipped.
            continue
        read_length = entry["read_length"]
        md5sum = entry["md5sum"]
        run_type = entry["run_type"]
        read_count = entry["read_count"]
        # Single-ended runs have no mate number; they are reported as read "1".
        read = entry["paired_end"] if run_type == "paired-ended" else "1"

        rows.append([exp, acc, target, biosample, synonyms, biorep, techrep, read,
                     read_length, run_type, read_count, file_url, md5sum, controls_field])
    return rows

In [189]:
# Fetch the listing whose "@graph" entries are read in the next cell.
# NOTE(review): construct_url is not defined in the cells shown here — confirm it is defined earlier in the notebook.
exp_data = fetch(construct_url())

In [190]:
# Keep only the accession of every entry listed under "@graph".
experiments = [hit["accession"] for hit in exp_data["@graph"]]

In [191]:
# Report how many experiment accessions were collected.
print("There are {} experiments".format(len(experiments)))

There are 3204 experiments


In [205]:
# get_exp_data("ENCSR822CEA")

In [193]:
# Crawl every experiment in turn; each element of `metadata` is the list of
# file rows returned for one experiment.
metadata = []
for exp_accession in tqdm.tqdm(experiments):
    metadata.append(get_exp_data(exp_accession))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3204/3204 [34:31<00:00,  1.55it/s]


In [194]:
def flatten(xss):
    """Concatenate the items of every inner iterable of `xss` into one flat list."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat
    
# One row per file; the column order mirrors the row layout built by get_exp_data.
df = pd.DataFrame(
    data=flatten(metadata),
    columns=[
        "exp", "acc", "target", "biosample", "synonyms",
        "biorep", "techrep", "read", "read_length", "run_type",
        "read_count", "url", "md5sum", "controls",
    ],
)

In [204]:
# Preview the assembled experiment table.
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls
0,ENCSR959XNY,ENCFF655BEL,MED1,HepG2,"HepG2 cell,Hep-G2",2,2_1,1,75,single-ended,40474263,https://encode-public.s3.amazonaws.com/2018/06...,596f29273326f236d21e9dcfbc7bde44,ENCSR020QUJ
1,ENCSR959XNY,ENCFF920MZO,MED1,HepG2,"HepG2 cell,Hep-G2",1,1_1,1,75,single-ended,38046079,https://encode-public.s3.amazonaws.com/2018/06...,872871049ed39779348841c516e36311,ENCSR020QUJ
2,ENCSR577YCO,ENCFF075LHW,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32210747,https://encode-public.s3.amazonaws.com/2022/01...,a42e339580b9832afb0357dabda8b648,ENCSR239QGH
3,ENCSR577YCO,ENCFF504PYF,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32231636,https://encode-public.s3.amazonaws.com/2022/01...,dd750faa1dec4db3bb1f27a83674df2c,ENCSR239QGH
4,ENCSR577YCO,ENCFF574DSC,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",2,2_2,1,100,single-ended,18968586,https://encode-public.s3.amazonaws.com/2022/01...,8f94a92a30e42ad4ee2d32a3b8b3cb78,ENCSR239QGH


In [196]:
# Save the experiment rows as a tab-separated table.
# NOTE: a later cell rewrites this same file with experiment and control rows combined.
df.to_csv("Metadata.txt", sep="\t", header=True, index=False)

In [197]:
# Line count of the exported table (the header line is included in the count).
!cat Metadata.txt |  wc -l

9852


In [206]:
# Unique control entries referenced by any experiment row. Sorting makes the
# crawl order reproducible between runs, and empty tokens (a row whose
# `controls` string is empty splits to "") are dropped so they do not trigger
# a pointless request.
control_experiments = sorted(
    {
        accession
        for controls in df["controls"]
        for accession in controls.split(",")
        if accession
    }
)

In [207]:
# Number of distinct control entries to crawl.
len(control_experiments)

1071

In [211]:
# Crawl every control experiment. One failing accession must not abort the
# crawl, but the failure is recorded instead of being silently discarded.
control_metadata = []
control_failures = []  # (accession, error) pairs for entries that could not be processed
for accession in tqdm.tqdm(control_experiments):
    try:
        control_metadata.append(get_exp_data(accession))
    except Exception as err:  # deliberately broad (best effort); KeyboardInterrupt still propagates
        control_failures.append((accession, repr(err)))
if control_failures:
    print("{} control experiments could not be processed".format(len(control_failures)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1071/1071 [08:28<00:00,  2.11it/s]


In [226]:
# Same layout as the experiment table: one row per file of a control experiment.
control_df = pd.DataFrame(
    data=flatten(control_metadata),
    columns=[
        "exp", "acc", "target", "biosample", "synonyms",
        "biorep", "techrep", "read", "read_length", "run_type",
        "read_count", "url", "md5sum", "controls",
    ],
)

In [227]:
# Preview the control table.
control_df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls
0,ENCSR501CEJ,ENCFF480KWL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,1,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,2ed14357716c9c6984e253a203232b3f,input library
1,ENCSR501CEJ,ENCFF440JUL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,2,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,833b82da601d4bd440d036c355f46317,input library
2,ENCSR631ESZ,ENCFF144AGB,Control,foreskin keratinocyte,,1,1_1,1,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,12205cf8fe66fc7df336a89f014cba33,input library
3,ENCSR631ESZ,ENCFF699DOM,Control,foreskin keratinocyte,,1,1_1,2,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,522b0254a97c18e2878d4f253359919c,input library
4,ENCSR631ESZ,ENCFF314CVE,Control,foreskin keratinocyte,,2,2_1,1,101,paired-ended,55163711,https://encode-public.s3.amazonaws.com/2016/12...,946aa908c62e8c131f5ae7160c134001,input library


In [229]:
# Experiment accessions of the control rows, split by the control type stored
# in `controls`. There is one entry per file row, so accessions can repeat.
input_controls = control_df.loc[control_df["controls"] == "input library", "exp"].tolist()
wild_type_controls = control_df.loc[control_df["controls"] == "wild type", "exp"].tolist()

In [230]:
def get_matching_control(possible_controls):
    """Choose one control accession from a comma-separated candidate string.

    The first candidate found in `input_controls` wins; failing that, the
    first candidate found in `wild_type_controls`. Returns None when no
    candidate appears in either collection.
    """
    candidates = possible_controls.split(",")

    # Input-library controls take precedence over wild-type ones.
    chosen = next((acc for acc in candidates if acc in input_controls), None)
    if chosen is not None:
        return chosen

    chosen = next((acc for acc in candidates if acc in wild_type_controls), None)
    if chosen is not None:
        return chosen

    return None

In [231]:
# Look at the experiment rows again; their `controls` column holds the candidate controls.
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls
0,ENCSR959XNY,ENCFF655BEL,MED1,HepG2,"HepG2 cell,Hep-G2",2,2_1,1,75,single-ended,40474263,https://encode-public.s3.amazonaws.com/2018/06...,596f29273326f236d21e9dcfbc7bde44,ENCSR020QUJ
1,ENCSR959XNY,ENCFF920MZO,MED1,HepG2,"HepG2 cell,Hep-G2",1,1_1,1,75,single-ended,38046079,https://encode-public.s3.amazonaws.com/2018/06...,872871049ed39779348841c516e36311,ENCSR020QUJ
2,ENCSR577YCO,ENCFF075LHW,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32210747,https://encode-public.s3.amazonaws.com/2022/01...,a42e339580b9832afb0357dabda8b648,ENCSR239QGH
3,ENCSR577YCO,ENCFF504PYF,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32231636,https://encode-public.s3.amazonaws.com/2022/01...,dd750faa1dec4db3bb1f27a83674df2c,ENCSR239QGH
4,ENCSR577YCO,ENCFF574DSC,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",2,2_2,1,100,single-ended,18968586,https://encode-public.s3.amazonaws.com/2022/01...,8f94a92a30e42ad4ee2d32a3b8b3cb78,ENCSR239QGH


In [232]:
# Spot check with a single accession taken from the `controls` column.
get_matching_control("ENCSR020QUJ")

'ENCSR020QUJ'

In [233]:
# Pick one control per row from its comma-separated candidate list.
df["matching_control"] = df["controls"].map(get_matching_control)

In [234]:
# Confirm the new `matching_control` column.
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls,matching_control
0,ENCSR959XNY,ENCFF655BEL,MED1,HepG2,"HepG2 cell,Hep-G2",2,2_1,1,75,single-ended,40474263,https://encode-public.s3.amazonaws.com/2018/06...,596f29273326f236d21e9dcfbc7bde44,ENCSR020QUJ,ENCSR020QUJ
1,ENCSR959XNY,ENCFF920MZO,MED1,HepG2,"HepG2 cell,Hep-G2",1,1_1,1,75,single-ended,38046079,https://encode-public.s3.amazonaws.com/2018/06...,872871049ed39779348841c516e36311,ENCSR020QUJ,ENCSR020QUJ
2,ENCSR577YCO,ENCFF075LHW,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32210747,https://encode-public.s3.amazonaws.com/2022/01...,a42e339580b9832afb0357dabda8b648,ENCSR239QGH,ENCSR239QGH
3,ENCSR577YCO,ENCFF504PYF,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32231636,https://encode-public.s3.amazonaws.com/2022/01...,dd750faa1dec4db3bb1f27a83674df2c,ENCSR239QGH,ENCSR239QGH
4,ENCSR577YCO,ENCFF574DSC,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",2,2_2,1,100,single-ended,18968586,https://encode-public.s3.amazonaws.com/2022/01...,8f94a92a30e42ad4ee2d32a3b8b3cb78,ENCSR239QGH,ENCSR239QGH


In [235]:
# Rows for which no matching control was found. An element-wise `== None`
# comparison never flags missing values in pandas, so it always produced an
# empty frame; `.isna()` is the supported missing-value test.
df[df["matching_control"].isna()]

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls,matching_control


In [238]:
# Display the rows whose read length is at least 50.
# The result is only shown, not assigned, so `df` itself stays unfiltered.
df[df["read_length"] >= 50]

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls,matching_control
0,ENCSR959XNY,ENCFF655BEL,MED1,HepG2,"HepG2 cell,Hep-G2",2,2_1,1,75,single-ended,40474263,https://encode-public.s3.amazonaws.com/2018/06...,596f29273326f236d21e9dcfbc7bde44,ENCSR020QUJ,ENCSR020QUJ
1,ENCSR959XNY,ENCFF920MZO,MED1,HepG2,"HepG2 cell,Hep-G2",1,1_1,1,75,single-ended,38046079,https://encode-public.s3.amazonaws.com/2018/06...,872871049ed39779348841c516e36311,ENCSR020QUJ,ENCSR020QUJ
2,ENCSR577YCO,ENCFF075LHW,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32210747,https://encode-public.s3.amazonaws.com/2022/01...,a42e339580b9832afb0357dabda8b648,ENCSR239QGH,ENCSR239QGH
3,ENCSR577YCO,ENCFF504PYF,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32231636,https://encode-public.s3.amazonaws.com/2022/01...,dd750faa1dec4db3bb1f27a83674df2c,ENCSR239QGH,ENCSR239QGH
4,ENCSR577YCO,ENCFF574DSC,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",2,2_2,1,100,single-ended,18968586,https://encode-public.s3.amazonaws.com/2022/01...,8f94a92a30e42ad4ee2d32a3b8b3cb78,ENCSR239QGH,ENCSR239QGH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9846,ENCSR236YGF,ENCFF730HKC,CTCF,transverse colon,colon transversum,1,1_2,1,76,single-ended,17309794,https://encode-public.s3.amazonaws.com/2017/12...,df2c6191487927c534c9c4b1fddce94e,ENCSR828XQV,ENCSR828XQV
9847,ENCSR775FSH,ENCFF533YAK,ZNF280B,K562,"K562 cell,K-562,K-562 cell",1,1_1,1,100,paired-ended,36277628,https://encode-public.s3.amazonaws.com/2020/05...,c3e71c1f0a52db61fb452109c1508279,"ENCSR502CWG,ENCSR275WKO",ENCSR502CWG
9848,ENCSR775FSH,ENCFF587EBW,ZNF280B,K562,"K562 cell,K-562,K-562 cell",2,2_1,2,100,paired-ended,40551479,https://encode-public.s3.amazonaws.com/2020/05...,00bd2b99ec66cd2238a3cbb67d304a6c,"ENCSR502CWG,ENCSR275WKO",ENCSR502CWG
9849,ENCSR775FSH,ENCFF084KUN,ZNF280B,K562,"K562 cell,K-562,K-562 cell",1,1_1,2,100,paired-ended,36277628,https://encode-public.s3.amazonaws.com/2020/05...,dc457e356a75e080d38ae79397999c9c,"ENCSR502CWG,ENCSR275WKO",ENCSR502CWG


In [240]:
# Drop the raw candidate list now that `matching_control` holds the chosen control.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and the drop would otherwise raise KeyError.
df = df.drop(columns=["controls"], errors="ignore")

In [242]:
# Expose the chosen control under the column name "control".
df = df.rename(columns={"matching_control" : "control"})

In [243]:
# Confirm the resulting column layout.
df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,control
0,ENCSR959XNY,ENCFF655BEL,MED1,HepG2,"HepG2 cell,Hep-G2",2,2_1,1,75,single-ended,40474263,https://encode-public.s3.amazonaws.com/2018/06...,596f29273326f236d21e9dcfbc7bde44,ENCSR020QUJ
1,ENCSR959XNY,ENCFF920MZO,MED1,HepG2,"HepG2 cell,Hep-G2",1,1_1,1,75,single-ended,38046079,https://encode-public.s3.amazonaws.com/2018/06...,872871049ed39779348841c516e36311,ENCSR020QUJ
2,ENCSR577YCO,ENCFF075LHW,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32210747,https://encode-public.s3.amazonaws.com/2022/01...,a42e339580b9832afb0357dabda8b648,ENCSR239QGH
3,ENCSR577YCO,ENCFF504PYF,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",1,1_2,1,100,single-ended,32231636,https://encode-public.s3.amazonaws.com/2022/01...,dd750faa1dec4db3bb1f27a83674df2c,ENCSR239QGH
4,ENCSR577YCO,ENCFF574DSC,ARHGAP35,HepG2,"HepG2 cell,Hep-G2",2,2_2,1,100,single-ended,18968586,https://encode-public.s3.amazonaws.com/2022/01...,8f94a92a30e42ad4ee2d32a3b8b3cb78,ENCSR239QGH


In [244]:
# Look at the control table; its last column is still named `controls`.
control_df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,controls
0,ENCSR501CEJ,ENCFF480KWL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,1,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,2ed14357716c9c6984e253a203232b3f,input library
1,ENCSR501CEJ,ENCFF440JUL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,2,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,833b82da601d4bd440d036c355f46317,input library
2,ENCSR631ESZ,ENCFF144AGB,Control,foreskin keratinocyte,,1,1_1,1,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,12205cf8fe66fc7df336a89f014cba33,input library
3,ENCSR631ESZ,ENCFF699DOM,Control,foreskin keratinocyte,,1,1_1,2,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,522b0254a97c18e2878d4f253359919c,input library
4,ENCSR631ESZ,ENCFF314CVE,Control,foreskin keratinocyte,,2,2_1,1,101,paired-ended,55163711,https://encode-public.s3.amazonaws.com/2016/12...,946aa908c62e8c131f5ae7160c134001,input library


In [249]:
# Use the same column name as `df` ("control").
# For control rows this column holds the control type (e.g. "input library"), not an accession.
control_df = control_df.rename(columns = {"controls" : "control"})

In [250]:
# Confirm the renamed column.
control_df.head()

Unnamed: 0,exp,acc,target,biosample,synonyms,biorep,techrep,read,read_length,run_type,read_count,url,md5sum,control
0,ENCSR501CEJ,ENCFF480KWL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,1,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,2ed14357716c9c6984e253a203232b3f,input library
1,ENCSR501CEJ,ENCFF440JUL,Control,K562,"K562 cell,K-562,K-562 cell",1,1_1,2,100,paired-ended,40527474,https://encode-public.s3.amazonaws.com/2019/01...,833b82da601d4bd440d036c355f46317,input library
2,ENCSR631ESZ,ENCFF144AGB,Control,foreskin keratinocyte,,1,1_1,1,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,12205cf8fe66fc7df336a89f014cba33,input library
3,ENCSR631ESZ,ENCFF699DOM,Control,foreskin keratinocyte,,1,1_1,2,101,paired-ended,53545193,https://encode-public.s3.amazonaws.com/2016/12...,522b0254a97c18e2878d4f253359919c,input library
4,ENCSR631ESZ,ENCFF314CVE,Control,foreskin keratinocyte,,2,2_1,1,101,paired-ended,55163711,https://encode-public.s3.amazonaws.com/2016/12...,946aa908c62e8c131f5ae7160c134001,input library


In [253]:
# Write experiment and control rows together as one tab-separated table.
# This overwrites the Metadata.txt written earlier from `df` alone.
pd.concat([df, control_df]).to_csv("Metadata.txt", sep="\t", header=True, index=False)

In [254]:
# Show the first lines of the combined export.
!cat Metadata.txt | head

exp	acc	target	biosample	synonyms	biorep	techrep	read	read_length	run_type	read_count	url	md5sum	control
ENCSR959XNY	ENCFF655BEL	MED1	HepG2	HepG2 cell,Hep-G2	2	2_1	1	75	single-ended	40474263	https://encode-public.s3.amazonaws.com/2018/06/06/cfcbb186-586a-4aed-bf7f-03bdce6da3e8/ENCFF655BEL.fastq.gz	596f29273326f236d21e9dcfbc7bde44	ENCSR020QUJ
ENCSR959XNY	ENCFF920MZO	MED1	HepG2	HepG2 cell,Hep-G2	1	1_1	1	75	single-ended	38046079	https://encode-public.s3.amazonaws.com/2018/06/06/f0bfba19-75e2-4e99-8788-67c583ec86c9/ENCFF920MZO.fastq.gz	872871049ed39779348841c516e36311	ENCSR020QUJ
ENCSR577YCO	ENCFF075LHW	ARHGAP35	HepG2	HepG2 cell,Hep-G2	1	1_2	1	100	single-ended	32210747	https://encode-public.s3.amazonaws.com/2022/01/27/e698fa0d-06b9-4f07-8719-753ed23b6180/ENCFF075LHW.fastq.gz	a42e339580b9832afb0357dabda8b648	ENCSR239QGH
ENCSR577YCO	ENCFF504PYF	ARHGAP35	HepG2	HepG2 cell,Hep-G2	1	1_2	1	100	single-ended	32231636	https://encode-public.s3.amazonaws.com/2022/01/27/548c070e-0fef-4230-9d71-a3a814f9

In [255]:
# Number of distinct experiment accessions across both frames (shown as a shape tuple).
pd.concat([df, control_df]).exp.unique().shape

(4269,)

In [263]:
# All distinct targets, then the subset whose name contains "ZNF".
TFs = df["target"].unique().tolist()
ZNF_TFs = df.loc[df["target"].str.contains("ZNF"), "target"].unique().tolist()
print("There are {} ZNF TFs".format(len(ZNF_TFs)))
# ZNF targets plus three explicitly named factors.
TFs_of_interest = [*ZNF_TFs, "CTCF", "REST", "MAX"]

There are 306 ZNF TFs


In [264]:
# Distinct read lengths present among the experiment files.
df['read_length'].unique()

array([ 75, 100,  76, 101,  51, 151,  36,  50,  32,  40,  30,  34,  35,
        27,  28,  37,  25,  33,  29,  74])

In [None]:
# Total read count per experiment accession, as {accession: reads}.
# `metadata` is the raw list of per-experiment row lists and has no
# `.groupby`, so the tabular frames are aggregated instead.
# Each file row contributes its own read_count, so paired-end runs add one
# value per mate file.
# NOTE(review): experiment and control rows are both included so either kind
# of accession can be looked up — confirm this is the intended scope.
readcounts = pd.concat([df, control_df]).groupby("exp")["read_count"].sum().to_dict()