In [None]:
import requests
import polars as pl
import json
from box import Box
from IPython.display import display

In [None]:
def get_size_of_encode(url=None, timeout=(10, 300), session=None):
    """Count the experiments returned by an ENCODE search and the files they list.

    Args:
        url: Search URL to query. When omitted, the notebook's default search is
            used: released Homo sapiens K562 experiments on GRCh38 for the
            RNA-seq / RAMPAGE / CAGE / Bru-seq family of assays.
        timeout: ``(connect, read)`` timeout in seconds handed to ``get`` so a
            stalled connection cannot block the kernel indefinitely.
        session: Optional object exposing a ``requests``-style ``get`` method
            (for example a ``requests.Session``). The ``requests`` module
            itself is used when omitted.

    Returns:
        A ``(number_of_experiments, number_of_files)`` tuple: the length of the
        response's ``@graph`` list and the summed length of each entry's
        ``files`` list.

    Raises:
        requests.HTTPError: If the server answers with an error status other
            than 404.
    """
    if url is None:
        # url = 'https://www.encodeproject.org/search/?type=Experiment&format=json&limit=all'
        url = 'https://www.encodeproject.org/search/?type=Experiment&assay_title=total+RNA-seq&assay_title=long+read+RNA-seq&assay_title=RAMPAGE&assay_title=CAGE&assay_title=BruChase-seq&assay_title=PRO-cap&assay_title=BruUV-seq&assay_title=Bru-seq&biosample_ontology.term_name=K562&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assembly=GRCh38&status=released&limit=all&format=json'
    headers = {'accept': 'application/json'}

    http = requests if session is None else session
    response = http.get(url, headers=headers, timeout=timeout)

    # NOTE(review): ENCODE search appears to answer 404 with a regular JSON body
    # when nothing matches -- confirm against the REST API docs. That case keeps
    # its previous behaviour (counts of zero); every other error status raises
    # instead of being silently reported as an empty result.
    if response.status_code != 404:
        response.raise_for_status()

    data = response.json()
    experiment_graph = data.get('@graph', [])
    number_of_experiments = len(experiment_graph)
    # `or []` also covers an entry whose "files" value is explicitly null.
    number_of_files = sum(
        len(experiment.get('files') or []) for experiment in experiment_graph
    )

    return number_of_experiments, number_of_files

In [None]:
# get_size_of_encode() queries a filtered search (released K562 / GRCh38
# experiments for a fixed set of assays), so the counts describe that query's
# matches rather than all of ENCODE; the labels say so explicitly.
number_of_experiments, number_of_files = get_size_of_encode()
print(f"Number of ENCODE experiments matching the query: {number_of_experiments}")
print(f"Number of files listed by those experiments: {number_of_files}")

In [None]:
# Working !

def all_files_to_static_json(
    output_path='encode.json',
    url='https://www.encodeproject.org/files/?format=json&limit=all',
    timeout=(10, 600),
    session=None,
):
    """Download the ENCODE file listing and save its ``@graph`` list as JSON.

    Args:
        output_path: Destination of the JSON dump (default ``encode.json`` in
            the current working directory).
        url: Listing URL to fetch.
        timeout: ``(connect, read)`` timeout in seconds handed to ``get`` so a
            stalled connection cannot block the kernel indefinitely.
        session: Optional object exposing a ``requests``-style ``get`` method
            (for example a ``requests.Session``). The ``requests`` module
            itself is used when omitted.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            output file is not touched in that case.
    """
    headers = {'accept': 'application/json'}
    http = requests if session is None else session
    response = http.get(url, headers=headers, timeout=timeout)
    # Check the status before opening the output file: otherwise a JSON error
    # body (no "@graph" key) would overwrite an earlier download with "[]".
    response.raise_for_status()
    data = response.json()
    file_graph = data.get('@graph', [])
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(file_graph, f, indent=4)

    print(f'File graph saved to {output_path}')

In [None]:
# Download the ENCODE file listing and write it to encode.json in the current
# working directory (performs a network request; function defined above).
all_files_to_static_json()

In [None]:
def all_experiments_to_static_json(
    output_path='encode_experiments.json',
    url='https://www.encodeproject.org/experiments/?format=json&limit=all',
    timeout=(10, 600),
    session=None,
):
    """Download the ENCODE experiment listing and save its ``@graph`` list as JSON.

    NOTE(review): a later cell in this notebook loads
    ``'../encode_experiments.json'`` while the default here writes to the
    current working directory -- confirm which location is intended.

    Args:
        output_path: Destination of the JSON dump (default
            ``encode_experiments.json`` in the current working directory).
        url: Listing URL to fetch.
        timeout: ``(connect, read)`` timeout in seconds handed to ``get`` so a
            stalled connection cannot block the kernel indefinitely.
        session: Optional object exposing a ``requests``-style ``get`` method
            (for example a ``requests.Session``). The ``requests`` module
            itself is used when omitted.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            output file is not touched in that case.
    """
    headers = {'accept': 'application/json'}
    http = requests if session is None else session
    response = http.get(url, headers=headers, timeout=timeout)
    # Check the status before opening the output file: otherwise a JSON error
    # body (no "@graph" key) would overwrite an earlier download with "[]".
    response.raise_for_status()
    data = response.json()
    experiment_graph = data.get('@graph', [])
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(experiment_graph, f, indent=4)

    print(f'File graph saved to {output_path}')

In [None]:
# Download the ENCODE experiment listing and write it to
# encode_experiments.json in the current working directory (performs a network
# request; function defined above).
all_experiments_to_static_json()

In [None]:
# Explode encode_experiments.json by its `files` field to map each experiment's assay title onto the current JSON.
## `files` is a dictionary of file ids to file accessions; we only want the values, so convert it to a list of values and explode by that list.

In [None]:
import json
import polars as pl


def _stringify_nested(record):
    """Return a copy of ``record`` with container values serialised to JSON text.

    dict, list, tuple and set values are passed through ``json.dumps``; every
    other value is kept unchanged.
    """
    return {
        field: json.dumps(content) if isinstance(content, (dict, list, tuple, set)) else content
        for field, content in record.items()
    }


# Load the experiment records saved as JSON.
with open('../encode_experiments.json') as f:
    data = json.load(f)

# Flatten every record so nested structures are stored as JSON strings.
processed_data = [_stringify_nested(entry) for entry in data]

# Build the frame from the flattened records and persist it as parquet.
experiments_frame = pl.json_normalize(processed_data, max_level=0, strict=False)
experiments_frame.write_parquet('../encode_experiments.parquet')

In [None]:
# Read the experiment table back from parquet and render it.
experiments_parquet_path = "../encode_experiments.parquet"
encode_experiments = pl.read_parquet(experiments_parquet_path)
display(encode_experiments)

In [37]:
# 1. Parse the JSON text stored in `files` back into structured values.
decode_files_expr = pl.col("files").str.json_decode()
files_col_to_list = encode_experiments.with_columns(decode_files_expr)

# 2. One row per entry of each experiment's `files` list.
files_col_exploded = files_col_to_list.explode("files")

# 3. Pull the struct's field(s) out of `files` and store the result as "@id".
file_id_expr = pl.col("files").struct.field("*").alias("@id")
unpacked_files = files_col_exploded.with_columns(file_id_expr)

# 4. Keep the file id plus the experiment-level descriptive columns.
kept_columns = ["@id", "assay_title", "biosample_summary", "dbxrefs", "description"]
unpacked_files_shortened = unpacked_files.select(kept_columns)
display(unpacked_files_shortened)

@id,assay_title,biosample_summary,dbxrefs,description
str,str,str,str,str
"""/files/ENCFF210GNK/""","""intact Hi-C""","""Homo sapiens left colon tissue…","""[""GEO:GSE238046""]""","""w72 left colon intact dnase hi…"
"""/files/ENCFF819BWS/""","""intact Hi-C""","""Homo sapiens left colon tissue…","""[""GEO:GSE238046""]""","""w72 left colon intact dnase hi…"
"""/files/ENCFF112DDP/""","""intact Hi-C""","""Homo sapiens left colon tissue…","""[""GEO:GSE238046""]""","""w72 left colon intact dnase hi…"
"""/files/ENCFF268TQA/""","""intact Hi-C""","""Homo sapiens left colon tissue…","""[""GEO:GSE238046""]""","""w72 left colon intact dnase hi…"
"""/files/ENCFF312BUG/""","""intact Hi-C""","""Homo sapiens left colon tissue…","""[""GEO:GSE238046""]""","""w72 left colon intact dnase hi…"
…,…,…,…,…
"""/files/ENCFF536RFU/""","""DNase-seq""","""Homo sapiens GM23338 originate…","""[""SCREEN-GRCh38:GM23338_male_a…",""""""
"""/files/ENCFF823RDH/""","""DNase-seq""","""Homo sapiens GM23338 originate…","""[""SCREEN-GRCh38:GM23338_male_a…",""""""
"""/files/ENCFF766CUM/""","""DNase-seq""","""Homo sapiens GM23338 originate…","""[""SCREEN-GRCh38:GM23338_male_a…",""""""
"""/files/ENCFF553JGR/""","""DNase-seq""","""Homo sapiens GM23338 originate…","""[""SCREEN-GRCh38:GM23338_male_a…",""""""


In [38]:
# Read the per-file table from parquet and render it.
# NOTE(review): no cell visible in this notebook writes ../encode.parquet --
# presumably it was produced from encode.json in an earlier run; confirm.
files_parquet_path = "../encode.parquet"
encode = pl.read_parquet(files_parquet_path)
display(encode)

@id,@type,accession,assay_term_name,audit,award,biological_replicates,biosample_ontology,dataset,date_created,file_format,file_size,file_type,href,index_of,lab,output_category,output_type,quality_metrics,read_length,read_length_units,replicate,run_type,simple_biosample_summary,status,technical_replicates,title,analysis_step_version,assembly,derived_from,file_format_type,genome_annotation,step_run,origin_batches,paired_end,paired_with,target,mapped_read_length,mapped_run_type,preferred_default
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,bool
"""/files/ENCFF622ZWY/""","""[""File"", ""Item""]""","""ENCFF622ZWY""","""single-cell RNA sequencing ass…","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[91]""","""{""organ_slims"": [""musculature …","""/experiments/ENCSR667UMR/""","""2022-01-28T22:18:02.027723+00:…","""fastq""",372815521,"""fastq""","""/files/ENCFF622ZWY/@@download/…","""[""/files/ENCFF402HCO/""]""","""{""title"": ""Ali Mortazavi, UCI""…","""raw data""","""index reads""","""[]""",86,"""nt""","""{""library"": {""accession"": ""ENC…","""single-ended""","""male postnatal (10 days) strai…","""revoked""","""[""91_1""]""","""ENCFF622ZWY""",,,,,,,,,,,,,
"""/files/ENCFF540BYK/""","""[""File"", ""Item""]""","""ENCFF540BYK""","""PAS-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""brain""], ""te…","""/experiments/ENCSR432LIX/""","""2022-01-28T21:16:14.084277+00:…","""bed""",862868,"""bed bed3+""","""/files/ENCFF540BYK/@@download/…",,"""{""title"": ""Ali Mortazavi, UCI""…","""annotation""","""polyA sites""","""[]""",,,"""{""library"": {""accession"": ""ENC…",,"""adult (6 months) strain B6NCrl…","""released""","""[""1_1""]""","""ENCFF540BYK""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF222ZEY/"", ""/file…","""bed3+""","""M21""","""{""schema_version"": ""5"", ""alias…",,,,,,,
"""/files/ENCFF419EOR/""","""[""File"", ""Item""]""","""ENCFF419EOR""","""DNase-seq""",,"""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""kidney""], ""t…","""/experiments/ENCSR788KLG/""","""2022-02-09T07:23:59.093431+00:…","""bed""",773284,"""bed narrowPeak""","""/files/ENCFF419EOR/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks""","""[{""tenth_of_one_percent_narrow…",,,,,"""female postnatal (36 days) str…","""released""","""[""2_1""]""","""ENCFF419EOR""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF843RGJ/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""award…",,,,,,,
"""/files/ENCFF717ZXC/""","""[""File"", ""Item""]""","""ENCFF717ZXC""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR787WSK/""","""2022-02-10T21:33:07.978207+00:…","""fastq""",130776257,"""fastq""","""/files/ENCFF717ZXC/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",58,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF717ZXC""",,,,,,,"""[""/biosamples/ENCBS842OLG/""]""","""2""","""/files/ENCFF812OOD/""","""{""label"": ""H3K4me3""}""",,,
"""/files/ENCFF600XFR/""","""[""File"", ""Item""]""","""ENCFF600XFR""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR848XKY/""","""2022-02-10T22:04:26.201539+00:…","""fastq""",193816284,"""fastq""","""/files/ENCFF600XFR/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",76,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF600XFR""",,,,,,,"""[""/biosamples/ENCBS177ZJV/""]""","""1""","""/files/ENCFF354OUK/""",,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""/files/ENCFF762LDF/""","""[""File"", ""Item""]""","""ENCFF762LDF""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:06.874942+00:…","""bed""",4135581,"""bed narrowPeak""","""/files/ENCFF762LDF/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks and background as input …","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF762LDF""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF335UKS/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",,,
"""/files/ENCFF181YWG/""","""[""File"", ""Item""]""","""ENCFF181YWG""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1, 2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:40.615250+00:…","""bigBed""",1562444,"""bigBed narrowPeak""","""/files/ENCFF181YWG/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""conservative IDR thresholded p…","""[{""aliases"": [], ""frip"": 0.152…",,,,,"""""","""released""","""[""1_1"", ""2_1""]""","""ENCFF181YWG""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF444IOW/""]""","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/"", ""…",,,"""{""label"": ""CTCF""}""",,,
"""/files/ENCFF091FJE/""","""[""File"", ""Item""]""","""ENCFF091FJE""","""ChIP-seq""",,"""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-05-05T16:55:07.767550+00:…","""bam""",1275941691,"""bam""","""/files/ENCFF091FJE/@@download/…",,"""{""title"": ""ENCODE Processing P…","""alignment""","""alignments""","""[{""duplicates"": 0, ""aliases"": …",,"""nt""",,,"""""","""revoked""","""[""1_1""]""","""ENCFF091FJE""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF643CGH/"", ""/file…",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS000AAA/""]""",,,"""{""label"": ""CTCF""}""",50,"""single-ended""",
"""/files/ENCFF572VGJ/""","""[""File"", ""Item""]""","""ENCFF572VGJ""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2020-09-26T08:07:07.130225+00:…","""bam""",1290288337,"""bam""","""/files/ENCFF572VGJ/@@download/…",,"""{""title"": ""ENCODE Processing P…","""alignment""","""unfiltered alignments""","""[{""duplicate_reads"": 0, ""diff_…",,"""nt""",,,"""""","""released""","""[""2_1""]""","""ENCFF572VGJ""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF110MCL/"", ""/file…",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",50,"""single-ended""",


In [39]:
# Keep only the files whose `dataset` path starts with "/experiments/".
# The predicate carries no alias: filter() only consumes its boolean values,
# so naming it "files" had no effect and suggested a column was being created.
belongs_to_experiment = pl.col("dataset").str.starts_with("/experiments/")
only_experiments = encode.filter(belongs_to_experiment)
display(only_experiments)

@id,@type,accession,assay_term_name,audit,award,biological_replicates,biosample_ontology,dataset,date_created,file_format,file_size,file_type,href,index_of,lab,output_category,output_type,quality_metrics,read_length,read_length_units,replicate,run_type,simple_biosample_summary,status,technical_replicates,title,analysis_step_version,assembly,derived_from,file_format_type,genome_annotation,step_run,origin_batches,paired_end,paired_with,target,mapped_read_length,mapped_run_type,preferred_default
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,bool
"""/files/ENCFF622ZWY/""","""[""File"", ""Item""]""","""ENCFF622ZWY""","""single-cell RNA sequencing ass…","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[91]""","""{""organ_slims"": [""musculature …","""/experiments/ENCSR667UMR/""","""2022-01-28T22:18:02.027723+00:…","""fastq""",372815521,"""fastq""","""/files/ENCFF622ZWY/@@download/…","""[""/files/ENCFF402HCO/""]""","""{""title"": ""Ali Mortazavi, UCI""…","""raw data""","""index reads""","""[]""",86,"""nt""","""{""library"": {""accession"": ""ENC…","""single-ended""","""male postnatal (10 days) strai…","""revoked""","""[""91_1""]""","""ENCFF622ZWY""",,,,,,,,,,,,,
"""/files/ENCFF540BYK/""","""[""File"", ""Item""]""","""ENCFF540BYK""","""PAS-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""brain""], ""te…","""/experiments/ENCSR432LIX/""","""2022-01-28T21:16:14.084277+00:…","""bed""",862868,"""bed bed3+""","""/files/ENCFF540BYK/@@download/…",,"""{""title"": ""Ali Mortazavi, UCI""…","""annotation""","""polyA sites""","""[]""",,,"""{""library"": {""accession"": ""ENC…",,"""adult (6 months) strain B6NCrl…","""released""","""[""1_1""]""","""ENCFF540BYK""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF222ZEY/"", ""/file…","""bed3+""","""M21""","""{""schema_version"": ""5"", ""alias…",,,,,,,
"""/files/ENCFF419EOR/""","""[""File"", ""Item""]""","""ENCFF419EOR""","""DNase-seq""",,"""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""kidney""], ""t…","""/experiments/ENCSR788KLG/""","""2022-02-09T07:23:59.093431+00:…","""bed""",773284,"""bed narrowPeak""","""/files/ENCFF419EOR/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks""","""[{""tenth_of_one_percent_narrow…",,,,,"""female postnatal (36 days) str…","""released""","""[""2_1""]""","""ENCFF419EOR""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF843RGJ/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""award…",,,,,,,
"""/files/ENCFF717ZXC/""","""[""File"", ""Item""]""","""ENCFF717ZXC""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR787WSK/""","""2022-02-10T21:33:07.978207+00:…","""fastq""",130776257,"""fastq""","""/files/ENCFF717ZXC/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",58,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF717ZXC""",,,,,,,"""[""/biosamples/ENCBS842OLG/""]""","""2""","""/files/ENCFF812OOD/""","""{""label"": ""H3K4me3""}""",,,
"""/files/ENCFF600XFR/""","""[""File"", ""Item""]""","""ENCFF600XFR""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR848XKY/""","""2022-02-10T22:04:26.201539+00:…","""fastq""",193816284,"""fastq""","""/files/ENCFF600XFR/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",76,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF600XFR""",,,,,,,"""[""/biosamples/ENCBS177ZJV/""]""","""1""","""/files/ENCFF354OUK/""",,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""/files/ENCFF762LDF/""","""[""File"", ""Item""]""","""ENCFF762LDF""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:06.874942+00:…","""bed""",4135581,"""bed narrowPeak""","""/files/ENCFF762LDF/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks and background as input …","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF762LDF""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF335UKS/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",,,
"""/files/ENCFF181YWG/""","""[""File"", ""Item""]""","""ENCFF181YWG""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1, 2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:40.615250+00:…","""bigBed""",1562444,"""bigBed narrowPeak""","""/files/ENCFF181YWG/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""conservative IDR thresholded p…","""[{""aliases"": [], ""frip"": 0.152…",,,,,"""""","""released""","""[""1_1"", ""2_1""]""","""ENCFF181YWG""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF444IOW/""]""","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/"", ""…",,,"""{""label"": ""CTCF""}""",,,
"""/files/ENCFF091FJE/""","""[""File"", ""Item""]""","""ENCFF091FJE""","""ChIP-seq""",,"""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-05-05T16:55:07.767550+00:…","""bam""",1275941691,"""bam""","""/files/ENCFF091FJE/@@download/…",,"""{""title"": ""ENCODE Processing P…","""alignment""","""alignments""","""[{""duplicates"": 0, ""aliases"": …",,"""nt""",,,"""""","""revoked""","""[""1_1""]""","""ENCFF091FJE""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF643CGH/"", ""/file…",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS000AAA/""]""",,,"""{""label"": ""CTCF""}""",50,"""single-ended""",
"""/files/ENCFF572VGJ/""","""[""File"", ""Item""]""","""ENCFF572VGJ""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2020-09-26T08:07:07.130225+00:…","""bam""",1290288337,"""bam""","""/files/ENCFF572VGJ/@@download/…",,"""{""title"": ""ENCODE Processing P…","""alignment""","""unfiltered alignments""","""[{""duplicate_reads"": 0, ""diff_…",,"""nt""",,,"""""","""released""","""[""2_1""]""","""ENCFF572VGJ""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF110MCL/"", ""/file…",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",50,"""single-ended""",


In [40]:
# Attach the experiment-level columns (assay_title, biosample_summary, dbxrefs,
# description) to each file row; the inner join keeps only file ids present in
# both tables.
joined = only_experiments.join(
    unpacked_files_shortened,
    on='@id',
    how='inner',
)
display(joined)

@id,@type,accession,assay_term_name,audit,award,biological_replicates,biosample_ontology,dataset,date_created,file_format,file_size,file_type,href,index_of,lab,output_category,output_type,quality_metrics,read_length,read_length_units,replicate,run_type,simple_biosample_summary,status,technical_replicates,title,analysis_step_version,assembly,derived_from,file_format_type,genome_annotation,step_run,origin_batches,paired_end,paired_with,target,mapped_read_length,mapped_run_type,preferred_default,assay_title,biosample_summary,dbxrefs,description
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,bool,str,str,str,str
"""/files/ENCFF540BYK/""","""[""File"", ""Item""]""","""ENCFF540BYK""","""PAS-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""brain""], ""te…","""/experiments/ENCSR432LIX/""","""2022-01-28T21:16:14.084277+00:…","""bed""",862868,"""bed bed3+""","""/files/ENCFF540BYK/@@download/…",,"""{""title"": ""Ali Mortazavi, UCI""…","""annotation""","""polyA sites""","""[]""",,,"""{""library"": {""accession"": ""ENC…",,"""adult (6 months) strain B6NCrl…","""released""","""[""1_1""]""","""ENCFF540BYK""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF222ZEY/"", ""/file…","""bed3+""","""M21""","""{""schema_version"": ""5"", ""alias…",,,,,,,,"""PAS-seq""","""Mus musculus strain B6NCrl lef…","""[""GEO:GSE219823""]""","""Mouse cortex QuantSeq"""
"""/files/ENCFF419EOR/""","""[""File"", ""Item""]""","""ENCFF419EOR""","""DNase-seq""",,"""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""kidney""], ""t…","""/experiments/ENCSR788KLG/""","""2022-02-09T07:23:59.093431+00:…","""bed""",773284,"""bed narrowPeak""","""/files/ENCFF419EOR/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks""","""[{""tenth_of_one_percent_narrow…",,,,,"""female postnatal (36 days) str…","""released""","""[""2_1""]""","""ENCFF419EOR""","""{""schema_version"": ""4"", ""alias…","""mm10""","""[""/files/ENCFF843RGJ/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""award…",,,,,,,,"""DNase-seq""","""Mus musculus strain B6CASTF1/J…","""[""GEO:GSE215736""]""",
"""/files/ENCFF717ZXC/""","""[""File"", ""Item""]""","""ENCFF717ZXC""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR787WSK/""","""2022-02-10T21:33:07.978207+00:…","""fastq""",130776257,"""fastq""","""/files/ENCFF717ZXC/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",58,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF717ZXC""",,,,,,,"""[""/biosamples/ENCBS842OLG/""]""","""2""","""/files/ENCFF812OOD/""","""{""label"": ""H3K4me3""}""",,,,"""Mint-ChIP-seq""","""Homo sapiens HCT116 geneticall…","""[""GEO:GSE209137""]""",
"""/files/ENCFF600XFR/""","""[""File"", ""Item""]""","""ENCFF600XFR""","""Mint-ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""intestine"", …","""/experiments/ENCSR848XKY/""","""2022-02-10T22:04:26.201539+00:…","""fastq""",193816284,"""fastq""","""/files/ENCFF600XFR/@@download/…",,"""{""title"": ""Bradley Bernstein, …","""raw data""","""reads""","""[]""",76,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""genetically modified (insertio…","""released""","""[""1_1""]""","""ENCFF600XFR""",,,,,,,"""[""/biosamples/ENCBS177ZJV/""]""","""1""","""/files/ENCFF354OUK/""",,,,,"""Control Mint-ChIP-seq""","""Homo sapiens HCT116 geneticall…","""[""GEO:GSE209350""]""",
"""/files/ENCFF933TWA/""","""[""File"", ""Item""]""","""ENCFF933TWA""","""DNase-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR794KUS/""","""2022-02-08T00:21:16.276034+00:…","""fastq""",1010036136,"""fastq""","""/files/ENCFF933TWA/@@download/…",,"""{""title"": ""John Stamatoyannopo…","""raw data""","""reads""","""[]""",151,"""nt""","""{""library"": {""accession"": ""ENC…","""paired-ended""","""with multiple sclerosis""","""released""","""[""1_1""]""","""ENCFF933TWA""",,,,,,,"""[""/biosamples/ENCBS030NVK/""]""","""2""","""/files/ENCFF091KZT/""",,,,,"""DNase-seq""","""Homo sapiens with multiple scl…","""[""GEO:GSE215737""]""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""/files/ENCFF843QSK/""","""[""File"", ""Item""]""","""ENCFF843QSK""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2020-09-26T08:07:23.714821+00:…","""bed""",795973,"""bed narrowPeak""","""/files/ENCFF843QSK/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""IDR thresholded peaks""","""[{""aliases"": [], ""frip"": 0.134…",,,,,"""""","""released""","""[""1_1""]""","""ENCFF843QSK""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF356LFX/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS000AAA/""]""",,,"""{""label"": ""CTCF""}""",,,,"""TF ChIP-seq""","""Homo sapiens MCF-7""","""[""FactorBook:ENCSR000AHD"", ""GE…","""HAIB ChIP CTCF in MCF-7"""
"""/files/ENCFF762LDF/""","""[""File"", ""Item""]""","""ENCFF762LDF""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:06.874942+00:…","""bed""",4135581,"""bed narrowPeak""","""/files/ENCFF762LDF/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""peaks and background as input …","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF762LDF""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF335UKS/"", ""/file…","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",,,,"""TF ChIP-seq""","""Homo sapiens MCF-7""","""[""FactorBook:ENCSR000AHD"", ""GE…","""HAIB ChIP CTCF in MCF-7"""
"""/files/ENCFF181YWG/""","""[""File"", ""Item""]""","""ENCFF181YWG""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1, 2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2016-11-15T02:20:40.615250+00:…","""bigBed""",1562444,"""bigBed narrowPeak""","""/files/ENCFF181YWG/@@download/…",,"""{""title"": ""ENCODE Processing P…","""annotation""","""conservative IDR thresholded p…","""[{""aliases"": [], ""frip"": 0.152…",,,,,"""""","""released""","""[""1_1"", ""2_1""]""","""ENCFF181YWG""","""{""schema_version"": ""4"", ""alias…","""hg19""","""[""/files/ENCFF444IOW/""]""","""narrowPeak""",,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/"", ""…",,,"""{""label"": ""CTCF""}""",,,,"""TF ChIP-seq""","""Homo sapiens MCF-7""","""[""FactorBook:ENCSR000AHD"", ""GE…","""HAIB ChIP CTCF in MCF-7"""
"""/files/ENCFF572VGJ/""","""[""File"", ""Item""]""","""ENCFF572VGJ""","""ChIP-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""epithelium"",…","""/experiments/ENCSR000AHD/""","""2020-09-26T08:07:07.130225+00:…","""bam""",1290288337,"""bam""","""/files/ENCFF572VGJ/@@download/…",,"""{""title"": ""ENCODE Processing P…","""alignment""","""unfiltered alignments""","""[{""duplicate_reads"": 0, ""diff_…",,"""nt""",,,"""""","""released""","""[""2_1""]""","""ENCFF572VGJ""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF110MCL/"", ""/file…",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS001AAA/""]""",,,"""{""label"": ""CTCF""}""",50,"""single-ended""",,"""TF ChIP-seq""","""Homo sapiens MCF-7""","""[""FactorBook:ENCSR000AHD"", ""GE…","""HAIB ChIP CTCF in MCF-7"""


In [67]:
# output_type_list = ['plus strand signal of unique reads', 'minus strand signal of unique reads', 'signal of unique reads']
output_type_list = [
    'plus strand signal of unique reads',
    'minus strand signal of unique reads',
]

# Narrow the joined table one criterion at a time, keeping every intermediate
# frame available for inspection.
# NOTE(review): str.contains is a substring match, so a value that merely
# embeds the term (e.g. an assembly name extending "GRCh38") also passes --
# confirm that exact matching is not required.
assay_term_names = ["RNA-seq", "RAMPAGE", "CAGE", "BruChase-seq", "PRO-cap", "BruUV-seq", "Bru-seq"]
is_selected_assay = pl.col("assay_term_name").is_in(assay_term_names)
filter_by_assay = joined.filter(is_selected_assay)

# `biosample_ontology` holds JSON text; read its "term_name" entry.
biosample_term_name = pl.col("biosample_ontology").str.json_path_match("$.term_name")
filter_by_biosample = filter_by_assay.filter(biosample_term_name.str.contains("K562"))

is_grch38 = pl.col("assembly").str.contains("GRCh38")
filter_by_assembly = filter_by_biosample.filter(is_grch38)

is_released = pl.col("status").str.contains("released")
filter_by_status = filter_by_assembly.filter(is_released)

is_bigwig = pl.col("file_type").str.contains("bigWig")
filter_by_file_type = filter_by_status.filter(is_bigwig)

has_selected_output = pl.col("output_type").is_in(output_type_list)
filter_by_output_type = filter_by_file_type.filter(has_selected_output)

display(filter_by_output_type.sort("dataset"))

@id,@type,accession,assay_term_name,audit,award,biological_replicates,biosample_ontology,dataset,date_created,file_format,file_size,file_type,href,index_of,lab,output_category,output_type,quality_metrics,read_length,read_length_units,replicate,run_type,simple_biosample_summary,status,technical_replicates,title,analysis_step_version,assembly,derived_from,file_format_type,genome_annotation,step_run,origin_batches,paired_end,paired_with,target,mapped_read_length,mapped_run_type,preferred_default,assay_title,biosample_summary,dbxrefs,description
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,bool,str,str,str,str
"""/files/ENCFF440DXU/""","""[""File"", ""Item""]""","""ENCFF440DXU""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR000AEL/""","""2016-02-04T02:15:58.486965+00:…","""bigWig""",122429375,"""bigWig""","""/files/ENCFF440DXU/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""1_1""]""","""ENCFF440DXU""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF796BVP/""]""",,"""V24""","""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS087RNA/""]""",,,,,,,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE78556""]""","""The libraries contained in thi…"
"""/files/ENCFF886IDW/""","""[""File"", ""Item""]""","""ENCFF886IDW""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR000AEL/""","""2016-02-04T02:18:53.281031+00:…","""bigWig""",97062639,"""bigWig""","""/files/ENCFF886IDW/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""minus strand signal of unique …","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF886IDW""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF340LGI/""]""",,"""V24""","""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS088RNA/""]""",,,,,,true,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE78556""]""","""The libraries contained in thi…"
"""/files/ENCFF710RYW/""","""[""File"", ""Item""]""","""ENCFF710RYW""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR000AEL/""","""2016-02-04T02:19:44.172006+00:…","""bigWig""",98822715,"""bigWig""","""/files/ENCFF710RYW/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF710RYW""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF340LGI/""]""",,"""V24""","""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS088RNA/""]""",,,,,,true,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE78556""]""","""The libraries contained in thi…"
"""/files/ENCFF335LVS/""","""[""File"", ""Item""]""","""ENCFF335LVS""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[2]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR000AEL/""","""2021-04-06T00:00:52.399081+00:…","""bigWig""",98830937,"""bigWig""","""/files/ENCFF335LVS/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""2_1""]""","""ENCFF335LVS""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF724WTD/"", ""/file…",,"""V29""","""{""schema_version"": ""5"", ""award…","""[""/biosamples/ENCBS088RNA/""]""",,,,,,true,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE78556""]""","""The libraries contained in thi…"
"""/files/ENCFF980ZHM/""","""[""File"", ""Item""]""","""ENCFF980ZHM""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR000AEL/""","""2021-04-06T00:00:50.442045+00:…","""bigWig""",122878251,"""bigWig""","""/files/ENCFF980ZHM/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""1_1""]""","""ENCFF980ZHM""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF595XJM/"", ""/file…",,"""V29""","""{""schema_version"": ""5"", ""award…","""[""/biosamples/ENCBS087RNA/""]""",,,,,,,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE78556""]""","""The libraries contained in thi…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""/files/ENCFF667CZO/""","""[""File"", ""Item""]""","""ENCFF667CZO""","""RNA-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[1]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR885DVH/""","""2016-02-19T18:31:39.476897+00:…","""bigWig""",121747822,"""bigWig""","""/files/ENCFF667CZO/@@download/…",,"""{""title"": ""ENCODE Processing P…","""signal""","""minus strand signal of unique …","""[]""",,,,,"""""","""released""","""[""1_1""]""","""ENCFF667CZO""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF625ZBS/""]""",,"""V24""","""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS036OIX/""]""",,,,,,,"""total RNA-seq""","""Homo sapiens K562""","""[""GEO:GSE88622""]""","""Initial total fractions on K56…"
"""/files/ENCFF785VUS/""","""[""File"", ""Item""]""","""ENCFF785VUS""","""BruUV-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[5]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR974RZE/""","""2021-09-03T00:31:06.784167+00:…","""bigWig""",249042505,"""bigWig""","""/files/ENCFF785VUS/@@download/…",,"""{""title"": ""Mats Ljungman, UMic…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""5_1""]""","""ENCFF785VUS""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF591SQM/""]""",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS327ZDR/""]""",,,,,,true,"""BruUV-seq""","""Homo sapiens K562""","""[""GEO:GSE187894""]""",
"""/files/ENCFF182BOL/""","""[""File"", ""Item""]""","""ENCFF182BOL""","""BruUV-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[5]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR974RZE/""","""2021-09-03T00:31:31.918547+00:…","""bigWig""",243840409,"""bigWig""","""/files/ENCFF182BOL/@@download/…",,"""{""title"": ""Mats Ljungman, UMic…","""signal""","""minus strand signal of unique …","""[]""",,,,,"""""","""released""","""[""5_1""]""","""ENCFF182BOL""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF591SQM/""]""",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS327ZDR/""]""",,,,,,true,"""BruUV-seq""","""Homo sapiens K562""","""[""GEO:GSE187894""]""",
"""/files/ENCFF300NKB/""","""[""File"", ""Item""]""","""ENCFF300NKB""","""BruUV-seq""","""{""INTERNAL_ACTION"": [{""path"": …","""{""project"": ""ENCODE""}""","""[4]""","""{""organ_slims"": [""blood"", ""bod…","""/experiments/ENCSR974RZE/""","""2021-09-03T00:17:32.840926+00:…","""bigWig""",184701519,"""bigWig""","""/files/ENCFF300NKB/@@download/…",,"""{""title"": ""Mats Ljungman, UMic…","""signal""","""plus strand signal of unique r…","""[]""",,,,,"""""","""released""","""[""4_1""]""","""ENCFF300NKB""","""{""schema_version"": ""4"", ""alias…","""GRCh38""","""[""/files/ENCFF819ACA/""]""",,,"""{""schema_version"": ""5"", ""alias…","""[""/biosamples/ENCBS708GCD/""]""",,,,,,,"""BruUV-seq""","""Homo sapiens K562""","""[""GEO:GSE187894""]""",


In [66]:
# Tally the surviving files per assay title (titles sorted before counting).
sorted_assay_titles = filter_by_output_type["assay_title"].sort()
print(sorted_assay_titles.value_counts())

shape: (7, 2)
┌───────────────┬───────┐
│ assay_title   ┆ count │
│ ---           ┆ ---   │
│ str           ┆ u32   │
╞═══════════════╪═══════╡
│ Bru-seq       ┆ 8     │
│ BruChase-seq  ┆ 8     │
│ BruUV-seq     ┆ 4     │
│ CAGE          ┆ 12    │
│ PRO-cap       ┆ 40    │
│ RAMPAGE       ┆ 4     │
│ total RNA-seq ┆ 94    │
└───────────────┴───────┘


In [None]:
# Pasted per-assay count table kept as a bare string for reference; nothing in
# this cell is computed.
# NOTE(review): presumably the counts from an earlier or alternative query
# (it lists RNA-seq and long read RNA-seq rows) — confirm provenance.
"""
shape: (8, 2)
┌───────────────────┬───────┐
│ assay             ┆ count │
│ ---               ┆ ---   │
│ str               ┆ u32   │
╞═══════════════════╪═══════╡
│ BruUV-seq         ┆ 4     │
│ RAMPAGE           ┆ 4     │
│ Bru-seq           ┆ 8     │
│ CAGE              ┆ 12    │
│ PRO-cap           ┆ 40    │
│ RNA-seq           ┆ 190   │
│ BruChase-seq      ┆ 8     │
│ long read RNA-seq ┆ 7     │
└───────────────────┴───────┘
"""

In [None]:
# Cast the creation timestamps to Datetime, then report the latest and earliest.
date_created_as_datetime = pl.col("date_created").cast(pl.Datetime)
formatted_time = only_experiments.with_columns(date_created_as_datetime)

created_at = formatted_time["date_created"]
print(created_at.max())
print(created_at.min())

In [None]:
import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
with open('../accessions.pkl', 'rb') as pickle_file:
    accessions = pickle.load(pickle_file)

# Report how many accessions were stored.
accession_count = len(accessions)
print(accession_count)

In [None]:
# Load the cleaned ENCODE table and inspect the distinct values of every column.
encode = pl.read_parquet('../clean_encode.parquet')
display(encode)

# One single-column frame of distinct values per column.
for column_name in encode.columns:
    display(encode.select(column_name).unique())

# `to_list` is a Series method (DataFrame has none), so extract the column as a
# Series before listing its distinct output types.
print(encode["output_type"].unique().to_list())


In [None]:
# (Re)load the cleaned ENCODE table from parquet into `encode`.
encode = pl.read_parquet('../clean_encode.parquet')

In [None]:
# Render the `encode` frame with the notebook's rich display.
display(encode)

In [None]:
import json
from io import StringIO

# Parse the JSON Lines dump, one record per line. The file is read as UTF-8
# explicitly (open() otherwise uses a platform-dependent default), and blank
# lines are skipped so a trailing newline does not crash json.loads.
with open('../encode.jsonl', encoding='utf-8') as f:
    json_objects = [json.loads(line) for line in f if line.strip()]

# Re-serialise the records as a single JSON array and hand it to polars through
# an in-memory text buffer.
json_string = json.dumps(json_objects)
json_file_obj = StringIO(json_string)

encode_json = pl.read_json(json_file_obj)

In [None]:
import json

# Fetch a single experiment record from ENCODE and save it, pretty-printed, as
# an example of the experiment schema.
url = "https://www.encodeproject.org/experiments/?format=json&limit=1"
headers = {'accept': 'application/json'}  # same header the other ENCODE requests in this notebook send

# Bound the wait and fail loudly on an HTTP error instead of hanging or trying
# to parse an error page as data.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
data = response.json()

# limit=1 means '@graph' holds at most one record; take it.
experiment = data['@graph'][0]
with open('encode_experiment_example.json', 'w') as outfile:
    json.dump(experiment, outfile, indent=4)


In [None]:
# Scratch check of how an experiment path splits on '/': the leading and
# trailing slashes yield empty strings, giving ['', 'experiments', 'ENCSR220XSM', ''],
# so the accession sits at index 2.
"/experiments/ENCSR220XSM/".split('/')