# Single End Library

In [1]:
import json, requests

expr_accession = "ENCSR877MSN"

# Ask the server for JSON instead of the default HTML page
headers = {"accept": "application/json"}

# Experiment endpoint: a direct lookup of this one accession
# (not a free-text search)
url = f'https://www.encodeproject.org/experiments/{expr_accession}'

# GET the experiment record; the timeout keeps the cell from hanging indefinitely
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error payload
# as if it were an experiment record
response.raise_for_status()

# Parse the JSON body into a Python dictionary
search_results = response.json()

In [3]:
len(search_results.keys())

53

In [20]:
# Accession of every control experiment listed under 'possible_controls'
controls = [control["accession"] for control in search_results["possible_controls"]]
controls

['ENCSR148TRG']

In [33]:
search_results['files'][0]['run_type']

'single-ended'

In [3]:
se_file_keys = search_results['files'][1].keys()

In [8]:
file_dict = search_results['files'][1]
file_dict['accession']

'ENCFF556ZXY'

In [19]:
# Library path of this file's replicate with its first 11 characters and last character sliced off.
# NOTE: this value is not shown, because a notebook cell only displays its last expression.
file_dict['replicate']['library'][11:-1]
# Platform term name; as the last expression, this is what the cell displays.
file_dict['platform']['term_name']

'Illumina NextSeq 500'

In [21]:
# The file's replicate -> library entry is a path such as '/libraries/ENCLB148AZB/'.
# Keep only the accession, and fall back to None when the entry is missing
# instead of raising a TypeError on None[11:-1].
library_path = (file_dict.get("replicate") or {}).get("library")
library_accession = library_path.strip("/").split("/")[-1] if library_path else None

# One-line summary of the fields of interest for this file
print([
    file_dict.get("accession"),
    file_dict.get("read_count"),
    file_dict.get("no_file_available"),
    (file_dict.get("platform") or {}).get("term_name"),
    file_dict.get("read_length"),
    library_accession,
    file_dict.get("href"),
    file_dict.get("run_type"),
    file_dict.get("paired_end"),
    file_dict.get("paired_with"),
])

['ENCFF556ZXY', 11890542, False, 'Illumina NextSeq 500', 76, '/libraries/ENCLB148AZB/', '/files/ENCFF556ZXY/@@download/ENCFF556ZXY.fastq.gz', 'single-ended', None, None]


In [26]:
def type_held_by_each_key(key):
    """Pair a top-level key of ``search_results`` with the type of the value it holds."""
    value = search_results[key]
    return (key, type(value))

In [9]:
list([type_held_by_each_key(key) for key in search_results.keys()])

[('assay_term_name', str),
 ('biosample_ontology', dict),
 ('documents', list),
 ('references', list),
 ('schema_version', str),
 ('accession', str),
 ('alternate_accessions', list),
 ('analyses', list),
 ('dbxrefs', list),
 ('date_released', str),
 ('doi', str),
 ('internal_tags', list),
 ('status', str),
 ('date_created', str),
 ('submitted_by', dict),
 ('lab', dict),
 ('award', dict),
 ('aliases', list),
 ('target', dict),
 ('possible_controls', list),
 ('supersedes', list),
 ('related_files', list),
 ('internal_status', str),
 ('bio_replicate_count', int),
 ('tech_replicate_count', int),
 ('replication_type', str),
 ('objective_slims', list),
 ('type_slims', list),
 ('category_slims', list),
 ('assay_title', str),
 ('assay_slims', list),
 ('replicates', list),
 ('simple_biosample_summary', str),
 ('biosample_summary', str),
 ('assay_term_id', str),
 ('@id', str),
 ('@type', list),
 ('uuid', str),
 ('original_files', list),
 ('contributing_files', list),
 ('files', list),
 ('revoked

 ## Important fields in the search results
'possible_controls', list  **Super Important** Ideally there should be just one! (log it)
'bio_replicate_count', int
'tech_replicate_count', int
'replication_type'
('replicates', list),  **Super Important** Has info about all the libraries. (log it)
('life_stage_age', str),
('perturbed', bool),
('related_series', list),

In [2]:
search_results['biosample_ontology']

{'aliases': [],
 'references': [],
 'term_id': 'UBERON:0002305',
 'term_name': 'layer of hippocampus',
 'schema_version': '1',
 'status': 'released',
 'classification': 'tissue',
 'dbxrefs': [],
 '@id': '/biosample-types/tissue_UBERON_0002305/',
 '@type': ['BiosampleType', 'Item'],
 'uuid': 'b09d9668-4bf2-4db6-a60a-12b137e947ea',
 'name': 'tissue_UBERON_0002305',
 'organ_slims': ['brain'],
 'cell_slims': [],
 'developmental_slims': ['ectoderm'],
 'system_slims': ['central nervous system'],
 'synonyms': ['hippocampus layer',
  'cytoarchitectural fields of hippocampal formation']}

# Important fields in search_results['biosample_ontology']
'term_name'
'classification'
'organ_slims'
'cell_slims'

In [30]:
search_results['submitted_by']

{'@id': '/users/73ca8b5c-8883-4d4f-9e45-7841411cb6a0/',
 '@type': ['User', 'Item'],
 'uuid': '73ca8b5c-8883-4d4f-9e45-7841411cb6a0',
 'lab': '/labs/bradley-bernstein/',
 'title': 'Siddarth Wekhande',
 'submits_for': ['/labs/bradley-bernstein/', '/labs/david-hafler/']}

# Exploring the library ['replicates'] dictionary

In [25]:
## Important fields under replicate (library)
search_results['replicates'][0].keys()

dict_keys(['date_created', 'submitted_by', 'aliases', 'schema_version', 'antibody', 'biological_replicate_number', 'technical_replicate_number', 'experiment', 'library', 'status', '@id', '@type', 'uuid'])

In [29]:
[(key,type(search_results['replicates'][0][key])) for key in search_results['replicates'][0].keys()]

[('date_created', str),
 ('submitted_by', str),
 ('aliases', list),
 ('schema_version', str),
 ('antibody', dict),
 ('biological_replicate_number', int),
 ('technical_replicate_number', int),
 ('experiment', str),
 ('library', dict),
 ('status', str),
 ('@id', str),
 ('@type', list),
 ('uuid', str)]

In [28]:
search_results['replicates'][0]['library']

{'documents': [],
 'date_created': '2022-06-03T14:35:53.148801+00:00',
 'submitted_by': '/users/73ca8b5c-8883-4d4f-9e45-7841411cb6a0/',
 'status': 'released',
 'lab': '/labs/bradley-bernstein/',
 'award': '/awards/UM1HG009390/',
 'aliases': ['bradley-bernstein:DNA_Lib 13051'],
 'accession': 'ENCLB330FTF',
 'schema_version': '20',
 'alternate_accessions': [],
 'spikeins_used': [],
 'barcode_details': [],
 'biosample': {'accession': 'ENCBS169TOO',
  'aliases': ['bradley-bernstein:BioSam 4609'],
  'schema_version': '26',
  'status': 'released',
  'lab': '/labs/barbara-wold/',
  'award': '/awards/UM1HG009443/',
  'date_created': '2022-01-27T23:28:32.110534+00:00',
  'submitted_by': {'@id': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
   '@type': ['User', 'Item'],
   'uuid': 'bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a',
   'lab': '/labs/barbara-wold/',
   'title': 'Diane Trout',
   'submits_for': ['/labs/barbara-wold/',
    '/labs/richard-myers/',
    '/labs/ali-mortazavi/',
    '/labs/roderic

## Important fields under replicate (library)

('antibody', dict),
 ('biological_replicate_number', int),
 ('technical_replicate_number', int),
 ('experiment', str),
 ['library']['biosample'] # Info about the biosample.
 list of files (files attribute.)

In [11]:
search_results['possible_controls'][0]['accession']

'ENCSR148TRG'

In [8]:
# Library accession of every replicate. Important to store this.
# Iterates over the replicates themselves rather than a hard-coded range(4), so the
# cell works for experiments with any number of replicates.
[replicate['library']['accession'] for replicate in search_results['replicates']]

['ENCLB330FTF', 'ENCLB148AZB', 'ENCLB600OJP', 'ENCLB143GUD']

In [45]:
# Accession of every file attached to the experiment
[file_entry['accession'] for file_entry in search_results['files']]

['ENCFF891MBD',
 'ENCFF556ZXY',
 'ENCFF321ZMD',
 'ENCFF521IGB',
 'ENCFF502RGN',
 'ENCFF240DJN',
 'ENCFF836DMR',
 'ENCFF120JAX',
 'ENCFF565TBX',
 'ENCFF511CFZ',
 'ENCFF352TCQ',
 'ENCFF419HOS',
 'ENCFF326QNQ',
 'ENCFF921ILM',
 'ENCFF646CPE',
 'ENCFF399PIX',
 'ENCFF548IWN',
 'ENCFF459AXW',
 'ENCFF197WOO',
 'ENCFF143RZH',
 'ENCFF288ELJ',
 'ENCFF880DHW',
 'ENCFF197ZPX',
 'ENCFF796SWT',
 'ENCFF402HIW',
 'ENCFF688HKS',
 'ENCFF613YEG',
 'ENCFF811FLZ',
 'ENCFF958RQK',
 'ENCFF342OJX',
 'ENCFF110KGZ',
 'ENCFF729OOK',
 'ENCFF172GXC',
 'ENCFF042ZVD',
 'ENCFF656RDZ',
 'ENCFF929GBA',
 'ENCFF711VWC',
 'ENCFF706BZC',
 'ENCFF700FJI',
 'ENCFF654PZZ',
 'ENCFF121WNF',
 'ENCFF044NEJ',
 'ENCFF553EIY',
 'ENCFF257NKH',
 'ENCFF008YIW',
 'ENCFF458MEM',
 'ENCFF488URG',
 'ENCFF093ZRU',
 'ENCFF750SFW',
 'ENCFF353WRL',
 'ENCFF588UXR',
 'ENCFF759SYQ']

In [42]:
# Accessions of the files at indices 1 through 5, as a tuple
tuple(search_results['files'][index]['accession'] for index in range(1, 6))

('ENCFF556ZXY', 'ENCFF321ZMD', 'ENCFF521IGB', 'ENCFF502RGN', 'ENCFF240DJN')

In [None]:
# Flowcell details of the files at indices 1 through 5, as a tuple
tuple(search_results['files'][index]['flowcell_details'] for index in range(1, 6))

# Exploring the file attribute of the Experiment. Everything I need is here.

In [8]:
search_results['files'][1]['run_type']

'single-ended'

In [48]:
len(search_results['files'][1].keys())

49

In [None]:
print(json.dumps(search_results['files'][1], indent=4))

In [2]:
search_results['files'][1]['replicate']['library']

'/libraries/ENCLB148AZB/'

In [4]:
search_results['files'][1]['accession']

'ENCFF556ZXY'

In [52]:
def type_held_by_each_key(key):
    """Pair a key of the file record at index 1 with the type of the value it holds."""
    return (key, type(search_results['files'][1][key]))

# (key, type) for every field of the file record at index 1
[type_held_by_each_key(field) for field in search_results['files'][1]]


[('accession', str),
 ('aliases', list),
 ('schema_version', str),
 ('lab', dict),
 ('award', str),
 ('date_created', str),
 ('submitted_by', dict),
 ('alternate_accessions', list),
 ('read_count', int),
 ('file_format', str),
 ('no_file_available', bool),
 ('submitted_file_name', str),
 ('md5sum', str),
 ('content_md5sum', str),
 ('fastq_signature', list),
 ('file_size', int),
 ('platform', dict),
 ('read_length', int),
 ('run_type', str),
 ('flowcell_details', list),
 ('output_type', str),
 ('dataset', str),
 ('replicate', dict),
 ('status', str),
 ('dbxrefs', list),
 ('@id', str),
 ('@type', list),
 ('uuid', str),
 ('title', str),
 ('href', str),
 ('read_length_units', str),
 ('biological_replicates', list),
 ('technical_replicates', list),
 ('biological_replicates_formatted', str),
 ('donors', list),
 ('output_category', str),
 ('quality_metrics', list),
 ('file_type', str),
 ('superseded_by', list),
 ('cloud_metadata', dict),
 ('s3_uri', str),
 ('azure_uri', str),
 ('assay_title',

## Relevant fields

1. 'accession', str
2. 'read_count', int **Verify it's over 2 million**
3. 'file_format', str **Super Important**
4. 'no_file_available', bool
5. 'platform', dict
6. 'read_length', int
7. 'run_type', str
8. 'flowcell_details', list
9. ['replicate']['library'] **Super Important**
10. 'href', str   **This is the download link**
11. 's3_uri', str **Another download link**
12. 'dbxrefs' (Has SRA id)
13. ['run_type'] **Important to distinguish between SE and PE**



# Paired End Library

In [8]:
import json, requests

expr_accession = "ENCSR419OOD"

# Ask the server for JSON instead of the default HTML page
headers = {"accept": "application/json"}

# Experiment endpoint: a direct lookup of this one accession
# (not a free-text search)
url = f'https://www.encodeproject.org/experiments/{expr_accession}'

# GET the experiment record; the timeout keeps the cell from hanging indefinitely
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error payload
# as if it were an experiment record
response.raise_for_status()

# Parse the JSON body into a Python dictionary
search_results = response.json()

In [15]:
pe_file_keys = search_results['files'][4].keys()

In [20]:
search_results['files'][4]['paired_end']

'1'

In [13]:
pe_file_keys - se_file_keys 

{'origin_batches', 'paired_end', 'paired_with'}

# Exploring the JSON for a control experiment

In [3]:
import json, requests

expr_accession = "ENCSR621CSR"

# Ask the server for JSON instead of the default HTML page
headers = {"accept": "application/json"}

# Experiment endpoint: a direct lookup of this one accession
# (not a free-text search)
url = f'https://www.encodeproject.org/experiments/{expr_accession}'

# GET the experiment record; the timeout keeps the cell from hanging indefinitely
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error payload
# as if it were an experiment record
response.raise_for_status()

# Parse the JSON body into a Python dictionary
expr_data = response.json()

In [4]:
# Library accession of each replicate in the experiment
libraries = [replicate["library"]["accession"] for replicate in expr_data["replicates"]]
    

In [8]:
# Gather the fastq files for each library, keyed by library accession.
# Every known library starts with an empty list, so libraries without fastq files still appear.
library_fastq_files = {library: [] for library in libraries}

for file in expr_data['files']:
    # Skip files that are not fastq or are not tied to a replicate;
    # .get() avoids a KeyError on records without 'file_format'.
    if file.get('file_format') != 'fastq' or 'replicate' not in file:
        continue
    # The replicate's library is a path such as '/libraries/ENCLB917GEI/'; keep only the accession.
    library_accession = file['replicate']['library'].strip('/').split('/')[-1]
    # setdefault avoids a KeyError when a file points at a library missing from `libraries`.
    library_fastq_files.setdefault(library_accession, []).append(file)

In [9]:
library_fastq_files

{'ENCLB917GEI': [{'accession': 'ENCFF977SGP',
   'aliases': [],
   'schema_version': '31',
   'lab': {'fax': '256-327-0978',
    'institute_label': 'HAIB',
    'address1': '601 Genome Way',
    'state': 'AL',
    'institute_name': 'HudsonAlpha Institute for Biotechnology',
    'awards': ['/awards/U54HG006998/',
     '/awards/U54HG004576/',
     '/awards/UM1HG009411/'],
    'title': 'Richard Myers, HAIB',
    'phone1': '256-327-5220',
    'phone2': '',
    'schema_version': '5',
    'pi': '/users/a62cfec5-57a0-45ab-b943-8ca0e0057bb6/',
    'postal_code': '35806',
    'name': 'richard-myers',
    'city': 'Huntsville',
    'country': 'USA',
    'status': 'current',
    '@id': '/labs/richard-myers/',
    '@type': ['Lab', 'Item'],
    'uuid': 'c0a3540e-8ef0-4d4d-a449-ae47c2475838'},
   'award': '/awards/U54HG006998/',
   'date_created': '2016-01-13T21:05:49.280768+00:00',
   'submitted_by': {'@id': '/users/28bb3427-86e6-480a-be97-922c8b489f6a/',
    '@type': ['User', 'Item'],
    'uuid': '2

In [10]:
len(library_fastq_files)

1

In [15]:
# Run type of the first fastq file of the first library in the mapping
list(library_fastq_files.values())[0][0]['run_type']

'single-ended'

In [16]:
# Accessions of the control experiments listed under 'possible_controls'.
# The result is a flat list of accession strings; previously a misplaced bracket put
# each accession inside its own one-element list.
controls = [control["accession"] for control in expr_data["possible_controls"]]
controls
# I don't mind an empty list, as long as the key is there.

[]