### Retrieve example from Gen3 Metadata Service

The following is an example response from the Gen3 Metadata Service for a specific file.

In [24]:
import requests

# Base URL of the Gen3 metadata endpoint and the identifier of the example
# file; the request URL is simply the two concatenated.
service_url = 'https://gen3.biodatacatalyst.nhlbi.nih.gov/mds/metadata/'
example_id = 'dg.4503/00000027-a1c4-4587-a6c9-d35f5818a724'
call_url = f'{service_url}{example_id}'

# Without a timeout requests waits indefinitely on an unresponsive server,
# which would leave this cell hanging.
response = requests.get(call_url, timeout=30)
# Fail here, with the HTTP status in the error, rather than later with an
# opaque JSON/KeyError when the body of an error response is parsed.
response.raise_for_status()

In [25]:
# Pull out just the 'dbgap' section of the metadata response and display it.
dbgap_record = response.json()['dbgap']
dbgap_record

{'sex': 'female',
 'body_site': 'peripheral blood',
 'repository': 'GeneSTAR',
 'sample_use': ['Seq_DNA_SNP_CNV'],
 'analyte_type': 'DNA',
 'biosample_id': 'SAMN06062404',
 'consent_code': 2,
 'dbgap_status': 'Loaded',
 'sra_sample_id': 'SAMN06062404',
 'dbgap_sample_id': 2170791,
 'study_accession': 'phs001218.v2.p1',
 'dbgap_subject_id': 1797782,
 'sra_data_details': {'runs': '2',
  'bases': '260957674200',
  'center': 'Macrogen',
  'status': 'public',
  'size_Gb': '49',
  'platform': 'ILLUMINA',
  'experiments': '2',
  'experiment_type': 'WGS'},
 'study_subject_id': 'phs001218.v2_GS08900974',
 'consent_short_name': 'DS-CVD-IRB-NPU-MDS',
 'study_with_consent': 'phs001218.c2',
 'submitted_sample_id': 'NWD403016',
 'submitted_subject_id': 'GS08900974',
 'study_accession_with_consent': 'phs001218.v2.p1.c2'}

### Querying SRA for the same data via Data Connect

In [26]:
from fasp.search import DataConnectClient

# Data Connect endpoint used for this run and the passport token file that
# is handed to the client.
# NOTE(review): the passport path starts with '~' — confirm that
# DataConnectClient expands the home directory itself.
data_connect_url = 'http://localhost:8089'
passport_path = '~/Downloads/task-specific-token-20.txt'
searchClient = DataConnectClient(data_connect_url, passport=passport_path)

# Alternative endpoint, left disabled:
#searchClient = DataConnectClient('https://collection-service.staging.dnastack.com/data-connect')

In [27]:
# Show the schema of the SRA metadata table that is queried below.
sra_metadata_table = 'nih_sra_datastore.sra.metadata'
searchClient.list_table_info(sra_metadata_table, verbose=True)

_Schema for tablenih_sra_datastore.sra.metadata_
{
   "name": "nih_sra_datastore.sra.metadata",
   "description": "Metadata Table (sra.metadata) contains information about the run and biological samples. Metadata Table (sra.metadata) contains information about the run and biological samples. The biological sample data is stored in two different columns.  See Record (array) column that you need to use the command UNNEST to query. See https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-metadata-table/",
   "data_model": {
      "$id": "",
      "description": "Metadata Table (sra.metadata) contains information about the run and biological samples. Metadata Table (sra.metadata) contains information about the run and biological samples. The biological sample data is stored in two different columns.  See Record (array) column that you need to use the command UNNEST to query. See https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-metadata-table/",
      "$schema": "http://json-schema.or

<fasp.search.data_connect_client.SearchSchema at 0x122125f40>

### Query the SRA table by biosample ID


In [11]:
# The dbGaP record's 'sra_sample_id' is the value matched against the
# `biosample` column of the SRA metadata table.
sra_sample_id = response.json()['dbgap']['sra_sample_id']
query = f"select * from nih_sra_datastore.sra.metadata where biosample='{sra_sample_id}'"
query

"select * from nih_sra_datastore.sra.metadata where biosample='SAMN06062404'"

In [14]:
from IPython.display import display
from ipywidgets import IntProgress

# Upper bound of the progress widget.
# NOTE(review): 300 is presumably an estimate of how many progress ticks a
# query produces — confirm against the client's run_query implementation.
max_count = 300

# Create the widget, then render it so later cells can update it in place.
progress_bar = IntProgress(min=0, max=max_count)
display(progress_bar)

IntProgress(value=0, max=300)

In [16]:
# Run the biosample query built above and collect the result as a dataframe;
# the progress widget created earlier is passed in as the progress indicator.
# NOTE(review): the keyword is spelled `progessIndicator` here — presumably
# that matches the fasp client's signature; confirm before "fixing" it.
df = searchClient.run_query(query, return_type='dataframe', progessIndicator=progress_bar)

In [18]:
# Flip rows and columns so each returned run becomes a column and the many
# metadata fields read top to bottom.
df.T

Unnamed: 0,0,1
acc,SRR8016410,SRR5154616
assay_type,WGS,WGS
center_name,Macrogen,Macrogen
consent,DS-CVD-IRB-NPU-MDS,DS-CVD-IRB-NPU-MDS
experiment,SRX4849106,SRX2472219
sample_name,NWD403016,NWD403016
instrument,HiSeq X Ten,HiSeq X Ten
librarylayout,PAIRED,PAIRED
libraryselection,RANDOM,RANDOM
librarysource,GENOMIC,GENOMIC


In [19]:
# Show the dbGaP section of the metadata response again, next to the SRA
# query result, so the two can be compared.
dbgap_record = response.json()['dbgap']
dbgap_record

{'sex': 'female',
 'body_site': 'peripheral blood',
 'repository': 'GeneSTAR',
 'sample_use': ['Seq_DNA_SNP_CNV'],
 'analyte_type': 'DNA',
 'biosample_id': 'SAMN06062404',
 'consent_code': 2,
 'dbgap_status': 'Loaded',
 'sra_sample_id': 'SAMN06062404',
 'dbgap_sample_id': 2170791,
 'study_accession': 'phs001218.v2.p1',
 'dbgap_subject_id': 1797782,
 'sra_data_details': {'runs': '2',
  'bases': '260957674200',
  'center': 'Macrogen',
  'status': 'public',
  'size_Gb': '49',
  'platform': 'ILLUMINA',
  'experiments': '2',
  'experiment_type': 'WGS'},
 'study_subject_id': 'phs001218.v2_GS08900974',
 'consent_short_name': 'DS-CVD-IRB-NPU-MDS',
 'study_with_consent': 'phs001218.c2',
 'submitted_sample_id': 'NWD403016',
 'submitted_subject_id': 'GS08900974',
 'study_accession_with_consent': 'phs001218.v2.p1.c2'}

In [28]:
# List the key/value attributes stored for run SRR5154616: the `attributes`
# column is unnested into one row per attribute, and the 'primary_search'
# attribute is filtered out.
# NOTE: this cell overwrites `query` and `df` from the earlier cells.
query = """
    SELECT att.k attribute, att.v value
 FROM nih_sra_datastore.sra.metadata s, unnest(attributes) att
where s.acc = 'SRR5154616' and att.k != 'primary_search' """

# Same call shape as the earlier query; the progress widget created earlier
# is passed in again.
df = searchClient.run_query(query, return_type='dataframe', progessIndicator=progress_bar)

# Display the attribute/value table.
df

Unnamed: 0,attribute,value
0,sex_calc,female
1,assemblyname,GCA_000001405.13
2,bases,130478837100
3,bytes,30580440819
4,consent_code,2
5,run_file_create_date,2018-10-18T05:31:00.000Z
6,run_file_version,2
7,alignment_provider_run,Sequencing Center
8,analyte_type_sam,DNA
9,biospecimen_repository_sam,GeneSTAR


### Query the same files in NCBI DRS

In [20]:
from fasp.loc import SRADRSClient

# NCBI DRS server and the run accession to look up; SRR8016410 is one of the
# two runs returned by the SRA biosample query above.
ncbi_drs_url = 'https://locate.be-md.ncbi.nlm.nih.gov'
run_accession = 'SRR8016410'

dcl = SRADRSClient(ncbi_drs_url)
# Translate the run accession into the id used for DRS lookups.
drs_id = dcl.acc2drsID(run_accession)

In [21]:
# Fetch the DRS object for the run; its 'contents' entries are iterated in
# the next cell.
run_response = dcl.get_object(drs_id)

In [23]:
# Walk every entry listed in the run's DRS object, fetch the entry's own DRS
# object, and print it followed by the region of each of its access methods.
for entry in run_response['contents']:
    drs_object = dcl.get_object(entry['id'])
    print(drs_object)
    for method in drs_object['access_methods']:
        print(method['region'])
        # Left disabled: resolving the access URL for this method.
        #dcl.get_access_url(entry['id'], method['access_id'])
    # Separator line between entries.
    print('_' * 80)

{'access_methods': [{'access_id': '01f5af5012b771d644dc9d66655e5b35371d09daad04096ed610580ef2f3af18', 'region': 'gs.US', 'type': 'https'}, {'access_id': 'a9dd5a3b85e5c3289692315cb59d1b679e4f0c1a49a8b242b30efd10619b50e8', 'region': 's3.us-east-1', 'type': 'https'}], 'checksums': [{'checksum': '2f5a2cbcaefefefd89cd6ffcfb84b959', 'type': 'md5'}], 'created_time': '2018-10-11T12:09:44Z', 'id': '2f5a2cbcaefefefd89cd6ffcfb84b959', 'name': 'NWD403016.freeze5.v1.vcf.gz.csi', 'self_url': 'drs://locate.be-md.ncbi.nlm.nih.gov/2f5a2cbcaefefefd89cd6ffcfb84b959', 'size': 1711170}
gs.US
s3.us-east-1
________________________________________________________________________________
{'access_methods': [{'access_id': 'd2be5fff49bec88fb8d2395bb95c14fc95aa02539a0564f3e199e7eeca0d701a', 'region': 'gs.US', 'type': 'https'}, {'access_id': '8a9085401febe6114a84743c3ea6f52760a9aca1661cdf22f96a6d6d3a5f99a1', 'region': 's3.us-east-1', 'type': 'https'}], 'checksums': [{'checksum': '308fa51b0b2ba6aab94315278341343d',