## Using GA4GH Search with dbGaP data dictionaries

The following example shows how to list the schema (data dictionary) of a table exposed through GA4GH Search.

In [1]:
from fasp.search import DiscoverySearchClient

# Client for the GA4GH Search endpoint; `cl` is reused by every later cell.
cl = DiscoverySearchClient(
    'https://ga4gh-search-adapter-presto-public.prod.dnastack.com/'
)

# Show the table's metadata, including its JSON-Schema data model
# (see the 'data_model' key in the output below).
cl.listTableInfo(
    'dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi'
)

{'name': 'dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi',
 'description': 'Scrambled version of subject data for phs001554 Colorectal cancer susceptibility study.',
 'data_model': {'$id': 'phs001554.v1.pht007609.v1.GECCO_CRC_Susceptibility_Subject_Phenotypes',
  'description': 'Scrambled version of subject data for phs001554 Colorectal cancer susceptibility study.',
  '$schema': 'http://json-schema.org/draft-07/schema',
  'properties': {'age': {'type': 'number',
    '$comment': "UNIT 'Years'",
    'maximum': 98.0,
    'minimum': 37.0,
    'description': 'Participant reference age'},
   'sex': {'type': 'string',
    'oneOf': [{'const': 'Female'}, {'const': 'Male'}],
    'description': 'Sex of participant'},
   'race': {'type': 'string',
    'oneOf': [{'const': 'White'}],
    'description': 'Race of participant'},
   'study': {'type': 'string',
    'oneOf': [{'const': 'CPS-II'},
     {'const': 'DACHS'},
     {'const': 'HPFS'},
     {'const': 'NHS'},
     {'const': 'PLCO'},

In [2]:
# Print each column of the subject phenotypes table together with its
# description, as a more readable view than the raw schema dict.
cl.listTableColumns(
    'dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi',
    descriptions=True,
)

age
Participant reference age
UNIT 'Years'
_______________________________________
sex
Sex of participant
_______________________________________
race
Race of participant
_______________________________________
study
Study acronym
_______________________________________
ethnicity
Ethnicity of participant
_______________________________________
subject_id
De-identified subject ID
_______________________________________
affection_status
Case control status of the subject for colorectal cancer
_______________________________________


In [3]:
# Enumerate the tables under the 'dbgap_demo' catalog; the names are both
# printed page by page and shown as the cell's list result below.
cl.listTables(
    'dbgap_demo'
)

_Retrieving the table list_
____Page1_______________
dbgap_demo.scr_ega.scr_egapancreatic_files
dbgap_demo.scr_ega.scr_egapancreatic_sample_multi
dbgap_demo.scr_gecco_susceptibility.sample_attributes_multi
dbgap_demo.scr_gecco_susceptibility.sample_multi
dbgap_demo.scr_gecco_susceptibility.sb_drs_index
dbgap_demo.scr_gecco_susceptibility.subject_multi
dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi


['dbgap_demo.scr_ega.scr_egapancreatic_files',
 'dbgap_demo.scr_ega.scr_egapancreatic_sample_multi',
 'dbgap_demo.scr_gecco_susceptibility.sample_attributes_multi',
 'dbgap_demo.scr_gecco_susceptibility.sample_multi',
 'dbgap_demo.scr_gecco_susceptibility.sb_drs_index',
 'dbgap_demo.scr_gecco_susceptibility.subject_multi',
 'dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi']

In [4]:
# Metadata and JSON-Schema data model for the sample attributes table.
cl.listTableInfo(
    'dbgap_demo.scr_gecco_susceptibility.sample_attributes_multi'
)

{'name': 'dbgap_demo.scr_gecco_susceptibility.sample_attributes_multi',
 'description': 'Scrambled version of sample data for phs001554 Colorectal cancer susceptibility study.',
 'data_model': {'$id': 'phs001554.v1.pht007610.v1.GECCO_CRC_Susceptibility_Sample_Attributes',
  'description': 'Scrambled version of sample data for phs001554 Colorectal cancer susceptibility study.',
  '$schema': 'http://json-schema.org/draft-07/schema',
  'properties': {'batch': {'type': 'string',
    'description': 'Sequencing batch'},
   'is_tumor': {'type': 'string',
    'oneOf': [{'const': 'No'}],
    'description': 'Tumor status'},
   'body_site': {'type': 'string',
    'oneOf': [{'const': 'Blood'}],
    'description': 'Body site where sample was collected'},
   'sample_id': {'type': 'string', 'description': 'De-identified Sample ID'},
   'dna_source': {'type': 'string',
    'oneOf': [{'const': 'Blood'},
     {'const': 'Buffy coat'},
     {'const': 'Unknown'},
     {'const': 'Whole blood'}],
    'descri

In [5]:
# Column-by-column listing, with descriptions, for the sample attributes table.
cl.listTableColumns(
    'dbgap_demo.scr_gecco_susceptibility.sample_attributes_multi',
    descriptions=True,
)

batch
Sequencing batch
_______________________________________
is_tumor
Tumor status
_______________________________________
body_site
Body site where sample was collected
_______________________________________
sample_id
De-identified Sample ID
_______________________________________
dna_source
DNA origin
_______________________________________
read_depth
Sequencing coverage
UNIT 'fold coverage'
_______________________________________
analyte_type
Analyte Type
_______________________________________
sequencing_center
Name of the center that conducted sequencing
_______________________________________


### An EGA Table
A data dictionary in the same format was created for an EGA table.

In [6]:
# Metadata and JSON-Schema data model for the EGA pancreatic sample table.
cl.listTableInfo(
    'dbgap_demo.scr_ega.scr_egapancreatic_sample_multi'
)

{'name': 'dbgap_demo.scr_ega.scr_egapancreatic_sample_multi',
 'data_model': {'$id': 'scr_EGApancreatic_sample.data_dict.xml.scr_EGApancreatic_sample',
  '$schema': 'http://json-schema.org/draft-07/schema',
  'properties': {'gender': {'type': 'string',
    'oneOf': [{'const': 'female'}, {'const': 'male'}],
    'description': 'Sex of participant'},
   'bam_file': {'type': 'string',
    'description': 'Relative path for alignment file'},
   'phenotype': {'type': 'string',
    'oneOf': [{'const': 'lung tissue'},
     {'const': 'muscle tissue'},
     {'const': 'pancreatic adenocarcinoma'},
     {'const': 'skin tissue'},
     {'const': 'spleen tissue'}],
    'description': 'Site independent tissue type'},
   'subject_id': {'type': 'string',
    'description': 'Subject Id used locally by submitter'},
   'disease_site': {'type': 'string',
    'oneOf': [{'const': 'Liver_Metastasis'},
     {'const': 'Lung_Metastasis'},
     {'const': 'Lung_Normal'},
     {'const': 'Lymph_Node_Metastasis'},
    

In [7]:
# List the EGA table's columns with descriptions; enums=True also prints the
# allowed values beneath each enumerated column (see output below).
cl.listTableColumns(
    'dbgap_demo.scr_ega.scr_egapancreatic_sample_multi',
    descriptions=True,
    enums=True,
)

gender
Sex of participant
		female
		male
_______________________________________
bam_file
Relative path for alignment file
_______________________________________
phenotype
Site independent tissue type
		lung tissue
		muscle tissue
		pancreatic adenocarcinoma
		skin tissue
		spleen tissue
_______________________________________
subject_id
Subject Id used locally by submitter
_______________________________________
disease_site
body site sample was taken from
		Liver_Metastasis
		Lung_Metastasis
		Lung_Normal
		Lymph_Node_Metastasis
		Muscle_Normal
		Normal Pancreas
		Pancreas_Primary_Tumor
		Peritoneal_Metastasis
		Skin_Normal
		Spleen_Normal
_______________________________________
ena-checklist
Minimal information checklist used in this study
_______________________________________
sample_primary_id
De-identified sample ID
_______________________________________
sample_submitter_id
Sample id used locally by submitter
_______________________________________
