In [1]:
from fasp.search import DiscoverySearchClient
from fasp.search import mapping as mp

# Search client bound to the public GA4GH search adapter endpoint; every
# query in this notebook goes through it.
cl = DiscoverySearchClient(
    'https://ga4gh-search-adapter-presto-public.prod.dnastack.com'
)

#### Specify the table to search and run the query, specifying the columns to retrieve

The table being searched is a deidentified (scrambled) version of the data for dbGaP study phs001611. This study is represented in the NCI Genomic Data Commons. 

A data frame is returned.

In [2]:
# Fully qualified name of the table to search; later cells reuse it.
table_name = 'search_cloud.cshcodeathon.organoid_profiling_pc_subject_phenotypes_gru'

# Retrieve only the listed columns, capped at 10 rows.
res = cl.runOneTableQuery(
    table=table_name,
    column_list=['dbgap_subject_id', 'age', 'race', 'sex'],
    limit=10,
)
res

_Retrieving the query_
____Page1_______________
____Page2_______________
____Page3_______________
____Page4_______________
____Page5_______________


Unnamed: 0,dbgap_subject_id,age,race,sex
0,2675511,24,W,F
1,2675537,43,AA,F
2,2675497,52,W,F
3,2675520,55,W,F
4,2675517,57,AA,F
5,2675504,57,W,F
6,2675552,61,W,F
7,2675502,61,W,F
8,2675550,62,W,F
9,2675494,62,,F


The result shows that race and sex are stored as coded values. Relying on column names alone gives minimal information.

In this case, reasonable guesses might be made at what those codes mean. The meaning of the age column, too, might reasonably be guessed at. But what is its exact meaning?

dbGaP also provides a data dictionary. These dictionaries are imported by GA4GH, and the information is provided as JSON Schema, which is the standard form for describing data in GA4GH.

In [4]:
# Fetch the table's description (name plus its JSON Schema data model)
# and display it.
po_schema = cl.listTableInfo(table_name)
po_schema

{'name': 'search_cloud.cshcodeathon.organoid_profiling_pc_subject_phenotypes_gru',
 'data_model': {'$id': 'phs001611.v1.pht009160.v1.Organoid_Profiling_PC_Subject_Phenotypes',
  '$schema': 'http://json-schema.org/draft-07/schema',
  'properties': {'age': {'type': 'integer, encoded value',
    '$comment': "UNIT 'Years'",
    'oneOf': [{'const': 'N/A', 'title': 'Not vailable'}],
    'maximum': 92.0,
    'minimum': 24.0,
    'description': "Subject's age"},
   'sex': {'type': 'string',
    'oneOf': [{'const': 'F', 'title': 'Female'},
     {'const': 'N/A', 'title': 'Not Applicable'},
     {'const': 'M', 'title': 'Male'}],
    'description': 'Sex of participant'},
   'race': {'type': 'string',
    'oneOf': [{'const': 'AA', 'title': 'African American'},
     {'const': 'A', 'title': 'Asian'},
     {'const': 'W', 'title': 'White, Caucasian'},
     {'const': 'H', 'title': 'Hispanic'},
     {'const': 'N/A', 'title': 'Not vailable'}],
    'description': 'Race of participant'},
   'subject_id': {'

#### Get a mapping for the sex column in the table queried above 

In [6]:
# Column whose coded values we want to decode.
map_col = 'sex'

# Look up the code -> label mapping for that column of the table.
mapping = mp.getMapping(cl, table_name, map_col)
print(mapping)

_Retrieving the query_
____Page1_______________
____Page2_______________
____Page3_______________
____Page4_______________
____Page5_______________
{'F': 'Female', 'N/A': 'N/A', 'M': 'Male'}


In [7]:
# Re-run the same query as before so the transformation below starts from
# a freshly retrieved, still-coded frame.
res = cl.runOneTableQuery(
    table=table_name,
    column_list=['dbgap_subject_id', 'age', 'race', 'sex'],
    limit=10,
)
res

_Retrieving the query_
____Page1_______________
____Page2_______________
____Page3_______________
____Page4_______________
____Page5_______________


Unnamed: 0,dbgap_subject_id,age,race,sex
0,2675511,24,W,F
1,2675537,43,AA,F
2,2675497,52,W,F
3,2675520,55,W,F
4,2675517,57,AA,F
5,2675504,57,W,F
6,2675552,61,W,F
7,2675502,61,W,F
8,2675550,62,W,F
9,2675494,62,,F


#### Transform the column
Use the replace function of the dataframe to use the data provided by the mapping retrieved above.

In [8]:
# Decode the coded column by handing the code -> label dict straight to
# Series.replace. A dict is a documented `to_replace` form, whereas the
# dict views (keys()/values()) used previously are not, and they relied on
# the two views staying aligned. Values that have no entry in the mapping
# (e.g. missing values) are left unchanged.
res[map_col] = res[map_col].replace(mapping)
res

Unnamed: 0,dbgap_subject_id,age,race,sex
0,2675511,24,W,Female
1,2675537,43,AA,Female
2,2675497,52,W,Female
3,2675520,55,W,Female
4,2675517,57,AA,Female
5,2675504,57,W,Female
6,2675552,61,W,Female
7,2675502,61,W,Female
8,2675550,62,W,Female
9,2675494,62,,Female
