### Demonstration of Schema Registry exploratory implementation

See https://github.com/ianfore/ga4gh-starter-schema-repository for details of the implementation.


In [229]:
import requests;

import json;

import xml.etree.ElementTree as ET

def prettyprint(a_dict):
    """Print ``a_dict`` serialized as JSON, indented three spaces per nesting level."""
    rendered = json.dumps(a_dict, indent=3)
    print(rendered)

def printline(char="_"):
    """Print a separator line consisting of ``char`` repeated 80 times."""
    width = 80
    separator = char * width
    print(separator)

### Docker build and deployment to follow
Once available, this will allow the registry to be deployed locally and reached at the base URL set in the next cell.

In [285]:
# Root URL that every request in this notebook is built on.
base = "http://localhost:8080"

### Get namespaces

In [298]:
# Request the namespace listing and echo the name of each entry, one per line.
url = f"{base}/namespaces"
response = requests.get(url)
listing = response.json()
namespaces = listing['namespaces']
for namespace in namespaces:
    print(namespace['namespace_name'])

dbGaP
dataconnect-demo
expt-metadata


In [287]:
# Pretty-print the complete JSON body currently held in `response`.
prettyprint(response.json())

{
   "server": "http://localCatalog",
   "namespaces": [
      {
         "namespace_name": "dbGaP",
         "latest_version": "https://ncbi.nlm.gov",
         "contact_url": "1.0.0"
      },
      {
         "namespace_name": "dataconnect-demo",
         "latest_version": "https://localhost",
         "contact_url": "1.0.0"
      },
      {
         "namespace_name": "expt-metadata",
         "latest_version": "https://localhost",
         "contact_url": "1.0.0"
      }
   ]
}


In [288]:
# Take the entry at index 2 of the namespace listing and keep its name
# for the schema queries that follow; the bare name displays the choice.
selected_entry = namespaces[2]
namespace = selected_entry['namespace_name']
namespace

'expt-metadata'

In [292]:
# List the schemas registered under the selected namespace, one name per line.
endpoint = f"/schemas/{namespace}/"
print(endpoint)
schemas_url = f"{base}{endpoint}"
response = requests.get(schemas_url)
schemas = response.json()['schemas']
for schema in schemas:
    print(schema['schema_name'])

/schemas/expt-metadata/
sra_PRJEB10573
sra_PRJEB1985
sra_PRJEB37886
sra_phs001554
sra_scr_icac
sra_scr_udn_v5


In [274]:
# Pretty-print the complete JSON body currently held in `response`.
prettyprint(response.json())

{
   "namespace": "expt-metadata",
   "schemas": [
      {
         "schema_name": "sra_PRJEB10573",
         "latest_version": "v1",
         "maintainer": [
            "local"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "sra_PRJEB1985",
         "latest_version": "v1",
         "maintainer": [
            "local"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "sra_PRJEB37886",
         "latest_version": "v1",
         "maintainer": [
            "local"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "sra_phs001554",
         "latest_version": "v1",
         "maintainer": [
            "local"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "sra_scr_icac",
         "latest_version": "v1",
         "maintainer": [
            "local"
         ],
         "lifecycle_stage": "released"
      },
      {
     

In [295]:
# For each of two namespaces, list its schemas and fetch every schema document.
# NOTE: this rebinds `namespaces` to a plain list of names (it was a list of
# dicts after the namespace-listing cell).
namespaces = ["dataconnect-demo","expt-metadata"]
for namespace in namespaces:
    printline("=")
    print(f"getting schema for namespace {namespace}")
    endpoint = f"/schemas/{namespace}/"
    print(endpoint)
    response = requests.get(f"{base}{endpoint}")
    schemas = response.json()['schemas']
    print("get the schemas")
    printline()
    for schema in schemas:
        print( f"schema:{schema['schema_name']}")
        # NOTE(review): the version segment is fixed at "v2" while the schema
        # listings shown in this notebook report latest_version "v1" - confirm
        # which version is intended.
        endpoint = f"/schemas/{namespace}/{schema['schema_name']}/versions/v2"
        print(f"endpoint: {endpoint}")
        print("fetching schema")
        # Fetch exactly once, from the endpoint printed above. Requesting the
        # leftover `url` variable here would return the namespace listing
        # instead of this schema.
        response = requests.get(f"{base}{endpoint}")
        schema = response.json()
        #prettyprint( schema)
        printline()

getting schema for namespace dataconnect-demo
/schemas/dataconnect-demo/
get the schemas
________________________________________________________________________________
schema:bigquery_public.covid19_genome_sequence.metadata
endpoint: /schemas/dataconnect-demo/bigquery_public.covid19_genome_sequence.metadata/versions/v2
fetching schema
________________________________________________________________________________
schema:nih_sra_datastore.sra.metadata
endpoint: /schemas/dataconnect-demo/nih_sra_datastore.sra.metadata/versions/v2
fetching schema
________________________________________________________________________________
schema:sra.sra.metadata
endpoint: /schemas/dataconnect-demo/sra.sra.metadata/versions/v2
fetching schema
________________________________________________________________________________
schema:tutorial.phs002409.CAMP_CData
endpoint: /schemas/dataconnect-demo/tutorial.phs002409.CAMP_CData/versions/v2
fetching schema
_________________________________________________

## dbGaP

### Get schemas for a study

In [299]:
# List the dbGaP schemas returned for one study, filtered by the
# `study` and `study_version` query parameters.
namespace = "dbGaP"
endpoint = f"/schemas/{namespace}/?study=phs002921&study_version=v2.p1"
print(endpoint)
study_url = f"{base}{endpoint}"
response = requests.get(study_url)
schemas = response.json()['schemas']
for schema in schemas:
    print(schema['schema_name'])

/schemas/dbGaP/?study=phs002921&study_version=v2.p1
phs002921.v2.pht012614.v1.ICAC_Subject_Phenotypes
phs002921.v2.pht012612.v1.ICAC_Subject
phs002921.v2.pht012613.v1.ICAC_Sample
phs002921.v2.pht012615.v1.ICAC_Sample_Attributes


In [278]:
# Pretty-print the complete JSON body currently held in `response`.
prettyprint(response.json())

{
   "namespace": "dbGaP",
   "schemas": [
      {
         "schema_name": "phs002921.v2.pht012614.v1.ICAC_Subject_Phenotypes",
         "latest_version": "v1",
         "maintainer": [
            "dbGaP"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "phs002921.v2.pht012612.v1.ICAC_Subject",
         "latest_version": "v1",
         "maintainer": [
            "dbGaP"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "phs002921.v2.pht012613.v1.ICAC_Sample",
         "latest_version": "v1",
         "maintainer": [
            "dbGaP"
         ],
         "lifecycle_stage": "released"
      },
      {
         "schema_name": "phs002921.v2.pht012615.v1.ICAC_Sample_Attributes",
         "latest_version": "v1",
         "maintainer": [
            "dbGaP"
         ],
         "lifecycle_stage": "released"
      }
   ]
}


### Get dbGaP schema as XML data dictionary

In [279]:
# Request one schema through the /dicts endpoint of the current `namespace`;
# the response body is inspected in the following cells.
schema_name = "phs001554.v2.pht007609.v1.GECCO_CRC_Susceptibility_Subject_Phenotypes"
url = f"{base}/dicts/{namespace}/{schema_name}/versions/v2"
response = requests.get(url)

In [280]:
# Display the raw, unformatted response body as a string.
response.text

'<?xml version="1.0" encoding="UTF-8"?>\n<?xml-stylesheet type="text/xsl" href="./datadict_v2.xsl"?><data_table id="pht007609.v1" study_id="phs001554.v2" participant_set="1" date_created="Thu Sep 15 11:39:07 2022"><description>This subject phenotype table contains subject IDs, case control status of the subject for colorectal cancer, sex, age, race, ethnicity, and study acronym.</description><variable id="phv00357182.v1"><name>SUBJECT_ID</name><description>De-identified subject ID</description><type>string</type></variable><variable id="phv00357183.v1"><name>AFFECTION_STATUS</name><description>Case control status of the subject for colorectal cancer</description><type>string</type><value>Case</value><value>Control</value></variable><variable id="phv00357184.v1"><name>SEX</name><description>Sex of participant</description><type>string</type><value>Female</value><value>Male</value></variable><variable id="phv00357185.v1"><name>AGE</name><description>Participant reference age</description

In [281]:

# Parse the response body as XML, re-indent it in place with three spaces
# per level, and write the result to stdout.
# Note: ET.fromstring returns the root Element, not an ElementTree.
tree = ET.fromstring(response.text)
ET.indent(tree, '   ')
ET.dump(tree)

<data_table id="pht007609.v1" study_id="phs001554.v2" participant_set="1" date_created="Thu Sep 15 11:39:07 2022">
   <description>This subject phenotype table contains subject IDs, case control status of the subject for colorectal cancer, sex, age, race, ethnicity, and study acronym.</description>
   <variable id="phv00357182.v1">
      <name>SUBJECT_ID</name>
      <description>De-identified subject ID</description>
      <type>string</type>
   </variable>
   <variable id="phv00357183.v1">
      <name>AFFECTION_STATUS</name>
      <description>Case control status of the subject for colorectal cancer</description>
      <type>string</type>
      <value>Case</value>
      <value>Control</value>
   </variable>
   <variable id="phv00357184.v1">
      <name>SEX</name>
      <description>Sex of participant</description>
      <type>string</type>
      <value>Female</value>
      <value>Male</value>
   </variable>
   <variable id="phv00357185.v1">
      <name>AGE</name>
      <description>P

### Get schema as JSON Schema

In [282]:
# Request the same schema name through the /schemas endpoint of the current
# `namespace`; the JSON body is printed in a later cell.
schema_name = "phs001554.v2.pht007609.v1.GECCO_CRC_Susceptibility_Subject_Phenotypes"
url = f"{base}/schemas/{namespace}/{schema_name}/versions/v2"
response = requests.get(url)

In [283]:
# Display which namespace the request above was issued against.
namespace 

'dbGaP'

In [284]:
# Pretty-print the complete JSON body currently held in `response`.
prettyprint(response.json())

{
   "description": "This subject phenotype table contains subject IDs, case control status of the subject for colorectal cancer, sex, age, race, ethnicity, and study acronym.",
   "$id": "dbgap:pht007609.v1",
   "properties": {
      "STUDY": {
         "$id": "dbgap:phv00357188.v1",
         "description": "Study acronym",
         "type": "string",
         "$unit": null,
         "oneOf": [
            {
               "const": "CPS-II",
               "title": "Cancer Prevention Study II"
            },
            {
               "const": "DACHS",
               "title": "Darmkrebs: Chancen der Verhutung durch Screening"
            },
            {
               "const": "HPFS",
               "title": "Health Professionals Follow-up Study"
            },
            {
               "const": "NHS",
               "title": "Nurses Health Study"
            },
            {
               "const": "PLCO",
               "title": "Prostate, Lung, Colorectal and Ovarian Cancer Sc