# Data Connect
**Find the detailed GA4GH Data Connect Specification [here](https://github.com/ga4gh-discovery/data-connect).**

In [7]:
import requests
import json

# Port and base URL of the Data Connect service queried throughout this notebook.
dc_port = "4800"
dc_base_url = f"http://localhost:{dc_port}"


# Endpoint paths appended to dc_base_url; "{}" is filled with a table name via str.format().
service_info_path = "/service-info"
tables_path = "/tables"
table_info_path = "/table/{}/info"
table_data_path = "/table/{}/data"
search_path = "/search"

def pretty_print_json(response):
    """Print the JSON body of an HTTP response, indented by 4 spaces.

    Args:
        response: A response object exposing ``json()``. Its ``status_code``
            and ``text`` attributes are read only when the body is not JSON.

    If the body cannot be decoded as JSON (for example an empty body or an
    HTML error page), the HTTP status code and the raw text are printed
    instead of raising, so the failure is visible in the cell output.
    """
    try:
        payload = response.json()
    except ValueError:
        # The JSON decode errors raised by response.json() are ValueError subclasses.
        print("Response is not valid JSON (HTTP {}):".format(response.status_code))
        print(response.text)
        return
    print(json.dumps(payload, indent=4))

## 🔷 GET `/service-info`
**Retrieve information about this service**

In [8]:
# Request the service metadata from the /service-info endpoint and display it.
service_info_url = dc_base_url + service_info_path
dc_service_info_resp = requests.get(service_info_url)
pretty_print_json(dc_service_info_resp)

{
    "id": "org.ga4gh.starterkit.dataconnect",
    "name": "GA4GH Starter Kit Data Connect Service",
    "description": "Starter Kit implementation of the  Data Connect API specification. Gives researchers access to the data  model of given datasets/tables, and enables them to perform search  queries on the datasets using the model.",
    "contactUrl": "mailto:info@ga4gh.org",
    "documentationUrl": "https://github.com/ga4gh/ga4gh-starter-kit-data-connect",
    "createdAt": "2022-04-27T09:00:00Z",
    "updatedAt": "2022-04-27T09:00:00Z",
    "environment": "test",
    "version": "0.1.0",
    "type": {
        "group": "org.ga4gh",
        "artifact": "data-connect",
        "version": "1.0.0"
    },
    "organization": {
        "name": "Global Alliance for Genomics and Health",
        "url": "https://ga4gh.org"
    }
}


## 🔷 List the tables
**Returns a list of all tables, each with its name, description and a reference to its data model**

In [9]:
# Request the list of tables exposed by the service and display it.
tables_url = dc_base_url + tables_path
dc_list_tables_resp = requests.get(tables_url)
pretty_print_json(dc_list_tables_resp)

{
    "tables": [
        {
            "name": "one_thousand_genomes_sample",
            "description": "Table / directory containing JSON files for one thousand genomes sample from https://www.internationalgenome.org",
            "data_model": {
                "$ref": "http://localhost:4500/table/one_thousand_genomes_sample/info"
            }
        },
        {
            "name": "phenopacket_v1",
            "description": "Table / directory containing JSON files for phenopackets",
            "data_model": {
                "$ref": "http://localhost:4500/table/phenopacket_v1/info"
            }
        }
    ]
}


## 🔷 GET table information of the specified table
**Returns information about the specified table**

In [10]:
# Request the info (name, description, data model) of a single table and display it.
table_name = "one_thousand_genomes_sample"
table_info_url = dc_base_url + table_info_path.format(table_name)
dc_table_info_resp = requests.get(table_info_url)
pretty_print_json(dc_table_info_resp)

{
    "name": "one_thousand_genomes_sample",
    "description": "Table / directory containing JSON files for one thousand genomes sample from https://www.internationalgenome.org",
    "data_model": {
        "$id": "/table/one_thousand_genomes_sample/info",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "description": "one thousand genomes sample JSON data model",
        "properties": {
            "sample_name": {
                "type": "string",
                "description": "An identifier specific for this genome sample"
            },
            "sex": {
                "type": "string",
                "enum": [
                    "male",
                    "female"
                ]
            },
            "biosample_id": {
                "type": "string",
                "description": "bio sample identifier"
            },
            "population_code": {
                "type": "string",
                "enum": [
                    "ITU",
    

## 🔷 GET table data of the specified table
**Returns all the data in the specified table**

In [11]:
# Request the contents of a single table from its /data endpoint and display them.
table_name = "one_thousand_genomes_sample"
table_data_url = dc_base_url + table_data_path.format(table_name)
dc_table_data_resp = requests.get(table_data_url)
pretty_print_json(dc_table_data_resp)

{
    "data_model": {
        "$id": "/table/one_thousand_genomes_sample/info",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "description": "one thousand genomes sample JSON data model",
        "properties": {
            "sample_name": {
                "type": "string",
                "description": "An identifier specific for this genome sample"
            },
            "sex": {
                "type": "string",
                "enum": [
                    "male",
                    "female"
                ]
            },
            "biosample_id": {
                "type": "string",
                "description": "bio sample identifier"
            },
            "population_code": {
                "type": "string",
                "enum": [
                    "ITU",
                    "ASW",
                    "JPT",
                    "MSL",
                    "CHS",
                    "CDX",
                    "YRI",
                    

## 🔷 Search for data satisfying specific conditions
**Search for samples with population_code = "PUR" and sex = "female"**

In [23]:
# POST a parameterized query to /search; the two "?" placeholders in the query
# are supplied through "parameters" (here population_code "PUR" and sex "female").
header = {"content-type": "application/json"}
request_body = dict(
    query="select sample_name , sex , population_code , population_name from one_thousand_genomes_sample where population_code=? and sex=?;",
    parameters=["PUR", "female"],
)
search_url = dc_base_url + search_path
dc_search_resp = requests.request("POST", search_url, json=request_body, headers=header)
pretty_print_json(dc_search_resp)

{
    "data": [
        {
            "sample_name": "HG00740",
            "sex": "female",
            "population_code": "PUR",
            "population_name": "Puerto Rican"
        },
        {
            "sample_name": "HG01070",
            "sex": "female",
            "population_code": "PUR",
            "population_name": "Puerto Rican"
        },
        {
            "sample_name": "HG01326",
            "sex": "female",
            "population_code": "PUR",
            "population_name": "Puerto Rican"
        }
    ]
}


## 🔷 Exercise questions
**Use the data from the sample One Thousand Genomes table above and refer to the search example above for the following questions. These questions will require some basic SQL knowledge.**

**Tips:**

**- Use "select" to choose which fields to display**\
**- Use "from" to indicate the dataset you will be making the query to**\
**- Use "where" to list the conditions of your query**\
**- Data Connect queries should always end with ";"**

**Q1. Can you formulate a query to retrieve the cram_drs_uri and crai_drs_uri fields from one_thousand_genomes_sample table where population_name="Yoruba" and sex="male"?**

In [None]:
#Write your code here


**Q2. Can you formulate a query to retrieve the biosample_id and bundle_drs_uri fields from one_thousand_genomes_sample table where population_name="African Caribbean" and superpopulation_name="African Ancestry"?**

In [32]:
#Write your code here
