## Retrieve HuBMAP and SenNet datasets with RUI registration status, publication status, organ, and more

For each dataset, we would like to retrieve (at least):
- dataset ID
- publication status
- organ
- RUI registration status
- assay type

In [27]:
# install and import libraries
%pip install requests pandas hubmap_sdk

import json
import os
from pprint import pprint

import pandas as pd
import requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## HuBMAP

In [28]:
# Accumulator for the per-dataset records parsed from the API response;
# starts out empty.
rows = list()

In [29]:
# Fetch the dataset status listing from the HuBMAP ingest API and flatten it
# into `rows`, one dict per dataset.
url = 'https://ingest.api.hubmapconsortium.org/datasets/data-status'

# tokens.txt is parsed as JSON and indexed by consortium name below.
# Opened read-only: the file is never written, so 'r+' only added a needless
# requirement for write permission.
with open('tokens.txt', 'r') as f:
  TOKENS = json.load(f)

headers = {
    # Double quotes outside, single quotes inside: reusing the same quote type
    # inside an f-string is a SyntaxError before Python 3.12.
    # NOTE(review): bearer tokens are normally sent in the `Authorization`
    # header; confirm which header name this API actually reads.
    "authentication": f"Bearer {TOKENS['hubmap']}"
}

# Fail loudly on an HTTP error instead of trying to parse an error payload, and
# bound the wait so a stalled connection cannot hang the notebook.
http_response = requests.get(url, headers=headers, timeout=120)
http_response.raise_for_status()
response = http_response.json()

# Rebuild `rows` from scratch so that re-running this cell does not append
# duplicate records to a list created in another cell.
rows = [
    {
        'dataset_id': dataset['hubmap_id'],
        'organ': dataset['organ'],
        'is_rui_registered': dataset['has_rui_info'],
        'assay_type': dataset['dataset_type'],
        'publication_status': dataset['status'],
        # NOTE(review): the SenNet table names this column 'published_timestamp';
        # kept as-is here so the exported CSV header does not change.
        'published_time_stamp': dataset['published_timestamp'],
        'group_name': dataset['group_name'],
        'has_data': dataset['has_data'],
        'uuid': dataset['uuid']
    }
    for dataset in response['data']
]

In [30]:
# Build the HuBMAP table from the collected records (one row per dataset)
# and show it as the cell output.
df_hubmap = pd.DataFrame(data=rows)
df_hubmap

Unnamed: 0,dataset_id,organ,is_rui_registered,assay_type,publication_status,published_time_stamp,group_name,has_data,uuid
0,HBM575.XFCT.276,Kidney (Right),True,RNAseq,Published,1643320101710,University of California San Diego TMC,True,421007293469db7b528ce6478c00348d
1,HBM645.XLLN.924,Heart,True,RNAseq,Published,1646159320866,California Institute of Technology TMC,True,a44a78bfbe0e702cdc172707b6061a16
2,HBM243.HRTG.365,Thymus,True,RNAseq [Salmon],Published,1598109513159,University of Florida TMC,True,81a9fa68b2b4ea3e5f7cb17554149473
3,HBM538.PHSC.677,Thymus,True,RNAseq [Salmon],Published,1598109493904,University of Florida TMC,True,3ac0768d61c6c84f0ec59d766e123e05
4,HBM628.HGGF.468,Lymph Node,True,RNAseq [Salmon],Published,1598109522765,University of Florida TMC,True,0576b972e074074b4c51a61c3d17a6e3
...,...,...,...,...,...,...,...,...,...
7491,HBM258.GZJQ.498,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,1a5b02501abb4086c51954c17c671833
7492,HBM759.SPVP.899,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,9c7ee9633dae16bc1ffc4620774aed7d
7493,HBM659.GMSX.726,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,88ff518297adc03b6dbd998d7a4a733b
7494,HBM234.BVTP.358,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,815633e155a4c5aa368f7c749ead599f


In [31]:
# Write the HuBMAP table to CSV without the row index.
# The target directory is created first: to_csv does not create missing parent
# directories and fails if 'output/' does not exist yet.
os.makedirs('output', exist_ok=True)
df_hubmap.to_csv('output/hubmap_datasets.csv', index=False)

In [32]:
# Recreate the HuBMAP table from `rows` and display it.
# NOTE(review): this cell repeats an earlier conversion cell verbatim;
# consider removing one of the two.
df_hubmap = pd.DataFrame(data=rows)
df_hubmap

Unnamed: 0,dataset_id,organ,is_rui_registered,assay_type,publication_status,published_time_stamp,group_name,has_data,uuid
0,HBM575.XFCT.276,Kidney (Right),True,RNAseq,Published,1643320101710,University of California San Diego TMC,True,421007293469db7b528ce6478c00348d
1,HBM645.XLLN.924,Heart,True,RNAseq,Published,1646159320866,California Institute of Technology TMC,True,a44a78bfbe0e702cdc172707b6061a16
2,HBM243.HRTG.365,Thymus,True,RNAseq [Salmon],Published,1598109513159,University of Florida TMC,True,81a9fa68b2b4ea3e5f7cb17554149473
3,HBM538.PHSC.677,Thymus,True,RNAseq [Salmon],Published,1598109493904,University of Florida TMC,True,3ac0768d61c6c84f0ec59d766e123e05
4,HBM628.HGGF.468,Lymph Node,True,RNAseq [Salmon],Published,1598109522765,University of Florida TMC,True,0576b972e074074b4c51a61c3d17a6e3
...,...,...,...,...,...,...,...,...,...
7491,HBM258.GZJQ.498,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,1a5b02501abb4086c51954c17c671833
7492,HBM759.SPVP.899,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,9c7ee9633dae16bc1ffc4620774aed7d
7493,HBM659.GMSX.726,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,88ff518297adc03b6dbd998d7a4a733b
7494,HBM234.BVTP.358,Placenta,True,RNAseq [Salmon],QA,,TMC - University of California San Diego focus...,True,815633e155a4c5aa368f7c749ead599f


In [33]:
# Write the HuBMAP table to CSV without the row index.
# NOTE(review): this cell repeats an earlier export cell and overwrites the same file.
# The target directory is created first: to_csv does not create missing parent
# directories and fails if 'output/' does not exist yet.
os.makedirs('output', exist_ok=True)
df_hubmap.to_csv('output/hubmap_datasets.csv', index=False)

In [34]:
# SenNet: query the search API for datasets and keep the parsed JSON body in
# `response` for the cells below.
url = 'https://search.api.sennetconsortium.org/search'

headers = {
  # Double quotes outside, single quotes inside: reusing the same quote type
  # inside an f-string is a SyntaxError before Python 3.12.
  # NOTE(review): bearer tokens are normally sent in the `Authorization`
  # header, and the sample hit printed further down comes from the index
  # 'sn_prod_public_entities', which suggests the token may not be taking
  # effect. Confirm the expected header name before changing it: a working
  # token may return additional (unpublished) datasets.
  'authentication': f"Bearer {TOKENS['sennet']}"
}

query = {
    "query": {
        "bool": {
            "must": {
                "match_all": {}
            },
            # Keep only entities of type "Dataset" whose source type is "Human".
            "filter": [
              {
                "terms": {
                  "entity_type.keyword": [
                    "Dataset"
                  ]
                }
              },
              {
                "terms": {
                  "sources.source_type.keyword": [
                    "Human"
                  ]
                }
              }
            ],
            # Drop datasets in any of these dataset categories.
            "must_not": [
                {
                    "terms": {
                        "dataset_category.keyword": [
                            "codcc-processed",
                            "lab-processed",
                            "component"
                        ]
                    }
                },
            ]
        }
    },
    # Restrict each hit to the fields used when building the table.
    "_source": {
        "includes": [
            "sennet_id",
            "uuid",
            "origin_samples.organ",
            "status",
            "has_rui_information",
            "dataset_type",
            "group_name",
            "group_uuid",
            "published_timestamp"
        ]
    },
    # A single page is requested; see the truncation check after the request.
    "from": 0,
    "size": 1000,
    "sort": [
        {
            "last_modified_timestamp": "desc"
        }
    ],
    "track_total_hits": True
}

# `json=` lets requests serialise the body and set the JSON Content-Type header.
# Fail loudly on an HTTP error and bound the wait so the cell cannot hang.
http_response = requests.post(
    url,
    headers=headers,
    json=query,
    timeout=120
)
http_response.raise_for_status()
response = http_response.json()

# Only one page of `size` hits is fetched. Since the total hit count is
# requested above, warn when it exceeds what was returned so that a truncated
# table does not go unnoticed. The total may be a plain number or a dict with
# a 'value' entry, so both shapes are handled.
total_hits = response['hits'].get('total')
if isinstance(total_hits, dict):
  total_hits = total_hits.get('value')
returned_hits = len(response['hits']['hits'])
if isinstance(total_hits, int) and total_hits > returned_hits:
  print(
      f"WARNING: search matched {total_hits} datasets but only {returned_hits} "
      f"were returned (size={query['size']}); results are truncated."
  )

In [35]:
# Preview the first few raw hits to inspect the response structure.
# Printing every hit (up to 1000) floods the notebook output.
for hit in response['hits']['hits'][:3]:
  pprint(hit)

{'_id': 'fcf72357d083cb8edcfce70f5ba055ef',
 '_index': 'sn_prod_public_entities',
 '_score': None,
 '_source': {'dataset_type': 'RNAseq',
             'group_name': 'TMC - University of Pittsburgh',
             'group_uuid': '28db7a2b-ed8a-11ec-8b0a-9fe9b51132b1',
             'has_rui_information': 'True',
             'origin_samples': [{'organ': 'LL'}],
             'published_timestamp': 1739461194239,
             'sennet_id': 'SNT493.TKJD.678',
             'status': 'Published',
             'uuid': 'fcf72357d083cb8edcfce70f5ba055ef'},
 '_type': '_doc',
 'sort': [1739461230000.0]}
{'_id': 'a0f2be2b1dfa1178fa926259bf5a455f',
 '_index': 'sn_prod_public_entities',
 '_score': None,
 '_source': {'dataset_type': 'RNAseq',
             'group_name': 'TMC - University of Pittsburgh',
             'group_uuid': '28db7a2b-ed8a-11ec-8b0a-9fe9b51132b1',
             'has_rui_information': 'True',
             'origin_samples': [{'organ': 'RL'}],
             'published_timestamp': 17394575

In [36]:
# Flatten each SenNet search hit into one record for the DataFrame.
# `rows` starts from a fresh list, so re-running this cell never duplicates records.
rows = []

for hit in response['hits']['hits']:
  dataset = hit['_source']

  # Only the first origin sample's organ is reported (unchanged behaviour).
  # Fall back to an empty sample so a missing or empty list yields None
  # instead of raising.
  origin_samples = dataset.get('origin_samples') or [{}]

  # The original dict literal listed 'organ' twice; the first value (the whole
  # origin_samples list) was silently overwritten by the second, so only the
  # effective entry is kept, in its original column position.
  # Descriptive fields use .get() so that a hit lacking one of them produces an
  # empty cell rather than a KeyError. Identifiers stay as hard lookups.
  # NOTE(review): the sample hit printed above shows has_rui_information as the
  # string 'True', not a boolean; it is passed through unchanged here.
  rows.append({
      'dataset_id': dataset['sennet_id'],
      'organ': origin_samples[0].get('organ'),
      'is_rui_registered': dataset.get('has_rui_information'),
      'assay_type': dataset.get('dataset_type'),
      'publication_status': dataset.get('status'),
      'group_name': dataset.get('group_name'),
      'published_timestamp': dataset.get('published_timestamp'),
      'uuid': dataset['uuid']
  })

In [37]:
# Build the SenNet table from the collected records (one row per dataset)
# and show it as the cell output.
df_sennet = pd.DataFrame(data=rows)
df_sennet

Unnamed: 0,dataset_id,organ,is_rui_registered,assay_type,publication_status,group_name,published_timestamp,uuid
0,SNT493.TKJD.678,LL,True,RNAseq,Published,TMC - University of Pittsburgh,1739461194239,fcf72357d083cb8edcfce70f5ba055ef
1,SNT253.MKNV.455,RL,True,RNAseq,Published,TMC - University of Pittsburgh,1739457503879,a0f2be2b1dfa1178fa926259bf5a455f
2,SNT498.XKRG.893,LL,True,RNAseq,Published,TMC - University of Pittsburgh,1739457546445,13f4861319fe3877df8547985531b68a
3,SNT489.CCVQ.324,RL,True,RNAseq,Published,TMC - University of Pittsburgh,1739457495091,93c809496bcc6479991ba891c7e631f7
4,SNT776.QQGW.623,RL,True,RNAseq,Published,TMC - University of Pittsburgh,1739457512374,3c0a1c7216859aef385acdae61e5664b
...,...,...,...,...,...,...,...,...
743,SNT775.XPKV.283,PA,True,Histology,Published,TMC - UConn Health,1709840585683,d9d62675a4b39622c1e8dbbc91f8eb51
744,SNT759.KJJZ.639,PA,True,Histology,Published,TMC - UConn Health,1709840052237,907bce178070caa28b60a6c3ad552076
745,SNT899.HZKD.452,PA,True,Histology,Published,TMC - UConn Health,1709838971889,bde9bb17ff8e618ba97951d8176c1a3e
746,SNT577.GFXV.499,LV,True,RNAseq,Published,TMC - Washington University,1692026366326,6e24083668bdaf96c154077c84b37266


In [38]:
# Write the SenNet table to CSV without the row index.
# The target directory is created first: to_csv does not create missing parent
# directories and fails if 'output/' does not exist yet.
os.makedirs('output', exist_ok=True)
df_sennet.to_csv('output/sennet_datasets.csv', index=False)