## Get download links for h5ad files from HuBMAP and SenNet (HRApop)

## Install and import libraries

In [None]:

%pip install pandas

import os
import subprocess
from collections.abc import Iterable

import pandas as pd

## Global settings

In [None]:
# Version tag of the hra-pop output data to read; it is interpolated into the
# URL of sankey.csv when the report is loaded.
hra_pop_version = 'v0.11.1'

## Load data and extract Entity IDs

In [None]:
# Read the sankey report of the selected hra-pop version straight from GitHub
sankey_full = pd.read_csv(
    'https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/main/'
    f'output-data/{hra_pop_version}/reports/universe-ad-hoc/sankey.csv'
)

sankey_full

In [None]:
# Keep only rows that have a cell type annotation tool recorded and whose tool
# value is not 'sc_proteomics', i.e. sc-transcriptomics data that was run
# through at least one cell type annotation tool
sankey = sankey_full.loc[
    sankey_full['cell_type_annotation_tool'].notna()
    & (sankey_full['cell_type_annotation_tool'] != 'sc_proteomics')
]
sankey

In [None]:
# Extract UUIDs: the UUID is the last path segment of a dataset ID, for example
# https://entity.api.hubmapconsortium.org/entities/ebaa609a1819b22767471082d7baa0d9

def get_uuids(column: pd.Series, effort: str) -> set:
    """Collect the UUIDs of all dataset IDs that belong to one effort.

    The UUID is the last '/'-separated segment of a dataset ID; a trailing
    '/' is ignored so that it does not produce an empty UUID.

    Args:
        column (pd.Series): dataset IDs as URL-like strings; entries that are
            not strings (such as NaN for a missing value) are skipped
        effort (str): effort name, matched as a substring of the dataset ID

    Returns:
        set: the unique UUIDs of the matching dataset IDs
    """
    return {
        dataset_id.rstrip('/').rsplit('/', 1)[-1]
        for dataset_id in column
        # A missing value arrives as a float NaN, which has no substring test.
        if isinstance(dataset_id, str) and effort in dataset_id
    }


# Collect the unique UUIDs per effort; 'hubmap' and 'sennet' are matched as
# substrings of each dataset ID. The counts are printed as a quick sanity check.
hubmap_ids = get_uuids(sankey['dataset_id'], 'hubmap')
sennet_ids = get_uuids(sankey['dataset_id'], 'sennet')
print(f'hubmap_ids: {len(hubmap_ids)}, sennet_ids: {len(sennet_ids)}')

## Build URLs for download, export

In [None]:
# Base URLs of the two assets services; each ends in '/' because the UUID is
# appended to it directly
assets_api_hubmap = 'https://assets.hubmapconsortium.org/'
assets_api_sennet = 'https://assets.api.sennetconsortium.org/'
# Name of the file requested for every dataset
filename = 'expr.h5ad'

# Column-wise table of download links: one independent, initially empty list
# per column
result = {column: [] for column in ('effort', 'uuid', 'download_url')}

def assemble_url(ids: Iterable, url: str, effort: str) -> None:
    """Append one row per UUID to the module-level ``result`` table.

    Each row holds the effort name, the UUID and the download URL, which is
    built as ``<url><uuid>/<filename>``.

    Args:
        ids (Iterable): UUID strings of the datasets (a list or a set)
        url (str): base URL of the assets service; the UUID is appended to it
            directly, so it has to end in '/'
        effort (str): effort name stored alongside each UUID
    """
    # A set of strings iterates in an order that can differ between
    # interpreter sessions; sorting keeps the exported table reproducible.
    for dataset_uuid in sorted(ids):
        result['effort'].append(effort)
        result['uuid'].append(dataset_uuid)
        result['download_url'].append(f'{url}{dataset_uuid}/{filename}')
    
# Fill `result` with one row per UUID, HuBMAP first and SenNet second
assemble_url(hubmap_ids, assets_api_hubmap, 'hubmap')
assemble_url(sennet_ids, assets_api_sennet, 'sennet')

# Turn the column lists into a table and display it
df_result = pd.DataFrame(result)
df_result

In [None]:
# export to csv: written to download_urls.csv in the current working
# directory, without the index column
df_result.to_csv('download_urls.csv', index=False)

## Download h5ad files

In [None]:
# Make sure the data folder is present
folder_path = "data"

# Create the folder directly instead of checking for it first: this leaves no
# gap between the check and the creation in which the folder could appear.
try:
    os.makedirs(folder_path)
except FileExistsError:
    print(f"Folder '{folder_path}' already exists.")
else:
    print(f"Folder '{folder_path}' created.")

def download_h5ad(uuid: str, url: str) -> None:
    """Download the h5ad file of one dataset unless it is already on disk.

    The target is ``<folder_path>/<uuid>.h5ad``. curl writes to a ``.part``
    file next to the target, and that file is renamed to the target only
    after curl exits successfully. curl runs with ``--fail``, so an HTTP
    error response counts as a failure instead of being saved as the file.
    A failed download therefore never leaves a file that a later run would
    mistake for a finished one.

    Args:
        uuid (str): UUID
        url (str): download URL
    """
    # Define the path to the file.
    file_path = f'{folder_path}/{uuid}.h5ad'

    if os.path.exists(file_path):
        print(f"File already exists at {file_path}")
        return

    partial_path = f'{file_path}.part'
    # An argument list (no shell involved) hands the URL and the path to curl
    # exactly as they are, without any quoting concerns.
    command = ['curl', '--location', '--fail', '--output', partial_path, url]
    try:
        exit_code = subprocess.run(command, check=False).returncode
    except FileNotFoundError:
        print("Download failed: the curl executable was not found")
        return

    if exit_code != 0 or not os.path.exists(partial_path):
        # Drop whatever curl managed to write before it gave up.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Download failed (curl exit code {exit_code}): {url}")
        return

    os.replace(partial_path, file_path)
    print(f"File downloaded and saved at {file_path}")

In [None]:
# Walk the table row by row and hand each UUID and its URL to download_h5ad,
# which skips files that are already on disk
for uuid, download_url in df_result[['uuid', 'download_url']].itertuples(index=False, name=None):
    print(f'Attempting to download: {download_url}')
    download_h5ad(uuid, download_url)