## Preprocess and generate Sankey diagrams for UNIVERSE and ATLAS

The original implementation of this plot in R is available in [sankey_universe_atlas.Rmd](https://github.com/cns-iu/hra-cell-type-populations-supporting-information/blob/main/paper_plots/sankey_universe_atlas.Rmd).

## Install and import libraries

In [43]:

%pip install pandas plotly

import pandas as pd
import re
from pprint import pprint
import plotly.graph_objects as go

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Global settings

In [None]:
hra_pop_version = "v1.0"
branch = 'main'

only_atlas = False  # set to True for atlas
# set to True to export new HTML files (WARNING: The HRA Dashboard pulls from the HTML files exported to /docs on the main branch!)
export_html_for_deployment = False
# to export an HTML file that is not live on the HRA Dashboard
export_html_for_inspection = False

## Load data

In [45]:
sankey_universe = pd.read_csv(
    f"https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/refs/heads/{branch}/output-data/{hra_pop_version}/reports/universe-ad-hoc/sankey.csv")

sankey_universe


Columns (1,3) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,HCA,,,,,TSP27,Female,56.0,,,...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,https://cellxgene.cziscience.com/e/a357414d-20...,,,,,,,True,False
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027,,,,,,,True,False
3,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1016/j.trsl.2017.07.006,,,,,,,True,False
4,HRA,,,,,Donor1,Female,38.0,,,...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,https://hubmapconsortium.github.io/ccf-release...,,,,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22190,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22191,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,,,,,,,True,False
22192,KPMP,,https://www.nature.com/articles/s41467-023-389...,,,Donor1,Male,,,,...,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239,,,,,,,True,False


## Preprocess data

In [46]:
def add_bins(original_column:pd.Series, bins:list, labels:list):
  """ Create new column with bins

  Args:
      original_column (pd.Series): Column to bin
      bins (list): Bin edges
      labels (list): Bin labels
  """
  result = pd.cut(original_column, bins = bins, labels = labels, include_lowest=True)
  return result

In [47]:
sankey_universe_with_bins = sankey_universe

# Define bins and labels
bins_age = [0, 1, 5, 10, 15, 20, 25, 30, 35, 40,45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
labels_age = ["<1 Year", "1-4 Years", "5-9 Years", "10-14 Years", "15-19 Years", "20-24 Years", "25-29 Years","30-34 Years", "35-39 Years", "40-44 Years", "45-49 Years", "50-54 Years", "55-59 Years","60-64 Years", "65-69 Years", "70-74 Years", "75-79 Years", "80-84 Years", "85-89 Years","90-94 Years", "95-99 Years"]


bins_bmi = [0, 18.5, 24.9, 29.9, 30.0]
labels_bmi = ["underweight", "healthy", "overweight", "obese"]

# Apply the binning function to create new columns
sankey_universe_with_bins['donor_age_binned'] = add_bins(
  sankey_universe['donor_age'], bins_age, labels_age)

sankey_universe_with_bins['donor_bmi_binned'] = add_bins(
    sankey_universe['donor_bmi'], bins_bmi, labels_bmi)


sankey_universe_with_bins

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset,donor_age_binned,donor_bmi_binned
0,HCA,,,,,TSP27,Female,56.0,,,...,,,,,,,True,False,55-59 Years,
1,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor1,Male,,,,...,,,,,,,True,False,,
2,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor2,Male,,,,...,,,,,,,True,False,,
3,KPMP,,http://dx.doi.org/10.1681/ASN.2016091027,,,Donor3,Male,,,,...,,,,,,,True,False,,
4,HRA,,,,,Donor1,Female,38.0,,,...,,,,,,,True,False,35-39 Years,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,,,,,,,True,False,45-49 Years,
22190,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,,,,,,,True,False,45-49 Years,
22191,SPARC,,https://sparc.science/datasets/390/version/1?d...,,,Donor1,Female,49.0,,,...,,,,,,,True,False,45-49 Years,
22192,KPMP,,https://www.nature.com/articles/s41467-023-389...,,,Donor1,Male,,,,...,,,,,,,True,False,,


In [48]:
# Create subset and replace NAs, unify unknown values, adjust portal values

organ_not_supported_text = "Organ Not Supported"

# Select relevant columns
subset_sankey = sankey_universe_with_bins[['portal', 'donor_sex', 'organ_name', 'dataset_id', 'unique_dataset_id',
                                           'cell_type_annotation_tool', 'donor_race', 'donor_bmi_binned', 'donor_age_binned', 
                                           'is_rui_registered', 'is_atlas_dataset']]

# Replace NAs with specified values
subset_sankey['donor_sex'].fillna('Unknown Sex', inplace=True)
subset_sankey['donor_race'].fillna('Unknown Race', inplace=True)
subset_sankey['organ_name'].fillna(organ_not_supported_text, inplace=True)
subset_sankey['cell_type_annotation_tool'].fillna('No Cell Summary', inplace=True)

# Replace NAs for binned variables
# Convert to object type to allow setting new category values
subset_sankey['donor_bmi_binned'] = subset_sankey['donor_bmi_binned'].astype(
    'object')
subset_sankey['donor_age_binned'] = subset_sankey['donor_age_binned'].astype(
    'object')

subset_sankey['donor_bmi_binned'].fillna('Unknown BMI', inplace=True)
subset_sankey['donor_age_binned'].fillna('Unknown Age', inplace=True)

# Make organs lowercase (title case)
subset_sankey['organ_name'] = subset_sankey['organ_name'].str.title()

# Unify left and right kidney
subset_sankey['organ_name'] = subset_sankey['organ_name'].replace({'Left Kidney': 'Kidney', 'Right Kidney': 'Kidney'})

# Unify unknown values in race and sex
subset_sankey['donor_race'] = subset_sankey['donor_race'].replace({'unknown': 'Unknown Race', 'na': 'Unknown Race'})
subset_sankey['donor_sex'] = subset_sankey['donor_sex'].replace({'Unknown': 'Unknown Sex'})

# Fix portal names
subset_sankey['portal'] = subset_sankey['portal'].replace({'HCA': 'CZ CELLxGENE', 
                                                           'NHLBI/LungMap': 'LungMap', 
                                                           'CxG': 'CZ CELLxGENE'})

# Replace portal 'HRA' with 'HRA-OMAP'
subset_sankey['portal'] = subset_sankey['portal'].replace({'HRA': 'HRA-OMAP'})

# Turn LGL into meaningful CHAR for rui and atlas
subset_sankey['is_rui_registered'] = subset_sankey['is_rui_registered'].replace({True: 'RUI-registered', False: 'Not RUI-registered'})
subset_sankey['is_atlas_dataset'] = subset_sankey['is_atlas_dataset'].replace({True: 'Atlas Dataset', False: 'Not Atlas Dataset'})

subset_sankey


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For exampl

Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset
0,CZ CELLxGENE,Female,Small Intestine,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,No Cell Summary,Unknown Race,Unknown BMI,55-59 Years,RUI-registered,Not Atlas Dataset
1,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset
2,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset
3,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset
4,HRA-OMAP,Female,Mesenteric Lymph Node,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,No Cell Summary,Unknown Race,Unknown BMI,35-39 Years,RUI-registered,Not Atlas Dataset
...,...,...,...,...,...,...,...,...,...,...,...
22189,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset
22190,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset
22191,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset
22192,KPMP,Male,Kidney,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239#Donor1_Tiss...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset


In [49]:
# remove duplicate rows
# Define the tool replacement string
tool_replacement = "sc_transcriptomics with Cell Summary"

# Replace the specified tools with the new tool name
subset_sankey['cell_type_annotation_tool'] = subset_sankey['cell_type_annotation_tool'].replace({
    'azimuth': tool_replacement,
    'celltypist': tool_replacement,
    'popv': tool_replacement
})

# Find duplicates in the DataFrame
duplicates = subset_sankey[subset_sankey.duplicated()]

# Remove duplicates from the DataFrame
subset_sankey = subset_sankey.drop_duplicates()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [50]:
# Temporary fix for SenNet atlas datasets
# Apply the tool replacement conditionally based on 'portal' and 'is_atlas_dataset'
subset_sankey.loc[(subset_sankey['portal'] == 'SenNet') & (
    subset_sankey['is_atlas_dataset'] == 'Atlas Dataset'), 'cell_type_annotation_tool'] = tool_replacement

In [51]:
# Map race/ethnicity values to race values
# Export donor_race as CSV
donor_race = pd.DataFrame(
    subset_sankey['donor_race'].unique(), columns=['donor_race'])

# Export to CSV
donor_race.to_csv('output/donor_race_python.csv', index=False)

# Read the manually mapped CSV file
race_mapped = pd.read_csv("data/donor_race_mapped.csv")

# Perform the left join (merge) on 'donor_race'
subset_sankey = pd.merge(subset_sankey, race_mapped,
                         on='donor_race', how='left')

# Replace 'donor_race' with 'mapped_donor_race' after the merge
subset_sankey['donor_race'] = subset_sankey['mapped_donor_race']

# Print the updated DataFrame
subset_sankey

Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset,mapped_donor_race
0,CZ CELLxGENE,Female,Small Intestine,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,hhttps://api.cellxgene.cziscience.com/dp/v1/co...,No Cell Summary,Unknown Race,Unknown BMI,55-59 Years,RUI-registered,Not Atlas Dataset,Unknown Race
1,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset,Unknown Race
2,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset,Unknown Race
3,KPMP,Male,Kidney,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,http://dx.doi.org/10.1681/ASN.2016091027#Donor...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset,Unknown Race
4,HRA-OMAP,Female,Mesenteric Lymph Node,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,http://purl.org/ccf/1.5/omap-1#Donor1_TissueBl...,No Cell Summary,Unknown Race,Unknown BMI,35-39 Years,RUI-registered,Not Atlas Dataset,Unknown Race
...,...,...,...,...,...,...,...,...,...,...,...,...
16288,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset,Unknown Race
16289,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset,Unknown Race
16290,SPARC,Female,Heart,https://sparc.science/datasets/390/version/1?d...,https://sparc.science/datasets/390/version/1?d...,No Cell Summary,Unknown Race,Unknown BMI,45-49 Years,RUI-registered,Not Atlas Dataset,Unknown Race
16291,KPMP,Male,Kidney,https://zenodo.org/records/7653239#Donor1_Tiss...,https://zenodo.org/records/7653239#Donor1_Tiss...,No Cell Summary,Unknown Race,Unknown BMI,Unknown Age,RUI-registered,Not Atlas Dataset,Unknown Race


In [52]:
# OPTIONAL: Filter for only atlas data
if only_atlas: 
  subset_sankey = subset_sankey[subset_sankey['is_atlas_dataset'] == "Atlas Dataset"]

## Manually fix consortium name issue

In [53]:
subset_sankey.loc[subset_sankey['portal'] == '<Consortium Name>', 'portal'] = 'UNC'

## Create nodes and edges

In [54]:
# Group by and summarize for each category
p = subset_sankey.groupby('portal').size().reset_index(
    name='count').drop('count', axis=1)
d = subset_sankey.groupby('donor_sex').size().reset_index(
    name='count').drop('count', axis=1)
a = subset_sankey.groupby('donor_age_binned').size(
).reset_index(name='count').drop('count', axis=1)
b = subset_sankey.groupby('donor_bmi_binned').size(
).reset_index(name='count').drop('count', axis=1)
r = subset_sankey.groupby('donor_race').size().reset_index(
    name='count').drop('count', axis=1)
o = subset_sankey.groupby('organ_name').size().reset_index(
    name='count').drop('count', axis=1)
c = subset_sankey.groupby('cell_type_annotation_tool').size(
).reset_index(name='count').drop('count', axis=1)
rui = subset_sankey.groupby('is_rui_registered').size(
).reset_index(name='count').drop('count', axis=1)
atlas = subset_sankey.groupby('is_atlas_dataset').size(
).reset_index(name='count').drop('count', axis=1)

# Create list of unique names
unique_name = []
for df in [p, d, a, b, r, o, c, rui, atlas]:
    unique_name.extend(df.iloc[:, 0].tolist())

# Create empty DataFrame for nodes
nodes = pd.DataFrame({
    'name': unique_name
})

# Print nodes DataFrame
nodes

Unnamed: 0,name
0,CZ CELLxGENE
1,GTEx
2,HuBMAP
3,LungMap
4,SenNet
5,Female
6,Male
7,15-19 Years
8,20-24 Years
9,25-29 Years


In [55]:
# Creating a DataFrame for nodes
nodes = pd.DataFrame({'name': unique_name})

# Add a new column 'index' with values starting from 0
nodes['index'] = range(len(nodes))
nodes

Unnamed: 0,name,index
0,CZ CELLxGENE,0
1,GTEx,1
2,HuBMAP,2
3,LungMap,3
4,SenNet,4
5,Female,5
6,Male,6
7,15-19 Years,7
8,20-24 Years,8
9,25-29 Years,9


In [56]:
# Grouping and renaming columns for various categories and creating count summaries
portal_sex = subset_sankey.groupby(
    ['portal', 'donor_sex']).size().reset_index(name='count')
portal_sex = portal_sex.rename(
    columns={'portal': 'source', 'donor_sex': 'target', 'count': 'value'})

sex_age = subset_sankey.groupby(
    ['donor_sex', 'donor_age_binned']).size().reset_index(name='count')
sex_age = sex_age.rename(
    columns={'donor_sex': 'source', 'donor_age_binned': 'target', 'count': 'value'})

age_bmi = subset_sankey.groupby(
    ['donor_age_binned', 'donor_race']).size().reset_index(name='count')
age_bmi = age_bmi.rename(
    columns={'donor_age_binned': 'source', 'donor_race': 'target', 'count': 'value'})

race_bmi = subset_sankey.groupby(
    ['donor_race', 'donor_bmi_binned']).size().reset_index(name='count')
race_bmi = race_bmi.rename(
    columns={'donor_race': 'source', 'donor_bmi_binned': 'target', 'count': 'value'})

race_organ = subset_sankey.groupby(
    ['donor_bmi_binned', 'organ_name']).size().reset_index(name='count')
race_organ = race_organ.rename(
    columns={'donor_bmi_binned': 'source', 'organ_name': 'target', 'count': 'value'})

organ_ctann = subset_sankey.groupby(
    ['organ_name', 'cell_type_annotation_tool']).size().reset_index(name='count')
organ_ctann = organ_ctann.rename(columns={
                                 'organ_name': 'source', 'cell_type_annotation_tool': 'target', 'count': 'value'})

ctann_rui = subset_sankey.groupby(
    ['cell_type_annotation_tool', 'is_rui_registered']).size().reset_index(name='count')
ctann_rui = ctann_rui.rename(columns={
                             'cell_type_annotation_tool': 'source', 'is_rui_registered': 'target', 'count': 'value'})

rui_atlas = subset_sankey.groupby(
    ['is_rui_registered', 'is_atlas_dataset']).size().reset_index(name='count')
rui_atlas = rui_atlas.rename(columns={
                             'is_rui_registered': 'source', 'is_atlas_dataset': 'target', 'count': 'value'})

# Concatenate all group summaries to create prep_links
prep_links = pd.concat([portal_sex, sex_age, age_bmi, race_bmi,
                       race_organ, organ_ctann, ctann_rui, rui_atlas], ignore_index=True)

# The final links DataFrame
links = prep_links

links

Unnamed: 0,source,target,value
0,CZ CELLxGENE,Female,63
1,CZ CELLxGENE,Male,58
2,GTEx,Female,7
3,GTEx,Male,8
4,HuBMAP,Female,134
...,...,...,...
146,Thymus,sc_transcriptomics with Cell Summary,4
147,Urinary Bladder,sc_transcriptomics with Cell Summary,12
148,sc_proteomics,RUI-registered,104
149,sc_transcriptomics with Cell Summary,RUI-registered,558


In [57]:
nodes_for_source = nodes.rename(columns={nodes.columns[0]: "source"})
nodes_for_source

Unnamed: 0,source,index
0,CZ CELLxGENE,0
1,GTEx,1
2,HuBMAP,2
3,LungMap,3
4,SenNet,4
5,Female,5
6,Male,6
7,15-19 Years,7
8,20-24 Years,8
9,25-29 Years,9


In [58]:
# Perform a left join on `prep_links` and `nodes` by the "source" column
prep_links_with_nodes = pd.merge(prep_links, nodes_for_source, how='left',
                      left_on='source', right_on='source')

prep_links_with_nodes

Unnamed: 0,source,target,value,index
0,CZ CELLxGENE,Female,63,0
1,CZ CELLxGENE,Male,58,0
2,GTEx,Female,7,1
3,GTEx,Male,8,1
4,HuBMAP,Female,134,2
...,...,...,...,...
146,Thymus,sc_transcriptomics with Cell Summary,4,45
147,Urinary Bladder,sc_transcriptomics with Cell Summary,12,46
148,sc_proteomics,RUI-registered,104,47
149,sc_transcriptomics with Cell Summary,RUI-registered,558,48


In [59]:
# Rename the first column in `nodes` to "target"
nodes_for_target = nodes.rename(columns={nodes.columns[0]: "target"})
nodes_for_target

Unnamed: 0,target,index
0,CZ CELLxGENE,0
1,GTEx,1
2,HuBMAP,2
3,LungMap,3
4,SenNet,4
5,Female,5
6,Male,6
7,15-19 Years,7
8,20-24 Years,8
9,25-29 Years,9


In [60]:

# Perform a left join on `prep_links` and `nodes` by the "target" column
prep_links_with_all = pd.merge(prep_links_with_nodes, nodes_for_target, how='left',
                      left_on='target', right_on='target')

prep_links_with_all

Unnamed: 0,source,target,value,index_x,index_y
0,CZ CELLxGENE,Female,63,0,5
1,CZ CELLxGENE,Male,58,0,6
2,GTEx,Female,7,1,5
3,GTEx,Male,8,1,6
4,HuBMAP,Female,134,2,5
...,...,...,...,...,...
146,Thymus,sc_transcriptomics with Cell Summary,4,45,48
147,Urinary Bladder,sc_transcriptomics with Cell Summary,12,46,48
148,sc_proteomics,RUI-registered,104,47,49
149,sc_transcriptomics with Cell Summary,RUI-registered,558,48,49


In [61]:
# Select specific columns from `prep_links` (columns 4, 5, and 3)
prep_links = prep_links_with_all.iloc[:, [3, 4, 2]]

# Rename the first two columns of `prep_links` to "source" and "target"
prep_links.columns = ['source', 'target', 'value']

# Rename the first column in `nodes` to "name"
nodes = nodes.rename(columns={nodes.columns[0]: "name"})

# Display the result
prep_links

Unnamed: 0,source,target,value
0,0,5,63
1,0,6,58
2,1,5,7
3,1,6,8
4,2,5,134
...,...,...,...
146,45,48,4
147,46,48,12
148,47,49,104
149,48,49,558


## Visualize

In [62]:
sankey_data = go.Sankey(
    node=dict(
        pad=30,  # Padding between nodes
        thickness=20,  # Node thickness
        line=dict(color="black", width=0.5),
        label=nodes['name'].tolist()  # Use the 'name' column as node labels
    ),
    link=dict(
        # Indices of source nodes from 'source_idx'
        source=prep_links['source'],
        # Indices of target nodes from 'target_idx'
        target=prep_links['target'],
        value=prep_links['value']  # Values for the links
    )
)

# Step 2: Create the layout for the Sankey diagram
layout = go.Layout(
    font=dict(size=15),
)

if not export_html_for_deployment:
    layout = go.Layout(
        # title="HRApop Universe",
        font=dict(size=20),
        width=1650,  # width in pixels
        height=800   # height in pixels
    )
  

# Step 3: Create the figure and plot
fig = go.Figure(data=[sankey_data], layout=layout)

# Make the figure responsive
fig.update_layout(
    autosize=True,
    margin=dict(l=16, r=16, t=16, b=16),
)

fig.show()

## Export

In [63]:
# set config for export
filename = 'sankey_universe_plotly' if not only_atlas else 'sankey_atlas_plotly'
include_plotlyjs='cdn'
include_mathjax = 'cdn'
full_html = True,
responsive = True

if export_html_for_deployment:
  fig.write_html(
      f'../docs/{filename}.html',  # this changes based on condition
      include_plotlyjs=include_plotlyjs,
      include_mathjax=include_mathjax,
      full_html=full_html,
      config={
          'responsive': responsive,
          'displayModeBar': False,  # this changes based on condition
      }
    )

elif export_html_for_inspection:
  fig.write_html(
    f'../docs/{filename}_inspect.html', # this changes based on condition
      include_plotlyjs=include_plotlyjs,
      include_mathjax=include_mathjax,
      full_html=full_html,
      config={
          'responsive': responsive,
          'displayModeBar': True,  # this changes based on condition
      }
    )