## Build a Sankey diagram for FTU data

## Install and import libraries

In [19]:
%pip install pandas plotly numpy nbformat

import pandas as pd
import re
import numpy as np
from pprint import pprint
import plotly.graph_objects as go
import nbformat

Note: you may need to restart the kernel to use updated packages.


## Load data

In [20]:
sankey_universe = pd.read_csv('../output/sankey_for_FTU.csv')
sankey_universe

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,unique_dataset_id,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset
0,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://api.cellxgene.cziscience.com/dp/v1/col...,https://data-browser.lungmap.net/explore/proje...,,azimuth,,14910.0,70.0,,True,True
1,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://api.cellxgene.cziscience.com/dp/v1/col...,https://data-browser.lungmap.net/explore/proje...,,celltypist,,14910.0,70.0,,True,True
2,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://api.cellxgene.cziscience.com/dp/v1/col...,https://data-browser.lungmap.net/explore/proje...,,popv,,14910.0,70.0,,True,True
3,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,https://api.cellxgene.cziscience.com/dp/v1/col...,https://data-browser.lungmap.net/explore/proje...,,azimuth,,26682.0,76.0,,True,True
4,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,https://api.cellxgene.cziscience.com/dp/v1/col...,https://data-browser.lungmap.net/explore/proje...,,celltypist,,26682.0,76.0,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,54.0,adult,European,...,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False
543,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,62.0,adult,Hispanic or Latin American,...,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False
544,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Female,61.0,adult,European,...,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False
545,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,50.0,adult,European,...,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False


## Handle bins for age, BMI, values for race

In [21]:
def add_bins(original_column: pd.Series, bins: list, labels: list):
  """ Create new column with bins

  Args:
      original_column (pd.Series): Column to bin
      bins (list): Bin edges
      labels (list): Bin labels
  """
  result = pd.cut(original_column, bins=bins,
                  labels=labels, include_lowest=True)
  return result

In [22]:
# handle age values
# load sheet with CxG age values
cxg_age = pd.read_csv("data/cxg-development-stage.csv")

# add column headers
cxg_age.columns = ['unique_dataset_id', 'donor_developmental_stage_cxg']

cxg_age

Unnamed: 0,unique_dataset_id,donor_developmental_stage_cxg
0,https://api.cellxgene.cziscience.com/dp/v1/col...,unknown
1,https://api.cellxgene.cziscience.com/dp/v1/col...,unknown
2,https://api.cellxgene.cziscience.com/dp/v1/col...,human adult stage
3,https://api.cellxgene.cziscience.com/dp/v1/col...,human adult stage
4,https://api.cellxgene.cziscience.com/dp/v1/col...,65-year-old human stage
...,...,...
6490,https://api.cellxgene.cziscience.com/dp/v1/col...,73-year-old human stage
6491,https://api.cellxgene.cziscience.com/dp/v1/col...,41-year-old human stage
6492,https://api.cellxgene.cziscience.com/dp/v1/col...,41-year-old human stage
6493,https://api.cellxgene.cziscience.com/dp/v1/col...,68-year-old human stage


In [23]:
# Extract numbers and assign as age
# Extract age
cxg_age['contains_number'] = cxg_age['donor_developmental_stage_cxg'].str.contains(
    r'\d', regex=True)
cxg_age['extracted_age_number'] = cxg_age['donor_developmental_stage_cxg'].str.extract(
    r'(\d+)').astype(float)

cxg_age

Unnamed: 0,unique_dataset_id,donor_developmental_stage_cxg,contains_number,extracted_age_number
0,https://api.cellxgene.cziscience.com/dp/v1/col...,unknown,False,
1,https://api.cellxgene.cziscience.com/dp/v1/col...,unknown,False,
2,https://api.cellxgene.cziscience.com/dp/v1/col...,human adult stage,False,
3,https://api.cellxgene.cziscience.com/dp/v1/col...,human adult stage,False,
4,https://api.cellxgene.cziscience.com/dp/v1/col...,65-year-old human stage,True,65.0
...,...,...,...,...
6490,https://api.cellxgene.cziscience.com/dp/v1/col...,73-year-old human stage,True,73.0
6491,https://api.cellxgene.cziscience.com/dp/v1/col...,41-year-old human stage,True,41.0
6492,https://api.cellxgene.cziscience.com/dp/v1/col...,41-year-old human stage,True,41.0
6493,https://api.cellxgene.cziscience.com/dp/v1/col...,68-year-old human stage,True,68.0


In [24]:
sankey_universe = sankey_universe.merge(
    cxg_age, on='unique_dataset_id', how='left')

# Apply conditions to update 'donor_age'
sankey_universe['donor_age'] = sankey_universe.apply(
    lambda row: row['extracted_age_number'] if pd.notna(
        row['extracted_age_number']) and row['contains_number'] else row['donor_age'],
    axis=1
)

# Drop the columns 'extracted_age_number' and 'contains_number'
sankey_universe = sankey_universe.drop(
    columns=['extracted_age_number', 'contains_number'])

sankey_universe

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,link_to_h5ad_file,sc_transcriptomics_or_sc_proteomics,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset,donor_developmental_stage_cxg
0,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://data-browser.lungmap.net/explore/proje...,,azimuth,,14910.0,70.0,,True,True,3-year-old human stage
1,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://data-browser.lungmap.net/explore/proje...,,celltypist,,14910.0,70.0,,True,True,3-year-old human stage
2,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,https://data-browser.lungmap.net/explore/proje...,,popv,,14910.0,70.0,,True,True,3-year-old human stage
3,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,https://data-browser.lungmap.net/explore/proje...,,azimuth,,26682.0,76.0,,True,True,3-year-old human stage
4,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,https://data-browser.lungmap.net/explore/proje...,,celltypist,,26682.0,76.0,,True,True,3-year-old human stage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,54.0,adult,European,...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False,
543,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,62.0,adult,Hispanic or Latin American,...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False,
544,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Female,61.0,adult,European,...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False,
545,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,50.0,adult,European,...,https://www.ebi.ac.uk/gxa/sc/experiments/E-CUR...,sc_transcriptomics,Seurat's label transfer,,,,,False,False,


In [25]:
sankey_universe_with_bins = sankey_universe

# Define bins and labels
bins_age = [0, 1, 5, 10, 15, 20, 25, 30, 35, 40,
            45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
labels_age = ["<1 Year", "1-4 Years", "5-9 Years", "10-14 Years", "15-19 Years", "20-24 Years", "25-29 Years", "30-34 Years", "35-39 Years", "40-44 Years",
              "45-49 Years", "50-54 Years", "55-59 Years", "60-64 Years", "65-69 Years", "70-74 Years", "75-79 Years", "80-84 Years", "85-89 Years", "90-94 Years", "95-99 Years"]


bins_bmi = [0, 18.5, 24.9, 29.9, 30.0]
labels_bmi = ["underweight", "healthy", "overweight", "obese"]

# Apply the binning function to create new columns
sankey_universe_with_bins['donor_age_binned'] = add_bins(
    sankey_universe['donor_age'], bins_age, labels_age)

sankey_universe_with_bins['donor_bmi_binned'] = add_bins(
    sankey_universe['donor_bmi'], bins_bmi, labels_bmi)


sankey_universe_with_bins

Unnamed: 0,portal,study_paper,doi,lead_author,is_azimuth_reference,donor_id,donor_sex,donor_age,donor_development_stage,donor_race,...,cell_type_annotation_tool,omap_id,number_of_cells_total,number_of_unique_cell_types,hubmap_dataset_publication_status,is_rui_registered,is_atlas_dataset,donor_developmental_stage_cxg,donor_age_binned,donor_bmi_binned
0,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,azimuth,,14910.0,70.0,,True,True,3-year-old human stage,1-4 Years,
1,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,celltypist,,14910.0,70.0,,True,True,3-year-old human stage,1-4 Years,
2,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D032_Donor,Male,3.0,,,...,popv,,14910.0,70.0,,True,True,3-year-old human stage,1-4 Years,
3,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,azimuth,,26682.0,76.0,,True,True,3-year-old human stage,1-4 Years,
4,NHLBI/LungMap,LungMAP ��� Human data from a broad age health...,https://doi.org/10.7554/eLife.62522,Allen Wang,,D046_Donor,Male,3.0,,,...,celltypist,,26682.0,76.0,,True,True,3-year-old human stage,1-4 Years,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,54.0,adult,European,...,Seurat's label transfer,,,,,False,False,,50-54 Years,
543,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,62.0,adult,Hispanic or Latin American,...,Seurat's label transfer,,,,,False,False,,60-64 Years,
544,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Female,61.0,adult,European,...,Seurat's label transfer,,,,,False,False,,60-64 Years,
545,EBI - SCEA - Anatomogram,Single cell transcriptional and chromatin acce...,https://doi.org/10.1038/s41467-021-22368-w,Yoshiharu Muto,,,Male,50.0,adult,European,...,Seurat's label transfer,,,,,False,False,,45-49 Years,


## Apply other transformations

In [26]:

# Create subset and replace NAs, unify unknown values, adjust portal values

organ_not_supported_text = "Organ Not Supported"

# Select relevant columns
subset_sankey = sankey_universe_with_bins[['portal', 'donor_sex', 'organ_name', 'dataset_id', 'unique_dataset_id',
                                           'cell_type_annotation_tool', 'donor_race', 'donor_bmi_binned', 'donor_age_binned',
                                           'is_rui_registered', 'is_atlas_dataset', 'FTU']]

# Replace NAs with specified values
subset_sankey['donor_sex'].fillna('Unknown Sex', inplace=True)
subset_sankey['donor_race'].fillna('Unknown Race', inplace=True)
subset_sankey['organ_name'].fillna(organ_not_supported_text, inplace=True)
subset_sankey['cell_type_annotation_tool'].fillna(
    'No Cell Summary', inplace=True)
subset_sankey['dataset_id'].fillna('No Known Dataset ID')
subset_sankey['unique_dataset_id'].fillna('No Known Unique Dataset ID')

# Replace NAs for binned variables
# Convert to object type to allow setting new category values
subset_sankey['donor_bmi_binned'] = subset_sankey['donor_bmi_binned'].astype(
    'object')
subset_sankey['donor_age_binned'] = subset_sankey['donor_age_binned'].astype(
    'object')

subset_sankey['donor_bmi_binned'].fillna('Unknown BMI', inplace=True)
subset_sankey['donor_age_binned'].fillna('Unknown Age', inplace=True)

# Make organs lowercase (title case)
subset_sankey['organ_name'] = subset_sankey['organ_name'].str.title()

# Unify left and right kidney
subset_sankey['organ_name'] = subset_sankey['organ_name'].replace(
    {'Left Kidney': 'Kidney', 'Right Kidney': 'Kidney'})

# Unify unknown values in race and sex
subset_sankey['donor_race'] = subset_sankey['donor_race'].replace(
    {'unknown': 'Unknown Race', 'na': 'Unknown Race'})
subset_sankey['donor_sex'] = subset_sankey['donor_sex'].replace(
    {'Unknown': 'Unknown Sex'})

# Fix portal names
subset_sankey['portal'] = subset_sankey['portal'].replace({'HCA': 'CZ CELLxGENE',
                                                           'NHLBI/LungMap': 'LungMap',
                                                           'CxG': 'CZ CELLxGENE'})

# Add replacement for strings containing 'EBI'
subset_sankey['portal'] = subset_sankey['portal'].replace(
    r'EBI', 'EBI', regex=True)

# Replace portal 'HRA' with 'HRA-OMAP'
subset_sankey['portal'] = subset_sankey['portal'].replace({'HRA': 'HRA-OMAP'})


# Replace NaN values in 'FTU' with 'Unknown'
subset_sankey['FTU'] = subset_sankey['FTU'].fillna('Unknown FTU')

# Convert the 'is_rui_registered' column to meaningful strings
subset_sankey['is_rui_registered'] = subset_sankey['is_rui_registered'].replace(
    {True: 'RUI-registered', 'True': 'RUI-registered',
        False: 'Not RUI-registered', 'no': 'Not RUI-registered'}
)

# Convert the 'is_atlas_dataset' column to meaningful strings
subset_sankey['is_atlas_dataset'] = subset_sankey['is_atlas_dataset'].replace(
    {True: 'Atlas Dataset', False: 'Not Atlas Dataset'}
)

# For NaN values in 'is_atlas_dataset', replace them with "Not Atlas Dataset"
subset_sankey['is_atlas_dataset'] = subset_sankey['is_atlas_dataset'].fillna(
    'Not Atlas Dataset')

subset_sankey


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For exampl

Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset,FTU
0,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung
1,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,celltypist,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung
2,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,popv,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung
3,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung
4,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,celltypist,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung
...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572192,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,50-54 Years,Not RUI-registered,Not Atlas Dataset,Nephron
543,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572193,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,Hispanic or Latin American,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron
544,EBI - SCEA - Anatomogram,Female,Kidney,GSM4572194,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron
545,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572195,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,45-49 Years,Not RUI-registered,Not Atlas Dataset,Nephron


## Determine first CTann tool

In [27]:
# add new column for first ctann
subset_sankey['first_ctann_tool'] = np.nan  # Create the new column

# Apply logic to the new column based on the conditions
def determine_tool(cell_type_annotation_tool):
  if pd.isna(cell_type_annotation_tool):  # Check for NaN
        return "No Cell Summary"
  elif 'azimuth' in cell_type_annotation_tool:
      return 'azimuth'
  elif 'celltypist' in cell_type_annotation_tool:
      return 'celltypist'
  elif 'popv' in cell_type_annotation_tool:
      return 'popv'
  else:
      return "No Cell Summary"  


# Group by unique_dataset_id and apply the function
subset_sankey['first_ctann_tool'] = subset_sankey['cell_type_annotation_tool'].apply(
  determine_tool)

# Drop duplicates by unique_dataset_id
subset_sankey = subset_sankey.drop_duplicates(subset='unique_dataset_id', keep='first')
subset_sankey



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset,FTU,first_ctann_tool
0,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
3,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
6,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
9,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
12,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572192,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,50-54 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
543,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572193,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,Hispanic or Latin American,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
544,EBI - SCEA - Anatomogram,Female,Kidney,GSM4572194,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
545,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572195,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,45-49 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary


In [28]:
# Assuming subset_sankey is a pandas DataFrame
tool_replacement = "sc_transcriptomics with Cell Summary"

# Replace the values in the 'cell_type_annotation_tool' column
subset_sankey['cell_type_annotation_tool'] = subset_sankey['cell_type_annotation_tool'].replace(
    ['azimuth', 'celltypist', 'popv', 'n/a'], tool_replacement
)

# Find the duplicates
duplicates = subset_sankey[subset_sankey.duplicated()]

# Print duplicates (optional)
print(duplicates)

# Drop duplicate rows based on all columns (equivalent to distinct())
subset_sankey = subset_sankey.drop_duplicates()

# Print the updated DataFrame
subset_sankey

Empty DataFrame
Columns: [portal, donor_sex, organ_name, dataset_id, unique_dataset_id, cell_type_annotation_tool, donor_race, donor_bmi_binned, donor_age_binned, is_rui_registered, is_atlas_dataset, FTU, first_ctann_tool]
Index: []




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset,FTU,first_ctann_tool
0,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
3,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
6,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
9,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
12,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth
...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572192,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,50-54 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
543,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572193,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,Hispanic or Latin American,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
544,EBI - SCEA - Anatomogram,Female,Kidney,GSM4572194,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary
545,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572195,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,European,Unknown BMI,45-49 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary


In [29]:
# fixing SenNet
# Assuming subset_sankey is a pandas DataFrame
tool_replacement = "sc_transcriptomics with Cell Summary"

# Replace values in 'cell_type_annotation_tool' based on conditions for 'portal' and 'is_atlas_dataset'
subset_sankey.loc[
    (subset_sankey['portal'] == "SenNet") & (
        subset_sankey['is_atlas_dataset'] == "Atlas Dataset"),
    'cell_type_annotation_tool'
] = tool_replacement

In [30]:
# Map race/ethnicity values to race values
# Export donor_race as CSV
donor_race = pd.DataFrame(
    subset_sankey['donor_race'].unique(), columns=['donor_race'])

# Export to CSV
donor_race.to_csv('output/donor_race_python.csv', index=False)

# Read the manually mapped CSV file
race_mapped = pd.read_csv("data/donor_race_mapped.csv")

# Perform the left join (merge) on 'donor_race'
subset_sankey = pd.merge(subset_sankey, race_mapped,
                         on='donor_race', how='left')

# Replace 'donor_race' with 'mapped_donor_race' after the merge
subset_sankey['donor_race'] = subset_sankey['mapped_donor_race']

# Print the updated DataFrame
subset_sankey

Unnamed: 0,portal,donor_sex,organ_name,dataset_id,unique_dataset_id,cell_type_annotation_tool,donor_race,donor_bmi_binned,donor_age_binned,is_rui_registered,is_atlas_dataset,FTU,first_ctann_tool,mapped_donor_race
0,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth,Unknown Race
1,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,1-4 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth,Unknown Race
2,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth,Unknown Race
3,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth,Unknown Race
4,LungMap,Male,Respiratory System,https://api.cellxgene.cziscience.com/dp/v1/col...,https://api.cellxgene.cziscience.com/dp/v1/col...,sc_transcriptomics with Cell Summary,Unknown Race,Unknown BMI,30-34 Years,RUI-registered,Atlas Dataset,Alveolus of Lung,azimuth,Unknown Race
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572192,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,White,Unknown BMI,50-54 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary,White
182,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572193,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,Hispanic,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary,Hispanic
183,EBI - SCEA - Anatomogram,Female,Kidney,GSM4572194,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,White,Unknown BMI,60-64 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary,White
184,EBI - SCEA - Anatomogram,Male,Kidney,GSM4572195,https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi...,Seurat's label transfer,White,Unknown BMI,45-49 Years,Not RUI-registered,Not Atlas Dataset,Nephron,No Cell Summary,White


## Build nodes

In [31]:
# Grouping and summarizing by different columns
p = subset_sankey.groupby('portal').size().reset_index(name='count')

d = subset_sankey.groupby('donor_sex').size().reset_index(name='count')

a = subset_sankey.groupby('donor_age_binned').size().reset_index(name='count')

b = subset_sankey.groupby('donor_bmi_binned').size().reset_index(name='count')

r = subset_sankey.groupby('donor_race').size().reset_index(name='count')

o = subset_sankey.groupby('organ_name').size().reset_index(name='count')

f = subset_sankey.groupby('FTU').size().reset_index(name='count')

c = subset_sankey.groupby(
    'cell_type_annotation_tool').size().reset_index(name='count')

rui = subset_sankey.groupby(
    'is_rui_registered').size().reset_index(name='count')

atlas = subset_sankey.groupby(
    'is_atlas_dataset').size().reset_index(name='count')

first_ctann = subset_sankey.groupby(
    'first_ctann_tool').size().reset_index(name='count')


# Create list of unique names
unique_name = []
for df in [p, d, a, b, r, o, f, c, rui, atlas, first_ctann]:
    unique_name.extend(df.iloc[:, 0].tolist())

# Create empty DataFrame for nodes
nodes = pd.DataFrame({
    'name': unique_name
})

# Print nodes DataFrame
nodes

Unnamed: 0,name
0,EBI - SCEA - Anatomogram
1,GTEx
2,HuBMAP
3,KPMP
4,LungMap
5,SenNet
6,Female
7,Male
8,1-4 Years
9,15-19 Years


In [32]:
# Creating a DataFrame for nodes
nodes = pd.DataFrame({'name': unique_name})

# Add a new column 'index' with values starting from 0
nodes['index'] = range(len(nodes))
nodes

Unnamed: 0,name,index
0,EBI - SCEA - Anatomogram,0
1,GTEx,1
2,HuBMAP,2
3,KPMP,3
4,LungMap,4
5,SenNet,5
6,Female,6
7,Male,7
8,1-4 Years,8
9,15-19 Years,9


## Build links

In [33]:
# Grouping and summarizing with renaming columns
portal_sex = subset_sankey.groupby(
    ['portal', 'donor_age_binned']).size().reset_index(name='count')
portal_sex = portal_sex.rename(
    columns={'portal': 'source', 'donor_age_binned': 'target', 'count': 'value'})

sex_age = subset_sankey.groupby(
    ['donor_age_binned', 'donor_sex']).size().reset_index(name='count')
sex_age = sex_age.rename(
    columns={'donor_age_binned': 'source', 'donor_sex': 'target', 'count': 'value'})

age_bmi = subset_sankey.groupby(
    ['donor_sex', 'donor_bmi_binned']).size().reset_index(name='count')
age_bmi = age_bmi.rename(
    columns={'donor_sex': 'source', 'donor_bmi_binned': 'target', 'count': 'value'})

bmi_race = subset_sankey.groupby(
    ['donor_bmi_binned', 'donor_race']).size().reset_index(name='count')
bmi_race = bmi_race.rename(
    columns={'donor_bmi_binned': 'source', 'donor_race': 'target', 'count': 'value'})

race_organ = subset_sankey.groupby(
    ['donor_race', 'organ_name']).size().reset_index(name='count')
race_organ = race_organ.rename(
    columns={'donor_race': 'source', 'organ_name': 'target', 'count': 'value'})

organ_ftu = subset_sankey.groupby(
    ['organ_name', 'FTU']).size().reset_index(name='count')
organ_ftu = organ_ftu.rename(
    columns={'organ_name': 'source', 'FTU': 'target', 'count': 'value'})

ftu_rui = subset_sankey.groupby(
    ['FTU', 'first_ctann_tool']).size().reset_index(name='count')
ftu_rui = ftu_rui.rename(
    columns={'FTU': 'source', 'first_ctann_tool': 'target', 'count': 'value'})

rui_first = subset_sankey.groupby(
    ['first_ctann_tool', 'is_rui_registered']).size().reset_index(name='count')
rui_first = rui_first.rename(columns={
                             'first_ctann_tool': 'source', 'is_rui_registered': 'target', 'count': 'value'})

# Concatenate all group summaries to create prep_links
prep_links = pd.concat([portal_sex, sex_age, age_bmi, bmi_race,
                       race_organ, organ_ftu, ftu_rui, rui_first], ignore_index=True)

# The final links DataFrame
links = prep_links

links

Unnamed: 0,source,target,value
0,EBI - SCEA - Anatomogram,45-49 Years,1
1,EBI - SCEA - Anatomogram,50-54 Years,2
2,EBI - SCEA - Anatomogram,55-59 Years,2
3,EBI - SCEA - Anatomogram,60-64 Years,3
4,EBI - SCEA - Anatomogram,65-69 Years,1
...,...,...,...
91,Villus,celltypist,27
92,No Cell Summary,Not RUI-registered,10
93,azimuth,RUI-registered,111
94,celltypist,RUI-registered,55


In [34]:
nodes_for_source = nodes.rename(columns={nodes.columns[0]: "source"})
nodes_for_source

# Perform a left join on `prep_links` and `nodes` by the "source" column
prep_links_with_nodes = pd.merge(prep_links, nodes_for_source, how='left',
                                 left_on='source', right_on='source')

prep_links_with_nodes

# Rename the first column in `nodes` to "target"
nodes_for_target = nodes.rename(columns={nodes.columns[0]: "target"})
nodes_for_target


# Perform a left join on `prep_links` and `nodes` by the "target" column
prep_links_with_all = pd.merge(prep_links_with_nodes, nodes_for_target, how='left',
                               left_on='target', right_on='target')

prep_links_with_all

# Select specific columns from `prep_links` (columns 4, 5, and 3)
prep_links = prep_links_with_all.iloc[:, [3, 4, 2]]

# Rename the first two columns of `prep_links` to "source" and "target"
prep_links.columns = ['source', 'target', 'value']

# Rename the first column in `nodes` to "name"
nodes = nodes.rename(columns={nodes.columns[0]: "name"})

# Display the result
prep_links

Unnamed: 0,source,target,value
0,0,14,1
1,0,16,2
2,0,17,2
3,0,18,3
4,0,19,1
...,...,...,...
91,46,57,27
92,55,51,10
93,56,52,111
94,57,52,55


## Visualize

In [35]:
sankey_data = go.Sankey(
    node=dict(
        pad=30,  # Padding between nodes
        thickness=20,  # Node thickness
        line=dict(color="black", width=0.5),
        label=nodes['name'].tolist()  # Use the 'name' column as node labels
    ),
    link=dict(
        # Indices of source nodes from 'source_idx'
        source=prep_links['source'],
        # Indices of target nodes from 'target_idx'
        target=prep_links['target'],
        value=prep_links['value']  # Values for the links
    )
)

# Step 2: Create the layout for the Sankey diagram
layout = go.Layout(
    title="Datasets in FTUs",
    font=dict(size=18),
    width=1650,  # width in pixels
    height=800   # height in pixels
)

# Step 3: Create the figure and plot
fig = go.Figure(data=[sankey_data], layout=layout)
fig.show()

## Export

In [36]:
fig.write_html('../docs/ftu_sankey_atlas.html')