## Setting up environment

In [86]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
%matplotlib inline

In [3]:
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

## Loading data

In [51]:
data = pd.read_csv("https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/main/output-data/v0.5/reports/atlas/validation-v5.csv")
data.head()

Unnamed: 0,consortium_name,dataset,tool,modality,reported_organ,organ,organId,rui_location_volume,cell_id,cell_label,cell_count,percentage
0,NHLBI/LungMap,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,sc_bulk,http://purl.obolibrary.org/obo/UBERON_0002048,respiratory system,UBERON:0001004,75.0,CL:4028006,alveolar type 2 fibroblast cell,832,0.167404
1,NHLBI/LungMap,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,sc_bulk,http://purl.obolibrary.org/obo/UBERON_0002048,respiratory system,UBERON:0001004,75.0,CL:4028004,alveolar type 1 fibroblast cell,742,0.149296
2,NHLBI/LungMap,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,sc_bulk,http://purl.obolibrary.org/obo/UBERON_0002048,respiratory system,UBERON:0001004,75.0,CL:0000583,alveolar macrophage,693,0.139437
3,NHLBI/LungMap,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,sc_bulk,http://purl.obolibrary.org/obo/UBERON_0002048,respiratory system,UBERON:0001004,75.0,CL:0002144,capillary endothelial cell,684,0.137626
4,NHLBI/LungMap,https://api.cellxgene.cziscience.com/dp/v1/col...,azimuth,sc_bulk,http://purl.obolibrary.org/obo/UBERON_0002048,respiratory system,UBERON:0001004,75.0,CL:0002553,fibroblast of lung:alveolar,637,0.128169


In [52]:
#Update organ names for clarity for labels
data.organ = data.organ.str.replace("left ","", regex=True)
data.organ = data.organ.str.replace("right ","", regex=True)
data.organ = data.organ.str.replace("Set of ","", regex=True)
data.organ = data.organ.str.title()
data.organ = data.organ.str.replace("In","in", regex=True)

# Set revised values to categories
data.organ = data.organ.astype("category")
data.organId = data.organId.astype("category")
data.reported_organ = data.reported_organ.astype("category")
data.consortium_name = data.consortium_name.astype("category")
data.dataset = data.dataset.astype("category")
data.modality = data.modality.astype("category")
data.cell_id = data.cell_id.astype("category")
data.cell_label = data.cell_label.astype("category")

## Create Pivot tables using datasets and cell_ids.

The first pivot table collects the measured cell type counts for each dataset. The second pivot table collects the percentage of cell types for each dataset. Both use the minimum aggregation function, as is only one measurement per cell type per data set.

In [73]:
data_celltype = data.pivot_table(index='dataset',
                                 columns='cell_id',
                                 values='cell_count',
                                 aggfunc='min',
                                 fill_value='0',
                                 observed=True,
                                 margins=False)

In [74]:
data_cellperc = data.pivot_table(index='dataset',
                                 columns='cell_id',
                                 values='percentage',
                                 aggfunc='min',
                                 fill_value='0',
                                 observed=True,
                                 margins=False)

In [120]:
data_organs = data[['dataset','organ']].drop_duplicates()




                                                 dataset               organ
0      https://api.cellxgene.cziscience.com/dp/v1/col...  Respiratory System
102    https://api.cellxgene.cziscience.com/dp/v1/col...  Respiratory System
219    https://api.cellxgene.cziscience.com/dp/v1/col...  Respiratory System
331    https://api.cellxgene.cziscience.com/dp/v1/col...  Respiratory System
446    https://api.cellxgene.cziscience.com/dp/v1/col...  Respiratory System
...                                                  ...                 ...
36407  https://entity.api.hubmapconsortium.org/entiti...              Kidney
36448  https://entity.api.hubmapconsortium.org/entiti...     Large intestine
36463  https://entity.api.hubmapconsortium.org/entiti...     Urinary Bladder
36477  https://entity.api.sennetconsortium.org/entiti...               Liver
36505  https://entity.api.sennetconsortium.org/entiti...               Liver

[707 rows x 2 columns]


In [83]:
scaled_penguin_data = StandardScaler().fit_transform(data_celltype)

In [90]:
reducer = umap.UMAP()
embedding = reducer.fit_transform(scaled_penguin_data)
embedding.shape

(707, 2)

In [121]:
plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=[sns.color_palette()[x] for x in data_organs.organ.map({"Heart":0, "Kidney":1, "Lactiferous Glands in Breast":2, "Large intestine":3,
                                                              "Liver":4, "Male Reproductive System":5, "Mesenteric Lymph Node":6, "Respiratory System":7,
                                                              "Skin Of Body":8, "Small intestine":9, "Spleen":10, "Uterer":11, "Urinary Bladder":12})])
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the HRA-Pop Cell Type dataset', fontsize=24);

TypeError: list indices must be integers or slices, not float