# LABELATOR data prep

## overview

This notebook prepares the data from the source `XYLENA` `AnnData` blob.

```python

XYLENA_ANNDATA = "brain_atlas_anndata.h5ad"

```

### actions

- add metadata
- create updated `AnnData` with metadata
- create a sparse data matrix version

- train & test versions

### Imports and scvi-tools installation (colab)

In [1]:
import sys

# Detect Google Colab by checking whether the `google.colab` module is already loaded.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    # Colab-only setup: remove typing_extensions, install the scvi-colab helper and
    # run its install() entry point, then install scrublet.
    !pip uninstall -y typing_extensions
    !pip install --quiet scvi-colab
    from scvi_colab import install
    install()
    !pip install --quiet scrublet

In [2]:
import sys
import warnings

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from pathlib import Path


In [3]:
# On Colab, mount Google Drive at /content/drive; the Colab data paths configured
# further down are rooted under /content/drive/MyDrive/.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [4]:
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)


# Default scanpy figure size: 4 x 4.
sc.set_figure_params(figsize=(4, 4))

# Inline figure rendering: white face colour, 'retina' figure format.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

## Data

### Load

In [5]:
# File names of the source inputs (resolved against the data directories later).
XYLENA_ANNDATA = "brain_atlas_anndata.h5ad"
XYLENA_METADATA = "final_metadata.csv"
# Alternate version made from a .csv export; it has a different gene set.
# NOTE: XYLENA_ANNDATA2 is reassigned to a different file name in the path-configuration cell.
XYLENA_ANNDATA2 = "brain_atlas_counts.h5ad"


In [6]:
# Resolve the data directories for the current environment.
#   data_path     -> where the processed train/test files are written
#   raw_data_path -> where the source AnnData, metadata CSV and sample lists are read,
#                    and where the intermediate *_updated / *_train / *_test files go
# raw_data_path must be bound in every branch: later cells use it unconditionally.
if IN_COLAB:
    root_path = Path("/content/drive/MyDrive/")
    data_path = root_path / "SingleCellModel/data"
    # NOTE(review): no separate raw directory is configured for Colab, so the raw
    # inputs are assumed to sit next to the processed data -- confirm.
    raw_data_path = data_path
else:
    root_path = Path("../")
    if sys.platform == "darwin":
        data_path = root_path / "data/xylena_raw"
        # NOTE(review): on macOS the single configured directory is the raw one,
        # so it is reused for raw inputs as well -- confirm.
        raw_data_path = data_path
    else:
        data_path = root_path / "data/scdata/xylena"
        raw_data_path = root_path / "data/scdata/xylena_raw"

# Input file names. XYLENA_ANNDATA2 now points at the "_updated" AnnData that this
# notebook writes after attaching the metadata.
XYLENA_ANNDATA = "brain_atlas_anndata.h5ad"
XYLENA_METADATA = "final_metadata.csv"
XYLENA_ANNDATA2 = "brain_atlas_anndata_updated.h5ad"

# Output names for the train / test splits, derived from the source file name.
XYLENA_TRAIN, XYLENA_TEST = (
    XYLENA_ANNDATA.replace(".h5ad", "_train_cnt.h5ad"),
    XYLENA_ANNDATA.replace(".h5ad", "_test_cnt.h5ad"),
)

# Sparse-matrix counterparts of the two split files.
XYLENA_TRAIN_SPARSE, XYLENA_TEST_SPARSE = (
    XYLENA_TRAIN.replace(".h5ad", "_sparse.h5ad"),
    XYLENA_TEST.replace(".h5ad", "_sparse.h5ad"),
)


In [7]:

# Full path of the source AnnData file; the bare exists() call displays whether it is present.
data_file = raw_data_path / XYLENA_ANNDATA
data_file.exists()

True

In [8]:

# Load the source AnnData object from disk.
raw_ad = ad.read_h5ad(data_file)


Check that the gene names are unique.

In [9]:
# Remember the gene names as loaded, then ask AnnData to make var_names unique (in place).
ogfeatures = raw_ad.var_names.to_list()
raw_ad.var_names_make_unique()

In [10]:

# Gene names after the uniqueness pass.
features = raw_ad.var_names.tolist()


In [11]:
# Names present both before and after the uniqueness pass. If all three counts
# agree, no gene name had to be altered.
shared_feats = list(set(features) & set(ogfeatures))
len(shared_feats), len(features), len(ogfeatures)

(3000, 3000, 3000)

### Add Metadata

In [12]:
# Per-cell metadata table from the CSV, plus a copy of the obs table that ships with the AnnData.
metadat = pd.read_csv(raw_data_path / XYLENA_METADATA)
og_metadat = raw_ad.obs.copy()

In [13]:
# Preview the first rows of the CSV metadata.
metadat.head()


Unnamed: 0,cells,doublet_score,nCount_RNA,nFeature_RNA,percent.mt,percent.rb,batch,sample,S.Score,G2M.Score,...,ExN1,InN2,MG3,Astro4,Oligo5,OPC6,VC7,type,UMAP_1,UMAP_2
0,GGCCTAATCGATTTAG-1_1,0.163312,21670,6217,0.687587,0.56299,batch1,KEN-1070-ARC,0.003546,-0.010272,...,0.74697,0.02923,-0.121564,-0.421587,-0.665052,-0.169264,-0.119527,Mature neurons,1.518145,-11.242935
1,TAGTAACGTAGTCAAT-1_1,0.143924,20190,5488,0.029718,0.307083,batch1,KEN-1070-ARC,0.034954,-0.022838,...,0.761065,0.0385,-0.098816,-0.45502,-0.384784,-0.388421,0.015812,Mature neurons,1.569603,-1.677851
2,GAAAGCCAGCAGCTCA-1_1,0.168777,17677,5687,0.797647,0.543079,batch1,KEN-1070-ARC,-0.021208,-0.012252,...,0.879119,0.083963,-0.122479,-0.364199,-0.294441,-0.305501,-0.124843,Mature neurons,6.405315,4.732371
3,ACTCACCTCCTCCCTC-1_1,0.097057,17612,4954,0.062457,0.255508,batch1,KEN-1070-ARC,-0.045867,0.005147,...,0.893122,0.067002,-0.10179,-0.407095,-0.665777,-0.354619,-0.102641,Mature neurons,1.445644,-1.882242
4,CTTCATCCAATCGCAC-1_1,0.120637,17250,4837,0.011594,0.202899,batch1,KEN-1070-ARC,-0.056202,-0.019759,...,0.867374,0.120805,-0.09397,-0.422024,-0.742585,0.131618,-0.095371,Mature neurons,0.464842,-10.888965


In [14]:
# Working copy of the AnnData obs table (indexed by cell barcode), previewed below.
obs = raw_ad.obs.copy()
obs.head()

Unnamed: 0_level_0,seurat_clusters,cell_type,sample
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GGCCTAATCGATTTAG-1_1,8,ExN,KEN-1070-ARC
TAGTAACGTAGTCAAT-1_1,2,ExN,KEN-1070-ARC
GAAAGCCAGCAGCTCA-1_1,2,ExN,KEN-1070-ARC
ACTCACCTCCTCCCTC-1_1,2,ExN,KEN-1070-ARC
CTTCATCCAATCGCAC-1_1,8,ExN,KEN-1070-ARC


Fix the 13 `cell_type` = "nan" entries

In [15]:
# Casting to str first makes missing labels appear as an explicit "nan" row in the counts.
obs['cell_type'].astype(str).value_counts()

Oligo    321846
ExN      161024
InN       78813
Astro     61165
OPC       41691
MG        37581
VC         7678
nan          13
Name: cell_type, dtype: int64

In [16]:
# Without the str cast, value_counts() leaves the missing labels out.
obs.cell_type.value_counts()

Oligo    321846
ExN      161024
InN       78813
Astro     61165
OPC       41691
MG        37581
VC         7678
Name: cell_type, dtype: int64

In [17]:


# Attach the CSV metadata to the obs table, matching rows on the cell barcode.
# Column names present in both tables keep the obs name unchanged and get an
# "_other" suffix for the CSV copy.
newmeta = obs.join(
    metadat.set_index("cells"),
    lsuffix='',
    rsuffix='_other',
)
newmeta.head()

Unnamed: 0_level_0,seurat_clusters,cell_type,sample,doublet_score,nCount_RNA,nFeature_RNA,percent.mt,percent.rb,batch,sample_other,...,ExN1,InN2,MG3,Astro4,Oligo5,OPC6,VC7,type,UMAP_1,UMAP_2
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GGCCTAATCGATTTAG-1_1,8,ExN,KEN-1070-ARC,0.163312,21670,6217,0.687587,0.56299,batch1,KEN-1070-ARC,...,0.74697,0.02923,-0.121564,-0.421587,-0.665052,-0.169264,-0.119527,Mature neurons,1.518145,-11.242935
TAGTAACGTAGTCAAT-1_1,2,ExN,KEN-1070-ARC,0.143924,20190,5488,0.029718,0.307083,batch1,KEN-1070-ARC,...,0.761065,0.0385,-0.098816,-0.45502,-0.384784,-0.388421,0.015812,Mature neurons,1.569603,-1.677851
GAAAGCCAGCAGCTCA-1_1,2,ExN,KEN-1070-ARC,0.168777,17677,5687,0.797647,0.543079,batch1,KEN-1070-ARC,...,0.879119,0.083963,-0.122479,-0.364199,-0.294441,-0.305501,-0.124843,Mature neurons,6.405315,4.732371
ACTCACCTCCTCCCTC-1_1,2,ExN,KEN-1070-ARC,0.097057,17612,4954,0.062457,0.255508,batch1,KEN-1070-ARC,...,0.893122,0.067002,-0.10179,-0.407095,-0.665777,-0.354619,-0.102641,Mature neurons,1.445644,-1.882242
CTTCATCCAATCGCAC-1_1,8,ExN,KEN-1070-ARC,0.120637,17250,4837,0.011594,0.202899,batch1,KEN-1070-ARC,...,0.867374,0.120805,-0.09397,-0.422024,-0.742585,0.131618,-0.095371,Mature neurons,0.464842,-10.888965


In [18]:
# Compare the shapes of the obs table, the joined table and the CSV metadata.
obs.shape, newmeta.shape, metadat.shape

((709811, 3), (709811, 25), (709811, 23))

In [19]:
# Columns whose name contains "other", i.e. the CSV columns that collided with an obs column in the join.
newmeta.filter(like="other")

Unnamed: 0_level_0,sample_other,seurat_clusters_other
cells,Unnamed: 1_level_1,Unnamed: 2_level_1
GGCCTAATCGATTTAG-1_1,KEN-1070-ARC,8
TAGTAACGTAGTCAAT-1_1,KEN-1070-ARC,2
GAAAGCCAGCAGCTCA-1_1,KEN-1070-ARC,2
ACTCACCTCCTCCCTC-1_1,KEN-1070-ARC,2
CTTCATCCAATCGCAC-1_1,KEN-1070-ARC,8
...,...,...
GTTGTGAGTCGCAATA-1_138,UMARY-933-ARC,3
GAAGTCAAGCCACAAT-1_138,UMARY-933-ARC,12
CTGGACCAGGCTGTGC-1_138,UMARY-933-ARC,3
TCCTCACAGGAGTAAT-1_138,UMARY-933-ARC,12


In [20]:
# Rows whose cell_type label is missing.
newmeta[newmeta.cell_type.isna()]

Unnamed: 0_level_0,seurat_clusters,cell_type,sample,doublet_score,nCount_RNA,nFeature_RNA,percent.mt,percent.rb,batch,sample_other,...,ExN1,InN2,MG3,Astro4,Oligo5,OPC6,VC7,type,UMAP_1,UMAP_2
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CGCTTACTCACAGCCA-1_2,1,,KEN-1092-ARC,0.031514,14630,4898,0.020506,0.287081,batch1,KEN-1092-ARC,...,0.872053,0.085859,-0.111636,-0.420828,-0.820315,-0.404582,-0.120384,Oligodendrocytes,4.294843,-6.207448
ATTGCTCGTTTGGGTA-1_3,1,,KEN-1095-ARC,0.118649,34437,6600,0.148097,0.209077,batch1,KEN-1095-ARC,...,0.635623,0.051017,-0.092411,-0.379622,-0.098312,-0.081187,-0.092319,Oligodendrocytes,3.644517,-4.374713
AAGCGAATCCTGAGTG-1_8,1,,KEN-1163-ARC,0.078659,17193,5123,0.0,0.203571,batch1,KEN-1163-ARC,...,0.516203,0.347202,-0.071235,-0.492124,-0.764765,-0.290564,-0.10461,Oligodendrocytes,4.560048,5.407686
CAGTATGGTCACCTAT-1_15,1,,KEN-1221-ARC,0.121194,4858,2373,0.123508,0.473446,batch2,KEN-1221-ARC,...,-0.112031,-0.004638,-0.060793,-0.268269,0.3768,-0.252827,-0.070174,Oligodendrocytes,2.938912,6.668988
GTTCTTGTCACAGGAA-1_16,1,,KEN-1229-ARC,0.155889,10412,3659,0.019209,0.182482,batch2,KEN-1229-ARC,...,0.245183,0.31858,-0.076302,-0.123405,-0.508391,2.110907,0.114409,Oligodendrocytes,-6.676789,-9.488004
TGCAGGCTCCTCACTA-1_19,1,,KEN-5034-ARC,0.030543,21067,5866,0.67404,1.17245,batch2,KEN-5034-ARC,...,0.729706,0.070132,-0.126831,-0.495572,-0.673278,-0.371601,-0.118928,Oligodendrocytes,2.926393,-8.648292
CTTAAGATCCTCCTAA-1_19,1,,KEN-5034-ARC,0.067103,17825,5606,0.516129,1.318373,batch2,KEN-5034-ARC,...,0.776774,0.080045,-0.124593,-0.431999,-0.734012,-0.375615,-0.138263,Oligodendrocytes,1.251569,-11.26194
GATGCGACACCGGCTA-1_19,1,,KEN-5034-ARC,0.047534,12142,4421,0.914182,1.095371,batch2,KEN-5034-ARC,...,0.992302,0.043303,-0.122686,-0.469985,-0.715996,-0.359051,-0.113387,Oligodendrocytes,3.566171,-4.397295
CCTTCGTAGGATGATG-1_39,1,,UMARY-1535-ARC,0.045777,3138,1572,0.350542,0.732951,batch2,UMARY-1535-ARC,...,-0.576912,-0.1646,-0.030638,-0.354151,1.433884,-0.182119,-0.043431,Oligodendrocytes,0.951665,5.399246
GCTAACAGTCACACCC-1_41,1,,UMARY-1541-ARC,0.089045,15218,5465,0.348272,2.037061,batch2,UMARY-1541-ARC,...,0.898336,0.107088,-0.137043,-0.369618,-0.337211,-0.300324,-0.145295,Oligodendrocytes,-12.127728,-9.488358


In [21]:
# For the rows with a missing cell_type: take the per-class score column holding the
# largest value and drop the trailing digit from its name (e.g. 'ExN1' -> 'ExN').
newmeta.loc[newmeta.cell_type.isna(),['ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5', 'OPC6', 'VC7']].idxmax(axis=1).str[:-1]

cells
CGCTTACTCACAGCCA-1_2        ExN
ATTGCTCGTTTGGGTA-1_3        ExN
AAGCGAATCCTGAGTG-1_8        ExN
CAGTATGGTCACCTAT-1_15     Oligo
GTTCTTGTCACAGGAA-1_16       OPC
TGCAGGCTCCTCACTA-1_19       ExN
CTTAAGATCCTCCTAA-1_19       ExN
GATGCGACACCGGCTA-1_19       ExN
CCTTCGTAGGATGATG-1_39     Oligo
GCTAACAGTCACACCC-1_41       ExN
CTTGCATGTTCGGGAT-1_66     Oligo
GTCTAACAGGTTAGCT-1_89       InN
CACATAATCAAACCGT-1_138      ExN
dtype: object

In [22]:
# Show the existing cell_type label next to the per-class score columns for the first rows.
newmeta.loc[:,['cell_type','ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5', 'OPC6', 'VC7']].head()

Unnamed: 0_level_0,cell_type,ExN1,InN2,MG3,Astro4,Oligo5,OPC6,VC7
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GGCCTAATCGATTTAG-1_1,ExN,0.74697,0.02923,-0.121564,-0.421587,-0.665052,-0.169264,-0.119527
TAGTAACGTAGTCAAT-1_1,ExN,0.761065,0.0385,-0.098816,-0.45502,-0.384784,-0.388421,0.015812
GAAAGCCAGCAGCTCA-1_1,ExN,0.879119,0.083963,-0.122479,-0.364199,-0.294441,-0.305501,-0.124843
ACTCACCTCCTCCCTC-1_1,ExN,0.893122,0.067002,-0.10179,-0.407095,-0.665777,-0.354619,-0.102641
CTTCATCCAATCGCAC-1_1,ExN,0.867374,0.120805,-0.09397,-0.422024,-0.742585,0.131618,-0.095371


In [23]:
# List every column of the joined metadata table.
newmeta.columns

Index(['seurat_clusters', 'cell_type', 'sample', 'doublet_score', 'nCount_RNA',
       'nFeature_RNA', 'percent.mt', 'percent.rb', 'batch', 'sample_other',
       'S.Score', 'G2M.Score', 'Phase', 'RNA_snn_res.0.3',
       'seurat_clusters_other', 'ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5',
       'OPC6', 'VC7', 'type', 'UMAP_1', 'UMAP_2'],
      dtype='object')

In [24]:
# Rows whose CSV `type` annotation is "Oligodendrocytes", to compare against `cell_type`.
newmeta[newmeta['type']=="Oligodendrocytes"]

Unnamed: 0_level_0,seurat_clusters,cell_type,sample,doublet_score,nCount_RNA,nFeature_RNA,percent.mt,percent.rb,batch,sample_other,...,ExN1,InN2,MG3,Astro4,Oligo5,OPC6,VC7,type,UMAP_1,UMAP_2
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATTTGCAAGGACCTTG-1_1,1,Oligo,KEN-1070-ARC,0.168777,16220,5423,0.184957,0.443896,batch1,KEN-1070-ARC,...,0.109137,0.295974,0.060602,0.326816,-0.405614,-0.172869,0.015316,Oligodendrocytes,6.587510,3.991664
CCACACAAGACAACAG-1_1,21,ExN,KEN-1070-ARC,0.158112,14483,4576,0.062142,0.352137,batch1,KEN-1070-ARC,...,0.736040,0.124407,-0.092193,-0.483688,-0.734667,-0.166714,-0.110623,Oligodendrocytes,0.150024,-15.214805
CCTTTAGTCTACCTCA-1_1,15,Oligo,KEN-1070-ARC,0.158112,13805,4410,0.144875,0.253531,batch1,KEN-1070-ARC,...,0.440131,0.103980,0.042900,0.574566,-0.786659,-0.379063,0.142551,Oligodendrocytes,9.710300,-4.016528
AAACAGCCAATTTAGC-1_1,15,Oligo,KEN-1070-ARC,0.163312,11458,4784,0.820388,0.890208,batch1,KEN-1070-ARC,...,0.589705,0.201384,-0.127507,0.460574,-0.639001,-0.131118,0.182299,Oligodendrocytes,12.352054,-1.716689
CTTCAAGCAGGCATCT-1_1,21,ExN,KEN-1070-ARC,0.158112,11235,4113,0.987984,0.445038,batch1,KEN-1070-ARC,...,0.770459,0.120954,-0.098665,-0.493509,-0.276506,-0.228136,-0.072055,Oligodendrocytes,1.167779,-15.360985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCTGACCAGTTATGGC-1_138,0,Oligo,UMARY-933-ARC,0.069502,1058,869,0.094518,4.631380,batch5,UMARY-933-ARC,...,0.538433,-0.133870,-0.075419,0.060038,1.452351,-0.161333,-0.108025,Oligodendrocytes,-2.408727,7.025146
TGAGGGAGTAGCCATA-1_138,21,MG,UMARY-933-ARC,0.012398,1044,711,0.383142,2.969349,batch5,UMARY-933-ARC,...,-0.311351,-0.123546,0.617438,-0.201591,-0.303397,-0.166077,-0.050781,Oligodendrocytes,0.384808,-13.859167
CAGCTATAGCTCCTTA-1_138,12,Oligo,UMARY-933-ARC,0.057813,1048,903,0.190840,3.435115,batch5,UMARY-933-ARC,...,-0.060555,-0.145141,0.469202,0.511153,0.872755,-0.174964,0.524642,Oligodendrocytes,-4.269923,-3.157500
GAAGTCAAGCCACAAT-1_138,12,ExN,UMARY-933-ARC,0.027593,1022,788,0.782779,2.348337,batch5,UMARY-933-ARC,...,0.343290,0.242140,-0.081679,0.112211,-0.311931,-0.171589,-0.134319,Oligodendrocytes,-4.665269,-3.478462


In [25]:
# Label counts in the joined table, with missing labels shown as "nan" via the str cast.
newmeta['cell_type'].astype(str).value_counts()


Oligo    321846
ExN      161024
InN       78813
Astro     61165
OPC       41691
MG        37581
VC         7678
nan          13
Name: cell_type, dtype: int64

In [26]:
# Label counts in the joined table, missing labels left out.
newmeta['cell_type'].value_counts()


Oligo    321846
ExN      161024
InN       78813
Astro     61165
OPC       41691
MG        37581
VC         7678
Name: cell_type, dtype: int64

In [7]:
# Locations of the sample lists that define the clean set and the train/test split.
# The "all_samples_199" and "dirty_samples_61" lists from the same
# "Model Combinations" export are intentionally not loaded here.
clean_samples_path = raw_data_path / "Model Combinations - clean_samples_138.csv"
test_samples_path = raw_data_path / "Model Combinations - testing_set_41.csv"
train_samples_path = raw_data_path / "Model Combinations - training_set_98.csv"

# Read the three lists in the same order as before: clean, test, train.
clean_samples = pd.read_csv(clean_samples_path)
test_samples = pd.read_csv(test_samples_path)
train_samples = pd.read_csv(train_samples_path)

clean_samples.head()



Unnamed: 0,sample,batch
0,KEN-1070-ARC,batch1
1,KEN-1092-ARC,batch1
2,KEN-1095-ARC,batch1
3,KEN-1127-ARC,batch1
4,KEN-1132-ARC,batch1


In [9]:
# Count how often each (sample, batch) row occurs in the clean-sample list.
clean_samples.value_counts()

sample          batch 
KEN-1066-ARC    batch5    1
UMARY-1935-ARC  batch3    1
UMARY-1845-ARC  batch5    1
UMARY-1849-ARC  batch3    1
UMARY-1859-ARC  batch3    1
                         ..
UMARY-1135-ARC  batch1    1
UMARY-1134-ARC  batch5    1
UMARY-1133-ARC  batch5    1
SH-98-23-ARC    batch5    1
UMARY-933-ARC   batch5    1
Length: 138, dtype: int64

In [29]:
# Flag every cell by whether its sample appears in the clean / test / train lists.
# Series.isin does the membership test in one vectorised pass; the previous list
# comprehension rebuilt the lookup set once per cell. to_numpy() assigns the
# booleans positionally, exactly as the plain list did.
newmeta['clean'] = newmeta['sample'].isin(clean_samples['sample']).to_numpy()
newmeta['test'] = newmeta['sample'].isin(test_samples['sample']).to_numpy()
newmeta['train'] = newmeta['sample'].isin(train_samples['sample']).to_numpy()



### WARNING:  

Fix the missing `cell_type` values by inferring each label from the class logit columns in `newmeta` (`ExN1` … `VC7`): the column with the largest value gives the label. It is not known why these labels were missing.

In [30]:
# Fill in the missing cell_type labels: for each affected row take the per-class
# score column with the largest value and strip the trailing digit from its name
# (e.g. 'Oligo5' -> 'Oligo').
missing_cell_type = newmeta.cell_type.isna()
class_score_columns = ['ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5', 'OPC6', 'VC7']
inferred_labels = newmeta.loc[missing_cell_type, class_score_columns].idxmax(axis=1).str[:-1]
newmeta.loc[missing_cell_type, 'cell_type'] = inferred_labels

In [31]:
# Disabled: dropping the duplicated "*_other" columns produced by the join.
# newmeta = newmeta.drop(columns=['seurat_clusters_other', 'sample_other'])
# Helper column pairing the `cell_type` label with the CSV `type` annotation, so the
# two labelings can be cross-tabulated.
newmeta['tmp']=newmeta['cell_type'].astype(str) + " " + newmeta['type'].astype(str)


In [32]:
# Count each (cell_type, type) combination.
newmeta.tmp.value_counts()

Oligo Oligodendrocytes                   304502
ExN Mature neurons                       130801
InN GABAergic neurons                     78684
Astro Astrocytes                          61041
OPC Oligodendrocyte precursor cells       40036
MG Microglial cells                       37139
ExN Glutamatergic neurons                 27071
Oligo Mature neurons                      10579
VC Endothelial cells                       7012
Oligo Oligodendrocyte precursor cells      3928
ExN GABAergic neurons                      1678
ExN Oligodendrocytes                       1456
Oligo Astrocytes                           1252
Oligo GABAergic neurons                    1140
OPC Mature neurons                          863
OPC Oligodendrocytes                        750
VC Microglial cells                         499
MG Astrocytes                               270
Oligo Glutamatergic neurons                 237
MG Oligodendrocytes                         167
Oligo Endothelial cells                 

In [33]:
# Every cell in the AnnData must belong to a sample on the clean list.
assert newmeta['clean'].sum() == raw_ad.shape[0]



In [34]:
# Update the AnnData: replace its obs table with the joined, label-completed metadata.

raw_ad.obs = newmeta


In [35]:
# Give `var` at least one column (a copy of the gene names) to hedge against
# issues with an empty var table.

raw_ad.var['feat'] = raw_ad.var_names

DEPRECATED: code empty `cell_type` as "Unknown". The missing labels are now inferred above instead.

In [54]:
# raw_ad.obs['cell_type'].cat.add_categories('Unknown', inplace=True)
# raw_ad.obs.loc[raw_ad.obs['cell_type'].isnull(),'cell_type'] = "Unknown"

In [36]:
# Label counts after the update; no "nan" row should remain.
raw_ad.obs['cell_type'].astype(str).value_counts()

Oligo    321849
ExN      161032
InN       78814
Astro     61165
OPC       41692
MG        37581
VC         7678
Name: cell_type, dtype: int64

In [37]:
# Write the AnnData with the updated obs table next to the source file, with an "_updated" suffix.
outfilen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_updated.h5ad")

raw_ad.write_h5ad(outfilen)


# add categories for all batch, and sample


In [38]:
# Peek at the top-left 10 x 10 corner of the data matrix.
raw_ad.X[:10,:10]

array([[ 0.,  0.,  0.,  2.,  0., 15.,  0.,  1.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0., 36.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  7.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  1.,  0., 34.,  0.,  0.,  2.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 31.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  1.,  0., 24.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 48.,  0.,  1.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 41.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 20.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 32.,  0.,  0.,  0.,  0.]], dtype=float32)

# make train and test adatas

In [39]:
# Select cells with the boolean train / test flags and copy each selection into its own AnnData.
train_ad = raw_ad[raw_ad.obs['train']].copy()
test_ad = raw_ad[raw_ad.obs['test']].copy()


In [48]:
# Check the split while `raw_ad` still exists: the number of cells flagged test / train
# in the full object must equal the number of rows in each subset. The comparison has
# to run before the deletes below, otherwise a fresh top-to-bottom run raises NameError.
split_sizes_match = (
    raw_ad.obs.test.sum() == test_ad.shape[0],
    raw_ad.obs.train.sum() == train_ad.shape[0],
)

# Release the full AnnData and the intermediate metadata tables.
del raw_ad
del metadat, obs, og_metadat

split_sizes_match


(True, True)

In [41]:
# cell_type categories declared in the training set but not in the test set (empty set = none).
set(train_ad.obs['cell_type'].cat.categories.tolist()) - set(test_ad.obs['cell_type'].cat.categories.tolist())

set()

In [46]:

# Write the training subset to the raw data directory with a "_train" suffix.
outfilen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_train.h5ad")
train_ad.write_h5ad(outfilen)


In [43]:

# Write the test subset to the raw data directory with a "_test" suffix.
outfilen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_test.h5ad")
test_ad.write_h5ad(outfilen)

Reload the data and compute the principal components (PCs).

In [50]:
# Drop both in-memory subsets; they are read back from disk in the following cells.
del train_ad, test_ad

Train first

In [51]:
# Read the "_train" file written earlier back from disk.
filen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_train.h5ad")
train_ad = ad.read_h5ad(filen)

In [52]:
# Compute principal components on the training data with scanpy's default settings;
# the results are stored on train_ad itself (obsm 'X_pca', varm 'PCs', uns 'pca').
sc.pp.pca(train_ad)

In [53]:
# Display the AnnData summary to confirm the PCA fields were added.
train_ad

AnnData object with n_obs × n_vars = 502085 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample', 'doublet_score', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rb', 'batch', 'sample_other', 'S.Score', 'G2M.Score', 'Phase', 'RNA_snn_res.0.3', 'seurat_clusters_other', 'ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5', 'OPC6', 'VC7', 'type', 'UMAP_1', 'UMAP_2', 'clean', 'test', 'train', 'tmp'
    var: 'feat'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [54]:
# The *_cnt files are the versions that carry the computed PCA results.
# They go to data_path (the processed-data directory), not raw_data_path.
outfilen = data_path / XYLENA_TRAIN
train_ad.write_h5ad(outfilen)


Now the test data

In [55]:
# Read the "_test" file written earlier back from disk.
filen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_test.h5ad")
test_ad = ad.read_h5ad(filen)

# Display the summary; at this point the test set has no PCA fields yet.
test_ad

AnnData object with n_obs × n_vars = 207726 × 3000
    obs: 'seurat_clusters', 'cell_type', 'sample', 'doublet_score', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rb', 'batch', 'sample_other', 'S.Score', 'G2M.Score', 'Phase', 'RNA_snn_res.0.3', 'seurat_clusters_other', 'ExN1', 'InN2', 'MG3', 'Astro4', 'Oligo5', 'OPC6', 'VC7', 'type', 'UMAP_1', 'UMAP_2', 'clean', 'test', 'train', 'tmp'
    var: 'feat'

In [56]:
# Copy the PCs fitted on the training set over to the test set and compute the
# test-set loadings, so both sets share the same components.
# NOTE(review): this describes the intent; transfer_pcs is a project helper whose
# exact behaviour is not visible here -- confirm against lbl8r.utils.
from lbl8r.utils import transfer_pcs

test_ad = transfer_pcs(train_ad, test_ad)

In [57]:
# The *_cnt files are the versions that carry the PCA results; written to data_path.
outfilen = data_path / XYLENA_TEST
test_ad.write_h5ad(outfilen)


Make "sparse" versions to control memory


In [58]:
import scipy.sparse

# Target file: XYLENA_TRAIN_SPARSE holds the training file name with the
# "_sparse" suffix already applied.
outfilen = data_path / XYLENA_TRAIN_SPARSE

# Swap the training matrix for a CSR sparse copy, then write the sparse version.
train_ad.X = scipy.sparse.csr_matrix(train_ad.X)
train_ad.write_h5ad(outfilen)

In [59]:
# Target file: XYLENA_TEST_SPARSE holds the test file name with the "_sparse"
# suffix already applied.
outfilen = data_path / XYLENA_TEST_SPARSE

# Swap the test matrix for a CSR sparse copy, then write the sparse version.
test_ad.X = scipy.sparse.csr_matrix(test_ad.X)
test_ad.write_h5ad(outfilen)

In [60]:
# Release both subsets before loading the full dataset again.
del test_ad
del train_ad


In [61]:
# RAW / FULL: sparse version of the complete (unsplit) dataset.
# XYLENA_ANNDATA2 here is the value set in the path-configuration cell, i.e. the
# "_updated" AnnData written earlier in this notebook.

full_ad = ad.read_h5ad(raw_data_path / XYLENA_ANNDATA2)

# Replace the data matrix with a CSR sparse copy.
full_ad.X = scipy.sparse.csr_matrix(full_ad.X)

# Written next to the source file with a "_sparse" suffix, then released.
outfilen = raw_data_path / XYLENA_ANNDATA.replace(".h5ad", "_sparse.h5ad")
full_ad.write_h5ad(outfilen)
del full_ad