# Create anndata object with counts for:

# all our BM, PB and mPB samples

In [1]:
%matplotlib widget

import warnings
warnings.filterwarnings('ignore')

import os, sys, math, re
import natsort, json, operator, getpass
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import scanpy as sc
import scanpy.external as sce

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets

In [2]:
# Read the label metadata stored under /.singularity.d and display its
# 'Version' entry, so the notebook records which image version it ran in.
labels_path = Path('/.singularity.d/labels.json')
singularity = json.loads(labels_path.read_text())

singularity['Version']

'metztli.25j'

In [3]:
sc.settings.verbosity = 3             # show some output
sc.settings.file_format_figs = 'svg'  # set this to 'svg' (notebook) or 'pdf' (files) if you want vector graphics
# The setting that controls automatic saving of figures is `autosave`;
# assigning to `savefigs` only creates an attribute that scanpy never reads.
sc.settings.autosave = False

In [4]:
# Login name and home directory (as a plain string) of the current user.
user = getpass.getuser()
home = os.fspath(Path.home())

In [5]:
def perma_plot(**savefig_kwargs):
    """
    Create a persistent PNG image from the last plotted matplotlib
    plot widget.

    The current figure is rendered to an in-memory PNG, base64-encoded and
    embedded in an ``<img>`` tag, so the picture is stored in the cell output
    itself.

    Parameters
    ----------
    **savefig_kwargs
        Optional keyword arguments forwarded to ``plt.savefig`` (for example
        ``dpi`` or ``bbox_inches``) to control the quality of the image.
        ``format`` is always ``'png'`` and must not be passed.

    Returns
    -------
    IPython.display.HTML
        HTML object holding the ``<img>`` tag with the embedded PNG data.
    """
    import base64
    from io import BytesIO
    from IPython.display import HTML

    # Render into memory instead of a file on disk.
    buffer = BytesIO()
    plt.savefig(buffer, format='png', **savefig_kwargs)

    # getvalue() returns the whole buffer regardless of the stream position.
    encoded_string = base64.b64encode(buffer.getvalue()).decode('utf-8')
    html_string = "<img src='data:image/png;base64,{}'>".format(encoded_string)

    return HTML(html_string)
    

In [6]:
# Per-user project directory, built from the user's home directory and login name.
basedir = os.path.join(home, 'databoard/users', user, '2021/BloodPaper/')

# Point scanpy's write directory at the project's h5ad/ folder.
sc.settings.writedir = os.path.join(basedir, 'h5ad/')

In [7]:
# Date stamp (YYYYMMDD) of this run; it is prepended to the names of the
# files written by this notebook.
now = datetime.now()
prefix = format(now, '%Y%m%d')
print(prefix)

20211110


In [8]:
# Load the annotated reference dataset by name (no path or extension given).
# NOTE(review): presumably resolved against sc.settings.writedir — confirm.
reference = sc.read('20211102_COMBO10_annotated_reference_filtered_and_clean')

In [9]:
reference

AnnData object with n_obs × n_vars = 117200 × 24332
    obs: 'batch', 'n_counts', 'n_genes', 'library', 'donor', 'organ', 'leiden.1.2', 'annot', 'silhouette.1.2', 'is_root', 'dpt_pseudotime', 'dpt_pseudotime_rank', 'S_score', 'G2M_score', 'phase'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'annot_colors', 'dex_leiden_1_2', 'diffmap_evals', 'donor_colors', 'draw_graph', 'hvg', 'iroot', 'leiden', 'leiden.1.2_colors', 'log1p', 'neighbors', 'phase_colors'
    obsm: 'X_diffmap', 'X_draw_graph_fa', 'X_pca', 'X_umap'
    layers: 'counts', 'lognorm'
    obsp: 'connectivities', 'distances'

Subset the reference to just the BM and PB cells

In [10]:
# Keep only the reference cells whose organ is BM or PB, as an independent
# copy, and show the resulting dimensions.
is_bm_or_pb = reference.obs['organ'].isin(['BM', 'PB'])
ref = reference[is_bm_or_pb].copy()
ref.shape

(95132, 24332)

In [11]:
# Check the value range of ref.X; the non-integer maximum in the recorded
# output suggests X holds transformed values rather than raw counts.
ref.X.min(), ref.X.max()

(0.0, 9.661618)

In [16]:
ref.obs.head()

Unnamed: 0,batch,n_counts,n_genes,library,donor,organ,leiden.1.2,annot,silhouette.1.2,is_root,dpt_pseudotime,dpt_pseudotime_rank,S_score,G2M_score,phase
_01_AAACCTGCACGTCTCT.1.0,0,10052.0,2399,SIGAD9,DOD1,PB,2,02_MEP2,-0.095935,False,0.224698,66554.0,-0.156682,-0.06228,G1
_01_AAACCTGGTGTGACGA.1.0,0,14884.0,3610,SIGAD9,DOD1,PB,13,13_MEP cycle,0.295117,False,0.396628,85875.0,0.383165,0.065524,S
_01_AAACCTGGTTCCGGCA.1.0,0,5547.0,1544,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.198961,False,0.082326,23761.0,-0.166414,-0.085699,G1
_01_AAACGGGGTCAAAGCG.1.0,0,4184.0,1159,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.031663,False,0.097205,34985.0,-0.096252,-0.099328,G1
_01_AAAGATGCACATGACT.1.0,0,12027.0,2548,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.034625,False,0.093352,32259.0,-0.135317,-0.064835,G1


---

---

In [12]:
# Load the concatenated mPB target dataset by name (no path or extension given).
# NOTE(review): presumably resolved against sc.settings.writedir — confirm.
target = sc.read('20211108_CONCAT_4x_mPB_lognorm')

In [13]:
# Check the value range of target.X; the non-integer maximum in the recorded
# output suggests X holds transformed values rather than raw counts.
target.X.min(), target.X.max()

(0.0, 11.212971)

In [14]:
target.shape

(28026, 20187)

In [15]:
target.obs.head()

Unnamed: 0,organ,library,doublet_scores,barcode,status,assignment,log_prob_singleton,log_prob_doublet,cluster0,cluster1,genotype,n_counts,n_counts_log,n_genes,percent_mitoc,leiden.1.0,donor,annot,dataset
AAACCCAAGGAGACCT-1-0,PB,SITTA6,0.228916,AAACCCAAGGAGACCT-1,singlet,0,-489.115073,-686.336309,-489.115073,-1615.466288,0,21339.0,4.329174,3882,0.036459,0,a_0,06_MyP1,0
AAACCCAAGGGAGATA-1-0,PB,SITTA6,0.242637,AAACCCAAGGGAGATA-1,singlet,0,-358.557057,-481.137968,-358.557057,-1107.146993,0,17929.0,4.253556,3197,0.050867,2,a_0,06_MyP1,0
AAACCCAAGTTGCTCA-1-0,PB,SITTA6,0.142056,AAACCCAAGTTGCTCA-1,singlet,1,-1035.739013,-1471.492468,-3158.403231,-1035.739013,1,25065.0,4.399068,4920,0.055057,9,a_1,09_MyP2,0
AAACCCAGTACGGCAA-1-0,PB,SITTA6,0.175573,AAACCCAGTACGGCAA-1,singlet,0,-607.267486,-811.240879,-607.267486,-1754.079661,0,18397.0,4.264747,3681,0.068435,6,a_0,06_MyP1,0
AAACCCAGTCATGCAT-1-0,PB,SITTA6,0.145299,AAACCCAGTCATGCAT-1,singlet,0,-460.927737,-628.601506,-460.927737,-1328.441485,0,15533.0,4.191256,3272,0.040816,0,a_0,05_HSC/MPP3,0


Adjust the cell names in the index for 'Seurat compliance' by replacing every '-' with '.'

In [19]:
# Swap every '-' in the target cell names for '.'.
seurat_safe_names = [barcode.replace('-', '.') for barcode in target.obs.index]
target.obs.index = seurat_safe_names

In [20]:
# Stack the reference and target cells into one AnnData object. The new obs
# column 'groupset' labels each cell as 'ref' or 'target'; index_unique=None
# leaves the existing cell names as they are.
# NOTE(review): newer anndata releases recommend anndata.concat over
# AnnData.concatenate — confirm before upgrading the environment.
combo = ref.concatenate(target, batch_key='groupset', batch_categories=['ref', 'target'], index_unique=None)

In [21]:
combo

AnnData object with n_obs × n_vars = 123158 × 20169
    obs: 'batch', 'n_counts', 'n_genes', 'library', 'donor', 'organ', 'leiden.1.2', 'annot', 'silhouette.1.2', 'is_root', 'dpt_pseudotime', 'dpt_pseudotime_rank', 'S_score', 'G2M_score', 'phase', 'doublet_scores', 'barcode', 'status', 'assignment', 'log_prob_singleton', 'log_prob_doublet', 'cluster0', 'cluster1', 'genotype', 'n_counts_log', 'percent_mitoc', 'leiden.1.0', 'dataset', 'groupset'
    var: 'gene_ids', 'n_cells-ref', 'highly_variable-ref', 'means-ref', 'dispersions-ref', 'dispersions_norm-ref', 'feature_types-target', 'genome-target', 'n_cells-0-target', 'highly_variable-0-target', 'means-0-target', 'dispersions-0-target', 'dispersions_norm-0-target', 'mean-0-target', 'std-0-target', 'n_cells-1-target', 'highly_variable-1-target', 'means-1-target', 'dispersions-1-target', 'dispersions_norm-1-target', 'mean-1-target', 'std-1-target'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts', 'lognorm'

In [22]:
combo.obs.head()

Unnamed: 0,batch,n_counts,n_genes,library,donor,organ,leiden.1.2,annot,silhouette.1.2,is_root,...,log_prob_singleton,log_prob_doublet,cluster0,cluster1,genotype,n_counts_log,percent_mitoc,leiden.1.0,dataset,groupset
_01_AAACCTGCACGTCTCT.1.0,0,10052.0,2399,SIGAD9,DOD1,PB,2,02_MEP2,-0.095935,False,...,,,,,,,,,,ref
_01_AAACCTGGTGTGACGA.1.0,0,14884.0,3610,SIGAD9,DOD1,PB,13,13_MEP cycle,0.295117,False,...,,,,,,,,,,ref
_01_AAACCTGGTTCCGGCA.1.0,0,5547.0,1544,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.198961,False,...,,,,,,,,,,ref
_01_AAACGGGGTCAAAGCG.1.0,0,4184.0,1159,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.031663,False,...,,,,,,,,,,ref
_01_AAAGATGCACATGACT.1.0,0,12027.0,2548,SIGAD9,DOD1,PB,0,00_HSC/MPP2,0.034625,False,...,,,,,,,,,,ref


In [23]:
combo.obs.tail()

Unnamed: 0,batch,n_counts,n_genes,library,donor,organ,leiden.1.2,annot,silhouette.1.2,is_root,...,log_prob_singleton,log_prob_doublet,cluster0,cluster1,genotype,n_counts_log,percent_mitoc,leiden.1.0,dataset,groupset
TTTGTTGCAAGCTACT.1.1,,27911.0,4841,SITTC8,b_1,PB,,07_MPP-to-CMP,,,...,-1275.60993,-1605.20313,-3526.124283,-1275.60993,1,4.445776,0.038121,1,1,target
TTTGTTGCACACGGTC.1.1,,29528.0,5371,SITTC8,b_0,PB,,06_MyP1,,,...,-1277.471931,-1836.788855,-1277.471931,-3733.587694,0,4.470234,0.044771,0,1,target
TTTGTTGGTGGTCTAT.1.1,,9930.0,2662,SITTC8,b_0,PB,,06_MyP1,,,...,-409.00824,-598.676178,-409.00824,-1195.806278,0,3.996949,0.037664,0,1,target
TTTGTTGTCACTTGTT.1.1,,17219.0,3893,SITTC8,b_0,PB,,06_MyP1,,,...,-816.248605,-1210.665789,-816.248605,-2389.389793,0,4.236008,0.041001,0,1,target
TTTGTTGTCGCAGTCG.1.1,,12823.0,3016,SITTC8,b_0,PB,,00_HSC/MPP2,,,...,-428.890376,-616.4889,-428.890376,-1231.941425,0,4.10799,0.057709,2,1,target


In [24]:
# Gene symbols that are removed from the combined object further down.
# NOTE(review): the provenance of this list is not recorded here — confirm source.
sex_linked = ['RPS4Y1','NACA2','RPL10L','TIPIN','ZNF90','UQCRHL','DDX3Y','EIF1AY',
              'MIF-AS1','ATP5L2','GREM1','EDARADD','AC009501.4','NBEAL1','MTRNR2L12',
              'FKBP1C','AC090498.1','NHSL2','LRRC69','MTRNR2L8','HNRNPA1L2','PABPC3',
              'RP11-302B13.5','RP5-940J5.9','EIF5AL','XIST']

In [25]:
# Drop every gene listed in sex_linked; the remaining genes keep their order.
keep_gene = ~combo.var.index.isin(sex_linked)
combo = combo[:, keep_gene].copy()

In [26]:
combo[:, combo.var.index].X

<123158x20150 sparse matrix of type '<class 'numpy.float32'>'
	with 284958961 stored elements in Compressed Sparse Row format>

In [27]:
combo.obs.columns

Index(['batch', 'n_counts', 'n_genes', 'library', 'donor', 'organ',
       'leiden.1.2', 'annot', 'silhouette.1.2', 'is_root', 'dpt_pseudotime',
       'dpt_pseudotime_rank', 'S_score', 'G2M_score', 'phase',
       'doublet_scores', 'barcode', 'status', 'assignment',
       'log_prob_singleton', 'log_prob_doublet', 'cluster0', 'cluster1',
       'genotype', 'n_counts_log', 'percent_mitoc', 'leiden.1.0', 'dataset',
       'groupset'],
      dtype='object')

In [29]:
combo.obs[['groupset', 'library', 'donor', 'organ', 'annot']]

Unnamed: 0,groupset,library,donor,organ,annot
_01_AAACCTGCACGTCTCT.1.0,ref,SIGAD9,DOD1,PB,02_MEP2
_01_AAACCTGGTGTGACGA.1.0,ref,SIGAD9,DOD1,PB,13_MEP cycle
_01_AAACCTGGTTCCGGCA.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2
_01_AAACGGGGTCAAAGCG.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2
_01_AAAGATGCACATGACT.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2
...,...,...,...,...,...
TTTGTTGCAAGCTACT.1.1,target,SITTC8,b_1,PB,07_MPP-to-CMP
TTTGTTGCACACGGTC.1.1,target,SITTC8,b_0,PB,06_MyP1
TTTGTTGGTGGTCTAT.1.1,target,SITTC8,b_0,PB,06_MyP1
TTTGTTGTCACTTGTT.1.1,target,SITTC8,b_0,PB,06_MyP1


In [30]:
# Independent per-cell metadata table with the five columns to export.
meta_columns = ['groupset', 'library', 'donor', 'organ', 'annot']
meta = combo.obs.loc[:, meta_columns].copy()

In [31]:
# Export the per-cell metadata as a tab-separated table under output/
# (relative to the working directory). to_csv does not create missing
# directories, so make sure the folder exists first.
os.makedirs('output', exist_ok=True)
meta.to_csv('output/'+prefix+'_COMBO_PB10PLUS_filtered_metadata.txt', sep='\t')

In [32]:
combo.shape, meta.shape

((123158, 20150), (123158, 5))

In [34]:
# Check the value range of combo.X before it is replaced with the counts layer.
combo.X.min(), combo.X.max()

(0.0, 11.212971)

In [35]:
# Overwrite X with a copy of the 'counts' layer so the saved object carries
# counts in X. The layers themselves are not removed here.
combo.X = combo.layers['counts'].copy()

In [36]:
# Keep only these five metadata columns in obs; every other obs column is dropped.
combo.obs = combo.obs[['groupset', 'library', 'donor', 'organ', 'annot']].copy()

In [37]:
combo.obs.head()

Unnamed: 0,groupset,library,donor,organ,annot
_01_AAACCTGCACGTCTCT.1.0,ref,SIGAD9,DOD1,PB,02_MEP2
_01_AAACCTGGTGTGACGA.1.0,ref,SIGAD9,DOD1,PB,13_MEP cycle
_01_AAACCTGGTTCCGGCA.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2
_01_AAACGGGGTCAAAGCG.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2
_01_AAAGATGCACATGACT.1.0,ref,SIGAD9,DOD1,PB,00_HSC/MPP2


In [38]:
%%time
# Save the cleaned combined object under a date-prefixed name.
# NOTE(review): presumably written into sc.settings.writedir — confirm.
sc.write(prefix+'_COMBO_PB10PLUS_clean', combo)

... storing 'library' as categorical
... storing 'donor' as categorical
... storing 'organ' as categorical
... storing 'annot' as categorical


CPU times: user 2min 39s, sys: 2.99 s, total: 2min 42s
Wall time: 2min 42s


In [40]:
prefix+'_COMBO_PB10PLUS_clean'

'20211110_COMBO_PB10PLUS_clean'