# Statistical tests for cell type proportions

**Data**
- donor deconvolved scRNAseq data
- flow cytometry data

**Tests**
- unpaired t-tests

## Imports

In [1]:
DATA_DIR = '/lustre/groups/ml01/workspace/louis.kuemmerle/projects/A1/data2/' # "./../"
DATA_VERSION = 'oct22'#'april21'
RESULTS_DIR = '/lustre/groups/ml01/workspace/louis.kuemmerle/projects/A1/results/'
SHAM = True

########################################################################################
sham_str = '_wSham' if SHAM else ''

In [2]:
import numpy as np
from scipy import stats
import anndata as ad
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

In [4]:
#print(adata.obs["region"].cat.categories())
#print(adata.uns["region_colors"])

for r,c in zip(adata.obs["condition"].cat.categories,adata.uns["condition_colors"]):
    print(f"{r} : {c},")

MCAO : #1f77b4,
Naive : #ff7f0e,
Sham : #279e68,


## Load deconvolution data
(erythroid cells are not included since they messed up the deconvolution)

In [3]:
samples = [
    'MUC12819', 'MUC12820', 'MUC12821', 'MUC12822', 'MUC12823', 'MUC12824', 'MUC12825', 'MUC12826', 
    'MUC12827', 'MUC12828', 'MUC12829', 'MUC12830', 'MUC12831', 'MUC12832', 'MUC12833', 'MUC12834', 
    'MUC12835', 'MUC12836', 'MUC12837', 'MUC12838', 'MUC12839', 'MUC12840', 'MUC12841', 'MUC12842', 
    'MUC12843', 'MUC12844', 'MUC12845', 'MUC12846', 'MUC12847', 'MUC12848', 'MUC12849', 'MUC12850'
]

In [3]:
adata = sc.read(DATA_DIR+f'cellxgene_{DATA_VERSION}{sham_str}_umaps.h5ad')

In [5]:
# Get donor deconvolution info
dfs = []
for i,sample in enumerate(samples):
    df = pd.read_csv(RESULTS_DIR + f"donor_deconv/vireo_noeryt/{sample}/donor_ids.tsv", sep='\t')
    df.index = [idx+f"-{i}" for idx in df["cell"].values]
    df["sample_donor_id"] = sample + "_" + df["donor_id"]
    df = df[["donor_id","sample_donor_id"]]
    dfs.append(df)
df = pd.concat(dfs)

# Add info to adata
adata = adata[df.loc[df.index.isin(adata.obs_names)].index] # with new celltype annotations not all obs_names are still there
#adata = adata[df.index]
adata.obs["donor_id"] = df["donor_id"]
adata.obs["sample_donor_id"] = df["sample_donor_id"]

# filter out Meninges and Brain + unassigned and doublet cells from deconvolution
bones = ['Femur', 'Humerus', 'Pelvis', 'Scapula', 'Skull', 'Vertebra']
adata = adata[adata.obs["region"].isin(bones)]
adata = adata[adata.obs["donor_id"].isin(["donor0","donor1","donor2"])]

# Create column for plot
rename_bones = {
    "Skull":"Calvaria",
    "Pelvis":"Pelvis",
    "Femur":"Femur",
    "Shoulder":"Scapula",
    "Forearm":"Humerus",
    "Vertebra":"Vertebra",
    #"Meninges":"Meninges",
    #"Brain":"Brain"
}
rename_donor = {"donor0":"1","donor1":"2","donor2":"3"}#,"unassigned":"4","doublet":"5"}
adata.obs["region_sample_donor"] = adata.obs[["sample_name","donor_id"]].apply(lambda s: rename_bones[s[0].split("_")[0]] + "_" + s[0].split("_")[2] + "." + rename_donor[s[1]],axis=1)

  res = method(*args, **kwargs)
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [6]:
def get_proportions_df(adata,condition=None):
    """
    """
    
    a = adata
    if condition:
        a = a[a.obs["condition"]==condition]
    
    df = pd.crosstab(a.obs["region_sample_donor"],a.obs["level1"]).copy()
    df = df.div(df.sum(axis=1), axis=0)

    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)
    df = df.reset_index()
    df = df.melt(id_vars='region_sample_donor')
    df = df.rename(columns={'value': 'perc', 'level1': 'celltype'})
    if condition:
        df["condition"] = condition
    
    return df

def get_proportions_df_level2(adata,condition=None):
    """
    """
    
    a = adata
    if condition:
        a = a[a.obs["condition"]==condition]
    
    df = pd.crosstab(a.obs["region_sample_donor"],a.obs["level2"]).copy()
    df = df.div(df.sum(axis=1), axis=0)

    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)
    df = df.reset_index()
    df = df.melt(id_vars='region_sample_donor')
    df["level1"] = df["level2"].apply(lambda ct2: a.obs.loc[a.obs["level2"]==ct2,"level1"].unique()[0])
    df = df.rename(columns={'value': 'perc', 'level2': 'celltype'})
    if condition:
        df["condition"] = condition
    
    return df

In [7]:
# Order for regions and conditions
cond_order = ["Naive","Sham","MCAo"]
#region_order = ["Calvaria","Vertebra","Scapula","Humerus","Pelvis","Femur"]
region_sample_donor_order = [
    "Calvaria_1.1","Calvaria_1.2","Calvaria_1.3","Calvaria_2.1","Calvaria_2.2","Calvaria_2.3",
    "Vertebra_1.1","Vertebra_1.2","Vertebra_1.3","Vertebra_2.1","Vertebra_2.2","Vertebra_2.3",
    "Scapula_1.1","Scapula_1.2","Scapula_1.3","Scapula_2.1","Scapula_2.2","Scapula_2.3",
    "Humerus_1.1","Humerus_1.2","Humerus_1.3","Humerus_2.1","Humerus_2.2","Humerus_2.3",
    "Pelvis_1.1","Pelvis_1.2","Pelvis_1.3","Pelvis_2.1","Pelvis_2.2","Pelvis_2.3",
    "Femur_1.1","Femur_1.2","Femur_1.3","Femur_2.1","Femur_2.2","Femur_2.3",
]
ct_order = [
    'progenitors', 'neutrophil', 'monocyte', 'B cell', 'T cell', 'NK cell',
    'NK-T cell', 'dendritic cell', 'macrophage', #'microglia',
    'basophil', 'structural cell', #'erythroid precursor', 'erythroid cell', 
    'brain cell', 'megakaryocyte', 'innate lymphoid cell'
]
ct_colors = adata.uns["level1_colors"].tolist()

# Get df
#bones = ['Femur', 'Humerus', 'Pelvis', 'Scapula', 'Skull','Vertebra']
dfs = [
    get_proportions_df(adata,condition="Naive"),
    get_proportions_df(adata,condition="Sham"),
    get_proportions_df(adata,condition="MCAO"),
]

df = pd.concat(dfs)
df.loc[df["condition"]=="MCAO","condition"] = "MCAo"
#df.loc[df["region"]=="Skull","region"] = "Calvaria"

df["region"] = df["region_sample_donor"].astype(str).apply(lambda x: x.split("_")[0])
df["condition"] = df["condition"].astype("category")
df["condition"] = df["condition"].cat.reorder_categories(cond_order)
#df["region"] = df["region"].astype("category")
#df["region"] = df["region"].cat.reorder_categories(region_order)
df["region_sample_donor"] = df["region_sample_donor"].astype("category")
df["region_sample_donor"] = df["region_sample_donor"].cat.reorder_categories(region_sample_donor_order)
df["celltype"] = df["celltype"].astype("category")
df["celltype"] = df["celltype"].cat.reorder_categories(ct_order)#[::-1])
df["perc"] *= 100

  res = method(*args, **kwargs)
  res = method(*args, **kwargs)


In [8]:
df_RNAseq = df
df_RNAseq

Unnamed: 0,region_sample_donor,celltype,perc,condition,region
0,Calvaria_1.1,progenitors,0.746269,Naive,Calvaria
1,Calvaria_1.2,progenitors,0.454545,Naive,Calvaria
2,Calvaria_1.3,progenitors,6.377079,Naive,Calvaria
3,Femur_1.1,progenitors,2.810811,Naive,Femur
4,Femur_1.2,progenitors,10.964230,Naive,Femur
...,...,...,...,...,...
463,Vertebra_1.2,innate lymphoid cell,0.000000,MCAo,Vertebra
464,Vertebra_1.3,innate lymphoid cell,0.041000,MCAo,Vertebra
465,Vertebra_2.1,innate lymphoid cell,0.199800,MCAo,Vertebra
466,Vertebra_2.2,innate lymphoid cell,0.000000,MCAo,Vertebra


In [9]:
# Order for regions and conditions
cond_order = ["Naive","Sham","MCAo"]
#region_order = ["Calvaria","Vertebra","Scapula","Humerus","Pelvis","Femur"]
region_sample_donor_order = [
    "Calvaria_1.1","Calvaria_1.2","Calvaria_1.3","Calvaria_2.1","Calvaria_2.2","Calvaria_2.3",
    "Vertebra_1.1","Vertebra_1.2","Vertebra_1.3","Vertebra_2.1","Vertebra_2.2","Vertebra_2.3",
    "Scapula_1.1","Scapula_1.2","Scapula_1.3","Scapula_2.1","Scapula_2.2","Scapula_2.3",
    "Humerus_1.1","Humerus_1.2","Humerus_1.3","Humerus_2.1","Humerus_2.2","Humerus_2.3",
    "Pelvis_1.1","Pelvis_1.2","Pelvis_1.3","Pelvis_2.1","Pelvis_2.2","Pelvis_2.3",
    "Femur_1.1","Femur_1.2","Femur_1.3","Femur_2.1","Femur_2.2","Femur_2.3",
]
ct_order = [
    'hematopoietic stem cell', 'common myeloid progenitor', 'granulocyte-monocyte progenitor', 'neutrophil-primed GMP', 
    'monocyte-primed GMP', 'monocyte-DC progenitor (MDP)', 'common DC progenitor (CDP)', 'pro neutrophil', 'pre neutrophil', 
    'immature neutrophil', 'mature neutrophil', 'monocyte progenitor', 'classical monocyte', 'non-classical monocyte', 
    'pro B cell', 'pre B cell', 'immature B cell', 'mature B cell', 'plasma cell', 'Cd8 T cell', 'Cd4 T cell', 'gdT cell', 
    'NK-T cell', 'NK cell', 'plasmacytoid DC', 'conventional DC1', 'conventional DC2', 'monocyte-derived DC', 'B cell-DC hybrid', 
    'neutrophil-DC hybrid', 'perivascular macrophage', 'macrophage', 'antigen-presenting macrophage', 'basophil progenitor', 
    'basophil', 'fibroblast', 'dural fibroblast', 'endothelial cell', 'Gnb3+ cell', 'megakaryocyte', 'innate lymphoid cell', 
    'meningeal-Choroid Plexus cell', 'oligodendrocyte', 'adipose-derived stromal cell'
]
ct_order = [ct for ct in ct_order if ct in adata.obs["level2"].cat.categories]
#ct_order = [
#    'hematopoietic stem cell', 'common myeloid progenitor', 'granulocyte-monocyte progenitor', 'neutrophil-primed GMP', 
#    'monocyte-primed GMP', 'monocyte-DC progenitor (MDP)', 'common DC progenitor (CDP)', 'pro neutrophil', 'pre neutrophil', 
#    'immature neutrophil', 'mature neutrophil', 'monocyte progenitor', 'classical monocyte', 'non-classical monocyte', 
#    'pro B cell', 'pre B cell', 'immature B cell', 'mature B cell', 'plasma cell', 'Cd8 T cell', 'Cd4 T cell', 'gdT cell', 
#    'NK-T cell', 'NK cell', 'plasmacytoid DC', 'conventional DC1', 'conventional DC2', 'monocyte-derived DC', 'B cell-DC hybrid', 
#    'neutrophil-DC hybrid', 'perivascular macrophage', 'macrophage', 'antigen-presenting macrophage', 'basophil progenitor', 
#    'basophil', 'fibroblast', 'dural fibroblast', 'endothelial cell', 'Gnb3+ cell', 'megakaryocyte', 'innate lymphoid cell', 
#    'meningeal-Choroid Plexus cell', 'oligodendrocyte', 'adipose-derived stromal cell'
#    
#    #'progenitors', 'neutrophil', 'monocyte', 'B cell', 'T cell', 'NK cell',
#    #'NK-T cell', 'dendritic cell', 'macrophage', #'microglia',
#    #'basophil', 'structural cell', #'erythroid precursor', 'erythroid cell', 
#    #'brain cell', 'megakaryocyte', 'innate lymphoid cell'
#]
ct_colors = adata.uns["level2_colors"].tolist()

# Get df
#bones = ['Femur', 'Humerus', 'Pelvis', 'Scapula', 'Skull','Vertebra']
dfs = [
    get_proportions_df_level2(adata,condition="Naive"),
    get_proportions_df_level2(adata,condition="Sham"),
    get_proportions_df_level2(adata,condition="MCAO"),
]

df = pd.concat(dfs)
df.loc[df["condition"]=="MCAO","condition"] = "MCAo"
#df.loc[df["region"]=="Skull","region"] = "Calvaria"

df["region"] = df["region_sample_donor"].astype(str).apply(lambda x: x.split("_")[0])
df["condition"] = df["condition"].astype("category")
df["condition"] = df["condition"].cat.reorder_categories(cond_order)
#df["region"] = df["region"].astype("category")
#df["region"] = df["region"].cat.reorder_categories(region_order)
df["region_sample_donor"] = df["region_sample_donor"].astype("category")
df["region_sample_donor"] = df["region_sample_donor"].cat.reorder_categories(region_sample_donor_order)
df["celltype"] = df["celltype"].astype("category")
df["celltype"] = df["celltype"].cat.reorder_categories(ct_order)#[::-1])
df["perc"] *= 100

  res = method(*args, **kwargs)
  res = method(*args, **kwargs)
  res = method(*args, **kwargs)


In [10]:
df_RNAseq_level2 = df
df_RNAseq_level2

Unnamed: 0,region_sample_donor,celltype,perc,level1,condition,region
0,Calvaria_1.1,hematopoietic stem cell,0.149254,progenitors,Naive,Calvaria
1,Calvaria_1.2,hematopoietic stem cell,0.151515,progenitors,Naive,Calvaria
2,Calvaria_1.3,hematopoietic stem cell,1.016636,progenitors,Naive,Calvaria
3,Femur_1.1,hematopoietic stem cell,0.648649,progenitors,Naive,Femur
4,Femur_1.2,hematopoietic stem cell,1.321928,progenitors,Naive,Femur
...,...,...,...,...,...,...
1363,Vertebra_1.2,innate lymphoid cell,0.000000,innate lymphoid cell,MCAo,Vertebra
1364,Vertebra_1.3,innate lymphoid cell,0.041000,innate lymphoid cell,MCAo,Vertebra
1365,Vertebra_2.1,innate lymphoid cell,0.199800,innate lymphoid cell,MCAo,Vertebra
1366,Vertebra_2.2,innate lymphoid cell,0.000000,innate lymphoid cell,MCAo,Vertebra


## Load flow cytometry data

In [11]:
sample_files = {
    '220822_FACS':'Analysis_Ilgin.xlsx',
    '221019_FACS':'20221014_Ilgin_newEO.xlsx',
}
SAMPLE = '221019_FACS'


dfs = {}
for cond in ["naive","SHAM","MCAO"]:
    dfs[cond] = pd.read_excel(DATA_DIR+SAMPLE+"/"+sample_files[SAMPLE], index_col=0, sheet_name=cond) 
    
# We want to pool the neutrophils for the coarse annotations plot and create an extra plot for neutrophils

for cond in ["naive","SHAM","MCAO"]:
    dfs[cond]["neutrophils (%CD45+)"] = dfs[cond][['early neutrophils (%CD45+)', 'late neutrophils (%CD45+)']].sum(axis=1)
    dfs[cond] = dfs[cond].rename(columns={
        'early neutrophils (%CD45+)': 'early neutrophils (%neutrophils)', 
        'late neutrophils (%CD45+)': 'late neutrophils (%neutrophils)'
    })
    dfs[cond][['early neutrophils (%neutrophils)', 'late neutrophils (%neutrophils)']] *= (100/dfs[cond]["neutrophils (%CD45+)"].values[:,np.newaxis])

In [12]:
for group in ["%alive", "%CD45+", "%Bcells", "%neutrophils"]:
    for cond in ["naive","SHAM","MCAO"]:
        dfs[cond][f"other {group[1:]} ({group})"] = 100 - dfs[cond][[col for col in dfs[cond].columns if group in col]].sum(axis=1)

In [13]:
# Here just for colors
adata = sc.read(DATA_DIR+f'cellxgene_{DATA_VERSION}{sham_str}_umaps.h5ad')

In [14]:
def get_cond_sample_region_from_idx(idx):
    """
    """

    # condition
    if "naive" in idx:
        cond = "Naive"
    elif "SHAM" in idx:
        cond = "Sham"
    elif "MCAO" in idx:
        cond = "MCAo"
    else: # for samples Specimen_001_blood_015.fcs, Specimen_001_femur_002.fcs, Specimen_001_skull  seq_003_008.fcs, Specimen_001_spleen 3_011.fcs, Specimen_001_spleen_003.fcs, Specimen_001_vertebra  seq_002_007.fcs
        cond = "Naive"
        
    # sample
    SAMPLES = ["naive 1", "naive 2", "naive 3", "SHAM1", "SHAM2", "SHAM3", "MCAO1", "MCAO2", "MCAO3"]
    SPECIAL_SAMPLES = {
        'Specimen_001_blood_015.fcs'             :"naive ?",
        'Specimen_001_femur_002.fcs'             :"naive ?",
        'Specimen_001_skull  seq_003_008.fcs'    :"naive ?",
        'Specimen_001_spleen 3_011.fcs'          :"naive ?",
        'Specimen_001_spleen_003.fcs'            :"naive ??",
        'Specimen_001_vertebra  seq_002_007.fcs' :"naive ?",
    }
    sample = None
    for s in SAMPLES:
        
        if idx in SPECIAL_SAMPLES:
            sample = SPECIAL_SAMPLES[idx]
        # No sample info:
        #Specimen_001_blood_015.fcs
        #
        #Specimen_001_femur_002.fcs
        #Specimen_001_skull  seq_003_008.fcs
        #
        #Specimen_001_spleen 3_011.fcs
        #Specimen_001_spleen_003.fcs
        #Specimen_001_vertebra  seq_002_007.fcs
        
        elif s in idx:
            sample = s
            
    # region
    REGIONS = {"blood": "Blood", "spleen":"Spleen", "femur":"Femur", "skull":"Calvaria", "vertebra":"Vertebra", "vertebtra":"Vertebra"}
    region = None
    for r_data, r in REGIONS.items():
        if r_data in idx:
            region = r
        
    return cond, sample, region

In [15]:
#
dfs_tmp = []
for c in ["naive", "SHAM", "MCAO"]:
    for idx, perc in dfs[c].iterrows():
        cond, sample, region = get_cond_sample_region_from_idx(idx)
        dfs_tmp.append(pd.DataFrame(
            data={"idx":idx,"celltype_group":perc.index, "perc":perc.values, "condition":cond, "region":region, "sample":sample})
        )
        
df = pd.concat(dfs_tmp)

#
df["cell_group"] = df["celltype_group"].apply(lambda x: x.split("(%")[-1].split(")")[0])
df["celltype"] = df["celltype_group"].apply(lambda x: x.split(" (")[0])
df["region_sample"] = df.apply(lambda x: str(x["region"])+"_"+str(x["sample"]), axis=1)
df["region_condition"] = df["region"].astype(str) + "_" + df["condition"].astype(str)
df = df.loc[~df.idx.isin(["Mean","SD"])]


#
cond_order = ["Naive","Sham","MCAo"]
region_order = ["Calvaria","Vertebra","Scapula","Humerus","Pelvis","Femur","pooled","Blood","Spleen"]
ct_order = [
    'LSK',
    'neutrophils',
    'early neutrophils', 
    'late neutrophils', 
    'Monocytes',
    'B-cells', #'Bcells',
    'immature B-cells', #'immature/maturing Bcells', 
    'mature B-cells', #'mature Bcells', 
    'T-cells', #'Tcells',
    'NK-cells', #'NKcells',
    'CD45+', 
    'Erythrocyte(progenitor)', #'Erythrocytes/Ery-progenitor',
    'eosinophils',
    'other alive',
    'other CD45+',
    'other Bcells',
    'other neutrophils',    
]


region_sample_order = [
    'Calvaria_naive 1', 'Calvaria_naive 2', 'Calvaria_naive 3', 'Calvaria_naive ?', 
    'Calvaria_SHAM1', 'Calvaria_SHAM2', 'Calvaria_SHAM3',
    'Calvaria_MCAO1','Calvaria_MCAO2','Calvaria_MCAO3',

    'Vertebra_naive 1', 'Vertebra_naive 2', 'Vertebra_naive 3', 'Vertebra_naive ?', 
    'Vertebra_SHAM1', 'Vertebra_SHAM2', 'Vertebra_SHAM3', 
    'Vertebra_MCAO1', 'Vertebra_MCAO2', 'Vertebra_MCAO3',
        
    'Femur_naive 1', 'Femur_naive 2', 'Femur_naive 3', 'Femur_naive ?',
    'Femur_SHAM1', 'Femur_SHAM2', 'Femur_SHAM3', 
    'Femur_MCAO1', 'Femur_MCAO2', 'Femur_MCAO3', 

    'Blood_naive ?',
    'Blood_SHAM1', 
    'Blood_SHAM3',
    'Blood_MCAO1',
    'Blood_MCAO2',
    'Blood_MCAO3',
            
    'Spleen_naive ?',
    'Spleen_naive ??',
    'Spleen_SHAM2',
    'Spleen_MCAO2'
]
region_condition_order = [
    'Calvaria_Naive', 'Vertebra_Naive', 'Femur_Naive', 'Blood_Naive', 'Spleen_Naive',
    'Calvaria_Sham', 'Vertebra_Sham', 'Femur_Sham', 'Blood_Sham', 'Spleen_Sham',
    'Calvaria_MCAo', 'Vertebra_MCAo', 'Femur_MCAo', 'Blood_MCAo', 'Spleen_MCAo'
]
#ct_colors = adata.uns["level1_colors"].tolist()

df["condition"] = df["condition"].astype("category")
df["condition"] = df["condition"].cat.reorder_categories(cond_order)
df["region"] = df["region"].astype("category")
df["region"] = df["region"].cat.reorder_categories([r for r in region_order if r in df["region"].cat.categories])
df["celltype"] = df["celltype"].astype("category")
df["celltype"] = df["celltype"].cat.reorder_categories(ct_order)#[::-1])
df["region_sample"] = df["region_sample"].astype("category")
df["region_sample"] = df["region_sample"].cat.reorder_categories(region_sample_order)#[::-1])
df["region_condition"] = df["region_condition"].astype("category")
df["region_condition"] = df["region_condition"].cat.reorder_categories(region_condition_order)#[::-1])



#
# cell types are not perfectly mapped, just to get similar colors
colormap_cts_level1 = {
    'LSK'                         : "progenitors",
    'neutrophils'                 : "neutrophil",
    'early neutrophils'           : None, 
    'late neutrophils'            : None, 
    'Monocytes'                   : "monocyte",
    'B-cells'                     : "B cell", #'Bcells'                      : "B cell",
    'immature B-cells'            : None, #'immature/maturing Bcells'    : None, 
    'mature B-cells'              : None, #'mature Bcells'               : None, 
    'T-cells'                     : "T cell", #'Tcells'                      : "T cell",
    'NK-cells'                    : "NK cell", #'NKcells'                     : "NK cell",
    'CD45+'                       : None, 
    'Erythrocyte(progenitor)'     : "erythroid cell", #'Erythrocytes/Ery-progenitor' : "erythroid cell",
    'eosinophils'                 : "basophil", 
    'other alive'                 : "microglia",
    'other CD45+'                 : "microglia",
    'other Bcells'                : "microglia",
    'other neutrophils'           : "microglia",
}

colormap_cts_level2 = {
    'early neutrophils'           : 'pre neutrophil', 
    'late neutrophils'            : 'mature neutrophil', 
    'immature B-cells'            : 'immature B cell', #'immature/maturing Bcells'    : 'immature B cell', 
    'mature B-cells'              : 'mature B cell', #'mature Bcells'               : 'mature B cell', 
    "CD45+"                       : "macrophage", # just to have some color
}


ct_colors = []
for ct in ct_order:
    if colormap_cts_level1[ct] is not None:
        ct_colors.append(
            adata.uns["level1_colors"].tolist()[adata.obs["level1"].cat.categories.tolist().index(colormap_cts_level1[ct])]
        )
    else:
        ct_colors.append(
            adata.uns["level2_colors"].tolist()[adata.obs["level2"].cat.categories.tolist().index(colormap_cts_level2[ct])]
        )        

In [16]:
df_flow = df
df_flow

Unnamed: 0,idx,celltype_group,perc,condition,region,sample,cell_group,celltype,region_sample,region_condition
0,Specimen_001_blood_015.fcs,Erythrocyte(progenitor) (% alive),91.100,Naive,Blood,naive ?,alive,Erythrocyte(progenitor),Blood_naive ?,Blood_Naive
1,Specimen_001_blood_015.fcs,CD45+ (% alive),6.310,Naive,Blood,naive ?,alive,CD45+,Blood_naive ?,Blood_Naive
2,Specimen_001_blood_015.fcs,T-cells (%CD45+),12.800,Naive,Blood,naive ?,CD45+,T-cells,Blood_naive ?,Blood_Naive
3,Specimen_001_blood_015.fcs,NK-cells (%CD45+),1.590,Naive,Blood,naive ?,CD45+,NK-cells,Blood_naive ?,Blood_Naive
4,Specimen_001_blood_015.fcs,B-cells (%CD45+),64.300,Naive,Blood,naive ?,CD45+,B-cells,Blood_naive ?,Blood_Naive
...,...,...,...,...,...,...,...,...,...,...
12,Specimen_001_MCAO3-skull_020.fcs,neutrophils (%CD45+),59.000,MCAo,Calvaria,MCAO3,CD45+,neutrophils,Calvaria_MCAO3,Calvaria_MCAo
13,Specimen_001_MCAO3-skull_020.fcs,other alive (%alive),100.000,MCAo,Calvaria,MCAO3,alive,other alive,Calvaria_MCAO3,Calvaria_MCAo
14,Specimen_001_MCAO3-skull_020.fcs,other CD45+ (%CD45+),13.118,MCAo,Calvaria,MCAO3,CD45+,other CD45+,Calvaria_MCAO3,Calvaria_MCAo
15,Specimen_001_MCAO3-skull_020.fcs,other Bcells (%Bcells),3.200,MCAo,Calvaria,MCAO3,Bcells,other Bcells,Calvaria_MCAO3,Calvaria_MCAo


## Run tests

### t-tests

In [17]:
display(df_RNAseq.head())
display(df_flow.head())

Unnamed: 0,region_sample_donor,celltype,perc,condition,region
0,Calvaria_1.1,progenitors,0.746269,Naive,Calvaria
1,Calvaria_1.2,progenitors,0.454545,Naive,Calvaria
2,Calvaria_1.3,progenitors,6.377079,Naive,Calvaria
3,Femur_1.1,progenitors,2.810811,Naive,Femur
4,Femur_1.2,progenitors,10.96423,Naive,Femur


Unnamed: 0,idx,celltype_group,perc,condition,region,sample,cell_group,celltype,region_sample,region_condition
0,Specimen_001_blood_015.fcs,Erythrocyte(progenitor) (% alive),91.1,Naive,Blood,naive ?,alive,Erythrocyte(progenitor),Blood_naive ?,Blood_Naive
1,Specimen_001_blood_015.fcs,CD45+ (% alive),6.31,Naive,Blood,naive ?,alive,CD45+,Blood_naive ?,Blood_Naive
2,Specimen_001_blood_015.fcs,T-cells (%CD45+),12.8,Naive,Blood,naive ?,CD45+,T-cells,Blood_naive ?,Blood_Naive
3,Specimen_001_blood_015.fcs,NK-cells (%CD45+),1.59,Naive,Blood,naive ?,CD45+,NK-cells,Blood_naive ?,Blood_Naive
4,Specimen_001_blood_015.fcs,B-cells (%CD45+),64.3,Naive,Blood,naive ?,CD45+,B-cells,Blood_naive ?,Blood_Naive


#### Calvaria vs other bones

In [38]:
other_bones = ['Femur', 'Humerus', 'Pelvis', 'Scapula', 'Vertebra']


rows = {
    "test":"Calvaria vs other bones", "condition":[], "method":[], "coarse/fine":[], 
    "cell type":[], "higher in":[], "t-statistic":[], "pval":[], "significant":[]
}
def append_row(rows, method="flow", condition="MCAo", precision="coarse", celltype="B cell", higher_in="Calvaria", statistic=1.0, pval=0.35):
    rows["method"].append(method)
    rows["condition"].append(condition)
    rows["coarse/fine"].append(precision)
    rows["cell type"].append(celltype)
    rows["higher in"].append(higher_in)
    rows["t-statistic"].append(statistic)
    rows["pval"].append(pval)
    rows["significant"].append(pval < 0.05)

higher_in = lambda s: "Calvaria" if s > 0 else "other bones"
    

print("###### Calvaria vs other bones ######")

for cond in ["Naive","Sham","MCAo"]:
    print(f"### {cond} ###")
    print("\t ### coarse ###")
    for ct in df_RNAseq["celltype"].unique():
        d1 = df_RNAseq.loc[(df_RNAseq["condition"] == cond) & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"] == "Calvaria"),"perc"].values
        d2 = df_RNAseq.loc[(df_RNAseq["condition"] == cond) & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"].isin(other_bones)),"perc"].values
        statistic, pval = stats.ttest_ind(d1, d2)
        append_row(rows, method="scRNAseq", condition=cond, precision="coarse", celltype=ct, higher_in=higher_in(statistic), statistic=statistic, pval=pval)
        if pval < 0.05:
            print("\t", ct.ljust(df_RNAseq["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))
    tmp = df_RNAseq_level2.loc[df_RNAseq_level2["level1"].isin(['neutrophil', 'B cell'])].copy()
    print("\t ### fine ###")
    for ct in tmp["celltype"].unique():
        d1 = tmp.loc[(tmp["condition"] == cond) & (tmp["celltype"] == ct) & (tmp["region"] == "Calvaria"),"perc"].values
        d2 = tmp.loc[(tmp["condition"] == cond) & (tmp["celltype"] == ct) & (tmp["region"].isin(other_bones)),"perc"].values
        statistic, pval = stats.ttest_ind(d1, d2)
        append_row(rows, method="scRNAseq", condition=cond, precision="fine", celltype=ct, higher_in=higher_in(statistic), statistic=statistic, pval=pval)
        if pval < 0.05:
            print("\t", ct.ljust(tmp["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))    
    

df_flow_coarse = df_flow.loc[df_flow["cell_group"].isin(['CD45+'])] #'alive', 
df_flow_fine = df_flow.loc[df_flow["cell_group"].isin(['Bcells', 'neutrophils'])] #['Bcells', 'neutrophils'])]    
for cond in ["Naive","Sham","MCAo"]:
    print(f"### {cond} ###")
    print("\t ### coarse ###")    
    for ct in df_flow["celltype"].unique():
        d1 = df_flow_coarse.loc[(df_flow_coarse["condition"] == cond) & (df_flow_coarse["celltype"] == ct) & (df_flow_coarse["region"] == "Calvaria"),"perc"].values
        d2 = df_flow_coarse.loc[(df_flow_coarse["condition"] == cond) & (df_flow_coarse["celltype"] == ct) & (df_flow_coarse["region"].isin(other_bones)),"perc"].values
        statistic, pval = stats.ttest_ind(d1, d2)
        append_row(rows, method="flow", condition=cond, precision="coarse", celltype=ct, higher_in=higher_in(statistic), statistic=statistic, pval=pval)
        if pval < 0.05:
            print("\t", ct.ljust(df_flow["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))
    print("\t ### fine ###")            
    for ct in df_flow["celltype"].unique():
        d1 = df_flow_fine.loc[(df_flow_fine["condition"] == cond) & (df_flow_fine["celltype"] == ct) & (df_flow_fine["region"] == "Calvaria"),"perc"].values
        d2 = df_flow_fine.loc[(df_flow_fine["condition"] == cond) & (df_flow_fine["celltype"] == ct) & (df_flow_fine["region"].isin(other_bones)),"perc"].values
        statistic, pval = stats.ttest_ind(d1, d2)
        append_row(rows, method="flow", condition=cond, precision="fine", celltype=ct, higher_in=higher_in(statistic), statistic=statistic, pval=pval)
        if pval < 0.05:
            print("\t", ct.ljust(df_flow["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))            
            
df_skull_vs_bones = pd.DataFrame(data=rows)
df_skull_vs_bones = df_skull_vs_bones.loc[~df_skull_vs_bones["pval"].isnull()]
df_skull_vs_bones = df_skull_vs_bones.sort_values(["method", "condition", "coarse/fine", "t-statistic"], ascending=[True,True,True,False])
df_skull_vs_bones = df_skull_vs_bones.reset_index(drop=True)

###### Calvaria vs other bones ######
### Naive ###
	 ### coarse ###
	 monocyte             -2.1776 0.0447 
	 T cell               2.8496  0.0116 
	 ### fine ###
	 pro neutrophil      -2.3663 0.0309 
	 pre neutrophil      -2.377  0.0303 
	 mature neutrophil   2.6443  0.0177 
	 immature B cell     -3.1942 0.0056 
### Sham ###
	 ### coarse ###
	 monocyte             -2.4188 0.0279 
	 B cell               3.0359  0.0079 
	 T cell               4.8394  0.0002 
	 basophil             -3.1528 0.0062 
	 ### fine ###
	 pro neutrophil      -2.9025 0.0104 
	 pre neutrophil      -2.1546 0.0468 
	 immature neutrophil -3.3267 0.0043 
	 mature neutrophil   3.185   0.0058 
	 immature B cell     2.8732  0.011  
	 mature B cell       2.773   0.0136 
### MCAo ###
	 ### coarse ###
	 monocyte             -2.1793 0.0363 
	 NK cell              2.5507  0.0154 
	 dendritic cell       -2.329  0.0259 
	 ### fine ###
	 mature neutrophil   2.8401  0.0076 
### Naive ###
	 ### coarse ###
	 B-cells                 

#### Naive vs injury

In [39]:
bones = ['Calvaria', 'Femur', 'Humerus', 'Pelvis', 'Scapula', 'Vertebra']

dfs = {
    "scRNAseq": {
        "coarse":df_RNAseq,
        "fine"  :df_RNAseq_level2.loc[df_RNAseq_level2["level1"].isin(['neutrophil', 'B cell'])],
    },
    "flow": {
        "coarse":df_flow.loc[df_flow["cell_group"].isin(['CD45+'])], #'alive', 
        "fine"  :df_flow.loc[df_flow["cell_group"].isin(['Bcells', 'neutrophils'])]  , #['Bcells', 'neutrophils'])]  ,
    }
}

print("###### Naive vs Injury ######")

rows = {
    "test":"Naive vs Injury", "bones":"all", "method":[], "coarse/fine":[], 
    "cell type":[], "higher in":[], "t-statistic":[], "pval":[]
}
for method in dfs:
    print(f"### {method} ###")
    for precision, df in dfs[method].items():
        print(f"\t ### {precision} ###")
        for ct in df["celltype"].unique():
            d1 = df.loc[(df["condition"] == "Naive") & (df["celltype"] == ct) & (df["region"].isin(bones)),"perc"].values
            d2 = df.loc[(df["condition"].isin(["Sham","MCAo"])) & (df["celltype"] == ct) & (df["region"].isin(bones)),"perc"].values
            statistic, pval = stats.ttest_ind(d1, d2)
            rows["method"].append(method)
            rows["coarse/fine"].append(precision)
            rows["cell type"].append(ct)
            rows["higher in"].append("Naive" if statistic > 0 else "Injury")
            rows["t-statistic"].append(statistic)
            rows["pval"].append(pval)
            if pval < 0.05:
                print("\t", ct.ljust(df["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))

df_naive_vs_injury = pd.DataFrame(data=rows)
df_naive_vs_injury["significant"] = df_naive_vs_injury["pval"] < 0.05
df_naive_vs_injury = df_naive_vs_injury.loc[~df_naive_vs_injury["pval"].isnull()]
df_naive_vs_injury = df_naive_vs_injury.sort_values(["method", "coarse/fine", "t-statistic"], ascending=[True,True,False])
df_naive_vs_injury = df_naive_vs_injury.reset_index(drop=True)
#print("### scRNAseq ###")
#for ct in df_RNAseq["celltype"].unique():
#    d1 = df_RNAseq.loc[(df_RNAseq["condition"] == "Naive") & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"].isin(bones)),"perc"].values
#    d2 = df_RNAseq.loc[(df_RNAseq["condition"].isin(["Sham","MCAo"])) & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"].isin(bones)),"perc"].values
#    statistic, pval = stats.ttest_ind(d1, d2)
#    if pval < 0.05:
#        print("\t", ct.ljust(df_RNAseq["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))
#            
#print("### flow ###")
#for ct in df_flow["celltype"].unique():
#    d1 = df_flow.loc[(df_flow["condition"] == "Naive") & (df_flow["celltype"] == ct) & (df_flow["region"].isin(bones)),"perc"].values
#    d2 = df_flow.loc[(df_flow["condition"].isin(["Sham","MCAo"])) & (df_flow["celltype"] == ct) & (df_flow["region"].isin(bones)),"perc"].values
#    statistic, pval = stats.ttest_ind(d1, d2)
#    if pval < 0.05:
#        print("\t", ct.ljust(df_flow["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))

###### Naive vs Injury ######
### scRNAseq ###
	 ### coarse ###
	 T cell               -4.156  0.0001 
	 dendritic cell       4.4518  0.0    
	 macrophage           2.9146  0.0048 
	 ### fine ###
	 immature neutrophil -2.6393 0.0102 
	 mature neutrophil   2.4382  0.0173 
	 pro B cell          11.8439 0.0    
	 pre B cell          34.6326 0.0    
	 immature B cell     13.0151 0.0    
	 mature B cell       -4.8022 0.0    
### flow ###
	 ### coarse ###
	 T-cells     -4.7782 0.0001 
	 NK-cells    7.9479  0.0    
	 B-cells     2.0568  0.0491 
	 Monocytes   2.937   0.0066 
	 LSK         2.6538  0.013  
	 ### fine ###
	 immature B-cells  7.9814  0.0    
	 mature B-cells    -9.1064 0.0    
	 other Bcells      8.212   0.0    


In [28]:
from pandas import ExcelWriter

In [40]:
xls_path = "./proportions_stats_tests.xlsx"
with ExcelWriter(xls_path) as writer:
    
    df_naive_vs_injury.to_excel(writer, "Naive vs Injury")
    
    df_skull_vs_bones.to_excel(writer, "Calvaria vs other bones")
    

In [31]:
from math import log10, floor
round_to_n = lambda x, n: round(x, -int(floor(log10(x))) + (n - 1))
round_to_3 = lambda x: str(round(x, -int(floor(log10(x))) + (3 - 1)))

df_naive_vs_injury.loc[~df_naive_vs_injury["pval"].isnull(),"pval"].apply(round_to_3)

0        0.478
1        0.582
2        0.267
3       0.0702
4     9.03e-05
5        0.976
6       0.0529
7     3.14e-05
8      0.00478
9        0.101
10       0.875
11       0.951
12       0.944
14      0.0751
15       0.136
16      0.0102
17      0.0173
18     2.2e-18
19    8.75e-46
20    2.29e-20
21    8.62e-06
22       0.934
23    5.09e-05
24    1.18e-08
25      0.0491
26     0.00656
27       0.107
28       0.013
29      0.0601
30       0.848
31    1.08e-08
32    7.29e-10
33       0.114
34       0.114
35    6.14e-09
36       0.324
Name: pval, dtype: object

In [28]:
bones = ['Calvaria', 'Femur', 'Humerus', 'Pelvis', 'Scapula', 'Vertebra']

dfs = {
    "scRNAseq": {
        "coarse":df_RNAseq,
        "fine"  :df_RNAseq_level2.loc[df_RNAseq_level2["level1"].isin(['neutrophil', 'B cell'])],
    },
    "flow": {
        "coarse":df_flow.loc[df_flow["cell_group"].isin(['CD45+'])], #'alive', 
        "fine"  :df_flow.loc[df_flow["cell_group"].isin(['Bcells', 'neutrophils'])]  , #['Bcells', 'neutrophils'])]  ,
    }
}

print("###### Naive vs Injury ######")

for method in dfs:
    print(f"### {method} ###")
    for precision, df in dfs[method].items():
        print(f"\t ### {precision} ###")
        for ct in df["celltype"].unique():
            d1 = df.loc[(df["condition"] == "Naive") & (df["celltype"] == ct) & (df["region"].isin(bones)),"perc"].values
            d2 = df.loc[(df["condition"].isin(["Sham","MCAo"])) & (df["celltype"] == ct) & (df["region"].isin(bones)),"perc"].values
            statistic, pval = stats.ttest_ind(d1, d2)
            if pval < 0.05:
                print("\t", ct.ljust(df["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))

#print("### scRNAseq ###")
#for ct in df_RNAseq["celltype"].unique():
#    d1 = df_RNAseq.loc[(df_RNAseq["condition"] == "Naive") & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"].isin(bones)),"perc"].values
#    d2 = df_RNAseq.loc[(df_RNAseq["condition"].isin(["Sham","MCAo"])) & (df_RNAseq["celltype"] == ct) & (df_RNAseq["region"].isin(bones)),"perc"].values
#    statistic, pval = stats.ttest_ind(d1, d2)
#    if pval < 0.05:
#        print("\t", ct.ljust(df_RNAseq["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))
#            
#print("### flow ###")
#for ct in df_flow["celltype"].unique():
#    d1 = df_flow.loc[(df_flow["condition"] == "Naive") & (df_flow["celltype"] == ct) & (df_flow["region"].isin(bones)),"perc"].values
#    d2 = df_flow.loc[(df_flow["condition"].isin(["Sham","MCAo"])) & (df_flow["celltype"] == ct) & (df_flow["region"].isin(bones)),"perc"].values
#    statistic, pval = stats.ttest_ind(d1, d2)
#    if pval < 0.05:
#        print("\t", ct.ljust(df_flow["celltype"].apply(lambda x:len(x)).max()), str(round(statistic,4)).ljust(4+3), str(round(pval,4)).ljust(4+3))

###### Naive vs Injury ######
### scRNAseq ###
	 ### coarse ###
	 T cell               -4.1573 0.0001 
	 macrophage           2.9135  0.0048 
	 ### fine ###
	 immature neutrophil -2.6436 0.0101 
	 mature neutrophil   2.4292  0.0177 
	 pro B cell          11.8349 0.0    
	 pre B cell          34.6748 0.0    
	 immature B cell     13.0017 0.0    
	 mature B cell       -4.8132 0.0    
### flow ###
	 ### coarse ###
	 T-cells     -4.7782 0.0001 
	 NK-cells    7.9479  0.0    
	 B-cells     2.0568  0.0491 
	 Monocytes   2.937   0.0066 
	 LSK         2.6538  0.013  
	 ### fine ###
	 immature B-cells  7.9814  0.0    
	 mature B-cells    -9.1064 0.0    
	 other Bcells      8.212   0.0    


In [29]:
dfs["flow"]["fine"]["celltype_group"].unique()

array(['immature B-cells (%Bcells)', 'mature B-cells (%Bcells)',
       'early neutrophils (%neutrophils)',
       'late neutrophils (%neutrophils)', 'other Bcells (%Bcells)',
       'other neutrophils (%neutrophils)'], dtype=object)