In [1]:
# --- Notebook configuration --------------------------------------------------------------
# Root folder of the input data (a relative alternative would be "./../").
DATA_DIR = '/lustre/groups/ml01/workspace/louis.kuemmerle/projects/A1/data2/'
# Version tag that is part of the data file and table folder names.
DATA_VERSION = 'oct22'
# Root folder for results.
RESULTS_DIR = '/lustre/groups/ml01/workspace/louis.kuemmerle/projects/A1/results/'
# Selects the '_wSham' variant of the data file when True.
SHAM = True

########################################################################################
# File name suffix derived from the SHAM switch.
if SHAM:
    sham_str = '_wSham'
else:
    sham_str = ''

In [49]:
import numpy as np
import pandas as pd
import scanpy as sc
from pathlib import Path

# Compute and save cell count tables for celltypes

We generate cell count tables at:  
`DATA_DIR+f'table_{DATA_VERSION}/cell_counts_{group}.csv'`  
`DATA_DIR+f'table_{DATA_VERSION}/immune_only/cell_counts_{group}.csv'`  
`DATA_DIR+f'table_{DATA_VERSION}/bones_pooled/cell_counts_{group}.csv'`  

(these tables are needed for the proportion plots)

In [15]:
# Read the dataset; `sham_str` (set in the configuration cell) selects the file variant.
adata_path = f'{DATA_DIR}cellxgene_{DATA_VERSION}{sham_str}_umaps.h5ad'
adata = sc.read(adata_path)

In [16]:
# Rename Skull to Calvaria.
# `rename_categories` returns a new Series; assign it back instead of using the
# `inplace=True` argument, which was deprecated and later removed from pandas.
adata.obs["region"] = adata.obs["region"].cat.rename_categories({'Skull': 'Calvaria'})

In [5]:
# Celltypes (level1) per UMAP group; 'full set' covers every level1 celltype in the data.
umap_groups = dict({'full set' : adata.obs['level1'].unique().tolist()},**adata.uns['umap_groups'])
# Celltype -> color, pairing each annotation's categories with the color list stored in `uns`.
colors = dict(zip(adata.obs['level2'].cat.categories.tolist(), adata.uns['level2_colors'].tolist()))
colors_full_set = dict(zip(adata.obs['level1'].cat.categories.tolist(), adata.uns['level1_colors'].tolist()))

region_order = ['Brain','Meninges','Calvaria','Vertebra','Scapula','Humerus','Femur','Pelvis']
# actually the reordering here did not change the plots (I needed to add reordering in the R script as well)
bone_regions = ['Calvaria','Vertebra','Scapula','Humerus','Femur','Pelvis']
non_bone_regions = ['Brain','Meninges']
meta_columns = ['condition','region']


def _with_condition(table, condition):
    """Return `table` with its index moved into a column and a leading 'condition' column."""
    out = table.reset_index()
    out.columns.name = None
    out.insert(0, 'condition', condition)
    return out


for group, cts in umap_groups.items():
    # The full set is counted per level1 celltype, every other group per level2 celltype.
    key2 = 'level1' if (group == 'full set') else 'level2'
    count_dfs = []
    count_dfs_bones_only = []
    count_dfs_bones_pooled = []
    for c in ['Naive','Sham','MCAO']:
        df = adata.obs.loc[(adata.obs['condition'] == c) & adata.obs['level1'].isin(cts)]
        if len(df) == 0:
            # No cells for this condition/group (same guard as in the immune-only loop).
            continue

        # Regions x celltypes count table, rows in `region_order`.
        per_region = pd.crosstab(df['region'], df[key2])
        per_region = per_region.loc[[r for r in region_order if r in per_region.index]]
        per_region.columns = per_region.columns.astype(str)

        bones = per_region.loc[[r for r in bone_regions if r in per_region.index]].copy()

        # Brain and Meninges rows plus one 'Bones' row holding the sum over all bone regions.
        pooled = per_region.loc[[r for r in non_bone_regions if r in per_region.index]].copy()
        # Use plain string labels so that the new 'Bones' label can be appended even when the
        # crosstab index is categorical.
        pooled.index = pooled.index.astype(str)
        pooled.loc['Bones'] = bones.sum(axis=0).values

        count_dfs.append(_with_condition(per_region, c))
        count_dfs_bones_only.append(_with_condition(bones, c))
        count_dfs_bones_pooled.append(_with_condition(pooled, c))

    if len(count_dfs) == 0:
        # Nothing to write for this group.
        continue

    counts = pd.concat(count_dfs,ignore_index=True)
    # Not written to disk: the plotting function has an option to plot bones only.
    counts_bones_only = pd.concat(count_dfs_bones_only,ignore_index=True)
    counts_bones_pooled = pd.concat(count_dfs_bones_pooled,ignore_index=True)

    data_dir = DATA_DIR+f'table_{DATA_VERSION}/'

    Path(data_dir).mkdir(parents=True, exist_ok=True)
    counts.to_csv(data_dir+f'cell_counts_{group}.csv')

    Path(data_dir+'bones_pooled/').mkdir(parents=True, exist_ok=True)
    counts_bones_pooled.to_csv(data_dir+f'bones_pooled/cell_counts_{group}.csv')

    # Color table for the celltype columns of `counts`.
    Path(data_dir+'colors/').mkdir(parents=True, exist_ok=True)
    colors_tmp = colors_full_set if (group == 'full set') else colors
    celltypes = [col for col in counts.columns if col not in meta_columns]
    c_tmp = pd.DataFrame(data={'celltype': celltypes,
                               'color': [colors_tmp[ct] for ct in celltypes]})
    c_tmp.to_csv(data_dir+f'colors/cell_counts_{group}.csv')

#### immune cells only

In [22]:
# Restrict to immune cells by dropping these level1 celltypes. The subset gets its own name
# so that the full `adata` is not overwritten and stays valid for other cells.
exclude = ['erythroid precursor', 'erythroid cell', 'structural cell', 'brain cell']
adata_immune = adata[~adata.obs["level1"].isin(exclude)]

umap_groups = dict({'full set' : adata_immune.obs['level1'].unique().tolist()},**adata_immune.uns['umap_groups'])
# Color lookups of the subset (not used further down in this cell).
colors = dict(zip(adata_immune.obs['level2'].cat.categories.tolist(), adata_immune.uns['level2_colors'].tolist()))
colors_full_set = dict(zip(adata_immune.obs['level1'].cat.categories.tolist(), adata_immune.uns['level1_colors'].tolist()))

region_order = ['Brain','Meninges','Calvaria','Vertebra','Scapula','Humerus','Femur','Pelvis']
# actually the reordering here did not change the plots (I needed to add reordering in the R script as well)

for group, cts in umap_groups.items():
    # The full set is counted per level1 celltype, every other group per level2 celltype.
    key2 = 'level1' if (group == 'full set') else 'level2'
    count_dfs = []
    for c in ['Naive','Sham','MCAO']:
        obs = adata_immune.obs
        df = obs.loc[(obs['condition'] == c) & obs['level1'].isin(cts)]
        if len(df) == 0:
            continue

        # Regions x celltypes count table, rows in `region_order`.
        per_region = pd.crosstab(df['region'], df[key2])
        per_region = per_region.loc[[r for r in region_order if r in per_region.index]]
        per_region.columns = per_region.columns.astype(str)

        # Move the region index into a column and prepend the condition.
        per_region = per_region.reset_index()
        per_region.columns.name = None
        per_region.insert(0, 'condition', c)
        count_dfs.append(per_region)

    if len(count_dfs) == 0:
        continue

    counts = pd.concat(count_dfs,ignore_index=True)

    data_dir = DATA_DIR+f'table_{DATA_VERSION}/'

    Path(data_dir+'immune_only/').mkdir(parents=True, exist_ok=True)
    counts.to_csv(data_dir+f'immune_only/cell_counts_{group}.csv')

    # No color tables are written here.

# `adata` itself is left unchanged by this cell; only `adata_immune` holds the subset.

  res = method(*args, **kwargs)


### Celltype percentage plots
To produce the plots, run `Rscript 02eS_cell_count_plots.R`. If necessary, adjust the path variables at the beginning of `02eS_cell_count_plots.R` first.

# Cell count look up tables

For quick lookups we generate two Excel tables:

`DATA_DIR+'cell_count_lookup_tables/cell_counts_per_region.xlsx'`  
`DATA_DIR+'cell_count_lookup_tables/cell_counts_percentage_per_region.xlsx'`  

In [4]:
# Reload the full dataset. The switch defined in the configuration cell is named `SHAM`
# (there is no `WITH_SHAM` variable), so the suffix is derived from `SHAM`.
sham_str = '_wSham' if SHAM else ''
adata = sc.read(DATA_DIR+f'cellxgene_{DATA_VERSION}{sham_str}_umaps.h5ad')

In [5]:
# Rename Skull to Calvaria.
# `rename_categories` returns a new Series; assign it back instead of using the
# `inplace=True` argument, which was deprecated and later removed from pandas.
adata.obs["region"] = adata.obs["region"].cat.rename_categories({'Skull': 'Calvaria'})

In [6]:
# Display the distinct region labels (check that 'Calvaria' is used instead of 'Skull').
adata.obs['region'].unique()

['Calvaria', 'Pelvis', 'Femur', 'Meninges', 'Vertebra', 'Humerus', 'Scapula', 'Brain']
Categories (8, object): ['Calvaria', 'Pelvis', 'Femur', 'Meninges', 'Vertebra', 'Humerus', 'Scapula', 'Brain']

In [9]:
# Write one Excel workbook with a celltype x region count table per annotation level and
# condition ('all' = no condition filter); one sheet per table, named '<level>_<condition>'.
regions_order = ['Brain','Meninges','Calvaria','Vertebra','Scapula','Humerus','Pelvis','Femur']
data_dir = DATA_DIR+'cell_count_lookup_tables/'
Path(data_dir).mkdir(parents=True, exist_ok=True)
dfs = []
sheet_names = []
for key in ['level1','level2']:
    for cond in ['all','Naive','Sham','MCAO']:
        if cond == 'all':
            obs = adata.obs
        else:
            obs = adata.obs.loc[adata.obs['condition'] == cond]
        table = pd.crosstab(obs[key], obs['region'])
        # Plain string labels, then reindex: regions without any cell for this condition
        # become all-zero columns instead of raising a KeyError.
        table.columns = table.columns.astype(str)
        table = table.reindex(columns=regions_order, fill_value=0)
        dfs.append(table)
        sheet_names.append(f'{key}_{cond}')

with pd.ExcelWriter(data_dir+'cell_counts_per_region.xlsx') as writer:
    for sheet_name, table in zip(sheet_names, dfs):
        table.to_excel(writer, sheet_name=sheet_name)

In [23]:
# Same tables as the count workbook, but each region column is expressed as the percentage
# of that region's cells per celltype (column-wise normalisation times 100).
regions_order = ['Brain','Meninges','Calvaria','Vertebra','Scapula','Humerus','Pelvis','Femur']
data_dir = DATA_DIR+'cell_count_lookup_tables/'
Path(data_dir).mkdir(parents=True, exist_ok=True)
dfs = []
sheet_names = []
for key in ['level1','level2']:
    for cond in ['all','Naive','Sham','MCAO']:
        if cond == 'all':
            obs = adata.obs
        else:
            obs = adata.obs.loc[adata.obs['condition'] == cond]
        table = pd.crosstab(obs[key], obs['region'])
        # Plain string labels, then reindex: regions without any cell for this condition
        # become all-zero columns instead of raising a KeyError.
        table.columns = table.columns.astype(str)
        table = table.reindex(columns=regions_order, fill_value=0)
        dfs.append(table)
        sheet_names.append(f'{key}_{cond}')

with pd.ExcelWriter(data_dir+'cell_counts_percentage_per_region.xlsx') as writer:
    for sheet_name, table in zip(sheet_names, dfs):
        # A region without cells has a column sum of 0 and yields NaN percentages.
        ((table/table.sum()) * 100).to_excel(writer, sheet_name=sheet_name)

# Supplementary cell count and percentage table (sc and flow)

One Excel file with two sheets: one for the single-cell (sc) data and one for the flow cytometry data.


This is the final look up table that's also in the supplementary info of the paper, here saved at  
`"./cell_type_proportions_sc_flow.xlsx"`

### sc
columns: `cell type level, cell type, condition, region, cell number, proportion`


In [83]:
# Reload the dataset and harmonise labels: region 'Skull' -> 'Calvaria', condition 'MCAO' -> 'MCAo'.
# `rename_categories` returns a new Series; assign it back instead of using the
# `inplace=True` argument, which was deprecated and later removed from pandas.
adata = sc.read(DATA_DIR+f'cellxgene_{DATA_VERSION}{sham_str}_umaps.h5ad')
adata.obs["region"] = adata.obs["region"].cat.rename_categories({'Skull': 'Calvaria'})
adata.obs["condition"] = adata.obs["condition"].cat.rename_categories({'MCAO': 'MCAo'})

In [84]:
# variable lists and ordering
level1_cts = adata.obs["level1"].cat.categories.tolist()
level2_cts = adata.obs["level2"].cat.categories.tolist()
ct_level = ["level1" for _ in level1_cts] + ["level2" for _ in level2_cts]
# Row labels "<celltype> <level>" keep level1 and level2 rows apart.
index = [f"{ct} {lvl}" for ct,lvl in zip(level1_cts+level2_cts,ct_level)]
regions = ['Calvaria', 'Vertebra', 'Scapula', 'Femur', 'Humerus', 'Pelvis', 'Meninges',  'Brain']
conditions = ["Naive","Sham","MCAo"]

# Get cell counts and proportions
# Only the cell annotations are needed, so filter `adata.obs` instead of copying the
# whole AnnData object for every region/condition pair.
obs = adata.obs
dfs = []
for c in conditions:
    for r in regions:
        df = pd.DataFrame(
            index = index,
            data = {
                "cell type level" : ct_level,
                "cell type"       : level1_cts+level2_cts,
                "condition"       : c,
                "region"          : r,
                "cell number"     : 0,
                # float from the start: fractions are assigned to this column below
                "proportion"      : 0.0,
            }
        )
        subset = obs.loc[(obs["region"]==r) & (obs["condition"]==c)]
        for ct_key in ["level1","level2"]:
            cell_count = subset[ct_key].value_counts()
            # Celltypes without cells keep the initial 0 / 0.0.
            cell_count = cell_count[cell_count > 0]
            if cell_count.empty:
                continue
            # Fraction of the cells of this region/condition, within the annotation level.
            proportion = cell_count / cell_count.sum()
            idxs = [f"{ct} {ct_key}" for ct in cell_count.index]
            df.loc[idxs,"cell number"] = cell_count.values
            df.loc[idxs,"proportion"] = proportion.values

        dfs.append(df)

# Concatenate to one table and format table
df = pd.concat(dfs)
df.index.name = "ct and level"
df = df.reset_index(drop=False)
# Ordered categoricals make `sort_values` follow the list orders defined above.
df["cell type level"] = df["cell type level"].astype("category").cat.reorder_categories(["level1","level2"])
df["ct and level"] = df["ct and level"].astype("category").cat.reorder_categories(index)
df["condition"] = df["condition"].astype("category").cat.reorder_categories(conditions)
df["region"] = df["region"].astype("category").cat.reorder_categories(regions)
df = df.sort_values(["cell type level","ct and level","condition","region"])
df = df.reset_index(drop=True)
# The helper column was only needed for sorting.
del df["ct and level"]
df_sc = df

  res = method(*args, **kwargs)


In [85]:
# Preview the first rows of the single-cell table.
df_sc.head()

Unnamed: 0,cell type level,cell type,condition,region,cell number,proportion
0,level1,progenitors,Naive,Calvaria,100,0.020194
1,level1,progenitors,Naive,Vertebra,120,0.034884
2,level1,progenitors,Naive,Scapula,207,0.030699
3,level1,progenitors,Naive,Femur,243,0.031274
4,level1,progenitors,Naive,Humerus,213,0.032559


### flow
columns: `cell type, reference group, condition, region, sample, proportion`

In [86]:
def get_cond_sample_region_from_idx(idx):
    """Derive condition, sample and region from a flow table row label.

    Parameters
    ----------
    idx : str
        Row label of the flow table, e.g. an ``.fcs`` file name.

    Returns
    -------
    tuple
        ``(cond, sample, region)``. ``cond`` is "Naive", "Sham" or "MCAo" and falls back to
        "Naive" when no condition keyword is found. ``sample`` and ``region`` are ``None``
        when nothing matches.
    """
    # Labels without a sample keyword are mapped by hand.
    special_samples = {
        'Specimen_001_blood_015.fcs'             :"naive ?",
        'Specimen_001_femur_002.fcs'             :"naive 4",#"naive ?",
        'Specimen_001_skull  seq_003_008.fcs'    :"naive 4",#"naive ?",
        'Specimen_001_spleen 3_011.fcs'          :"naive ?",
        'Specimen_001_spleen_003.fcs'            :"naive ??",
        'Specimen_001_vertebra  seq_002_007.fcs' :"naive 4",#"naive ?",
    }
    sample_keywords = ("naive 1", "naive 2", "naive 3", "SHAM1", "SHAM2", "SHAM3", "MCAO1", "MCAO2", "MCAO3")
    # (keyword in the label, region name); "vertebtra" covers a misspelled label.
    region_keywords = (
        ("blood", "Blood"),
        ("spleen", "Spleen"),
        ("femur", "Femur"),
        ("skull", "Calvaria"),
        ("vertebra", "Vertebra"),
        ("vertebtra", "Vertebra"),
    )

    # condition: first keyword found wins; labels without a keyword (the special samples
    # Specimen_001_*.fcs) count as Naive
    cond = "Naive"
    for keyword, label in (("naive", "Naive"), ("SHAM", "Sham"), ("MCAO", "MCAo")):
        if keyword in idx:
            cond = label
            break

    # sample: hand-mapped labels take precedence, otherwise the last matching keyword
    if idx in special_samples:
        sample = special_samples[idx]
    else:
        sample_hits = [s for s in sample_keywords if s in idx]
        sample = sample_hits[-1] if sample_hits else None

    # region: last matching keyword
    region_hits = [name for keyword, name in region_keywords if keyword in idx]
    region = region_hits[-1] if region_hits else None

    return cond, sample, region



In [87]:
# Regions kept in the final flow table.
flow_bones = ["Calvaria","Vertebra","Femur"] #,"Scapula","Humerus","Pelvis"
# Condition -> sheet name in the flow Excel file.
cond_to_flow_cond = {"Naive":"naive", "Sham":"SHAM", "MCAO":"MCAO"}

# Category orders used for sorting the final table.
ct_order = [
    'Erythrocyte(progenitor)', 'CD45+', 'T-cells', 'NK-cells',
    'B-cells', 'immature B-cells', 'mature B-cells', 'Monocytes',
    'early neutrophils', 'late neutrophils', 'eosinophils', 'LSK'
]
condition_order = ["Naive", "Sham", "MCAo"]
region_order = ["Calvaria","Vertebra","Femur"]
sample_order = [
    "naive 1","naive 2","naive 3","naive 4",
    'SHAM1', 'SHAM2','SHAM3',
    'MCAO1', 'MCAO2', 'MCAO3'
]


dfs = []

for condition in ["Naive", "Sham", "MCAO"]:

    # Load and process flow df
    df = pd.read_excel(DATA_DIR+'221019_FACS/20221014_Ilgin_newEO.xlsx', index_col=0, sheet_name=cond_to_flow_cond[condition])
    # presumably the sheet holds percentages; dividing by 100 turns them into fractions
    df = df/100

    # Every column of the sheet is one "<cell type> (<reference group>)" measurement.
    ct_columns = list(df.columns)

    # Parse each row label once into (condition, sample, region).
    parsed = [get_cond_sample_region_from_idx(label) for label in df.index]
    row_conditions = [p[0] for p in parsed]
    row_samples = [p[1] for p in parsed]
    row_regions = [p[2] for p in parsed]

    # Long format: one block of rows per measurement column.
    df_cts = []
    for ct in ct_columns:
        df_cts.append(pd.DataFrame(
            data = {
                "cell type"       : ct.split(" (")[0],
                "reference group" : ct.split(" (")[1][:-1],
                "condition"       : row_conditions,
                "region"          : row_regions,
                "sample"          : row_samples,
                "proportion"      : df[ct].values,
            }
        ))

    dfs.append(pd.concat(df_cts))

df = pd.concat(dfs)
df = df.loc[df["region"].isin(flow_bones)]

# Ordered categoricals make `sort_values` follow the list orders defined above.
df["cell type"] = df["cell type"].astype("category").cat.reorder_categories(ct_order)
df["condition"] = df["condition"].astype("category").cat.reorder_categories(condition_order)
df["region"] = df["region"].astype("category").cat.reorder_categories(region_order)
df["sample"] = df["sample"].astype("category").cat.reorder_categories(sample_order)
df = df.sort_values(["cell type","condition","region","sample"])
df = df.reset_index(drop=True)
df_flow = df


In [88]:
# Preview the first rows of the flow table.
df_flow.head()

Unnamed: 0,cell type,reference group,condition,region,sample,proportion
0,Erythrocyte(progenitor),% alive,Naive,Calvaria,naive 1,0.249
1,Erythrocyte(progenitor),% alive,Naive,Calvaria,naive 2,0.279
2,Erythrocyte(progenitor),% alive,Naive,Calvaria,naive 3,0.425
3,Erythrocyte(progenitor),% alive,Naive,Calvaria,naive 4,0.408
4,Erythrocyte(progenitor),% alive,Naive,Vertebra,naive 1,0.157


### save to excel

In [90]:
from pandas import ExcelWriter

In [91]:
# Write both tables into one workbook, one sheet each. `sheet_name` is passed by keyword
# because passing it positionally to `to_excel` is deprecated in newer pandas versions.
with ExcelWriter("./cell_type_proportions_sc_flow.xlsx") as writer:
    df_sc.to_excel(writer, sheet_name="scRNAseq")
    df_flow.to_excel(writer, sheet_name="flow cyto")