## Aim of the notebook

In this notebook we **transform the scRNA-seq data** downloaded from Census (extracted in notebook 6_1) so that they resemble Xenium datasets. See the Methods section of the manuscript for further details.

In [1]:
# import txsim as tx
import scanpy as sc
import os
import pandas as pd
from anndata import AnnData
import numpy as np
import xb.formatting as xf
import seaborn as sns
import random 
import matplotlib.pyplot as plt
import sklearn.metrics as sk
from tqdm import tqdm
import scanpy as sc
import random

from xb.calculating import entropy,compute_vi,compute_fmi
from xb.formatting import keep_nuclei_and_quality
from xb.simulating import missegmentation_simulation,noise_adder,subset_of_single_cell

# We specify the paths to the data downloaded from Census

In [2]:
# Folder that holds one sub-directory per dataset downloaded from Census.
mainpath='../../data/scRNAseq_for_simulations/'
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the dataset numbering (and the row order of the summary table) stable between runs.
# Only directories are kept: each dataset is later read from
# <mainpath>/<dataset>/original_adata.h5ad, so stray files cannot be datasets.
datasets=sorted(
    d for d in os.listdir(mainpath)
    if d!='.ipynb_checkpoints' and os.path.isdir(os.path.join(mainpath,d))
)

## Next, we read the data, apply the transformation to the scRNA-seq data to make them look like Xenium data, and save

In [3]:
# Simulation settings shared by every dataset. They never change between
# iterations, so they are defined once, before the loop.
n_markers_requested=250      # forwarded as number_of_markers
random_markers_perc=0        # forwarded as random_markers_percentage
reads_x_cell=None
n_reads_x_gene=20
percentage_of_noise=0.05
misseg_percentage=0.05       # forwarded as ms_percentage

# NOTE(review): no random seed is set anywhere in this notebook. If
# subset_of_single_cell draws random numbers, results will differ between runs;
# confirm and seed if so.
# NOTE(review): the transformed AnnData (adata_sc) is not written to disk in this
# cell; only summary statistics are collected. Confirm whether it should be saved.

allsamples=[]        # one single-row summary DataFrame per successfully processed dataset
failed_datasets=[]   # names of the datasets whose processing raised an exception
for d,dataset_name in enumerate(datasets):
    try:
        print('##################################### Dataset '+str(d))
        dataset_path=mainpath+dataset_name
        #reread single cell data since we need the updated version LUNG DEV
        adata_sc_sub=sc.read(dataset_path+'/original_adata.h5ad')
        # Densify only when X offers .todense(); an already dense matrix
        # has no such method and would raise AttributeError here.
        if hasattr(adata_sc_sub.X,'todense'):
            adata_sc_sub.X=adata_sc_sub.X.todense()
        markers=pd.read_csv(dataset_path+'/markers.csv',index_col=0)
        adata_sc=subset_of_single_cell(adata_sc_sub,markers,random_markers_percentage=random_markers_perc,
                                       reads_x_cell=reads_x_cell,number_of_markers=n_markers_requested,
                                       n_reads_x_gene=n_reads_x_gene,percentage_of_noise=percentage_of_noise,
                                       ms_percentage=misseg_percentage)

        ###extract characteristics
        total_cells=adata_sc.shape[0]
        total_genes=adata_sc.shape[1]
        n_ct=len(np.unique(adata_sc.obs['cell_type']))
        # Number of cells per cell type, computed once and reused for both extremes.
        # size() counts rows directly; observed=True restricts the result to the cell
        # types actually present, the same set that n_ct counts.
        cells_per_ct=adata_sc.obs.groupby('cell_type',observed=True).size()
        ct_less_cells=np.min(cells_per_ct)/total_cells   # fraction of cells in the least abundant type
        ct_more_cells=np.max(cells_per_ct)/total_cells   # fraction of cells in the most abundant type
        mean_highestgen=np.mean(np.max(adata_sc.X,axis=1))   # mean over rows of the row-wise maximum of X
        mean_readsxcell=np.mean(np.sum(adata_sc.X,axis=1))   # mean over rows of the row-wise sum of X
        headers=['total_cells','total_genes','n_ct','ct_less_cells','ct_more_cells','highest_expressed','readsxcell']
        # Built as one column named after the dataset and then transposed, giving a
        # single row indexed by the dataset name.
        sample_info=pd.DataFrame([total_cells,total_genes,n_ct,ct_less_cells,ct_more_cells,mean_highestgen,mean_readsxcell],
                    index=headers,columns=[dataset_name]).transpose()
        allsamples.append(sample_info)
    except Exception as e:
        # Best-effort loop: one failing dataset must not stop the others, but the
        # failure is reported with the dataset name and exception type, and
        # remembered so that missing rows in the summary can be explained.
        failed_datasets.append(dataset_name)
        print('Dataset '+str(d)+' ('+dataset_name+') failed: '+type(e).__name__+': '+str(e))

if failed_datasets:
    print(str(len(failed_datasets))+' of '+str(len(datasets))+' datasets failed and are missing from allsamples: '+', '.join(failed_datasets))

##################################### Dataset 0


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1204.39it/s]


(9313, 212)
##################################### Dataset 1


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1171.68it/s]


(9634, 166)
##################################### Dataset 2


0it [00:00, ?it/s]


(417, 226)
##################################### Dataset 3


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1405.44it/s]


(7420, 220)
##################################### Dataset 4


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1163.95it/s]


(8839, 174)
##################################### Dataset 5


TypeError: Population must be a sequence or set.  For dicts, use list(d).

Finally, we save the sample information of the simulated data in the figures folder.

In [None]:
# Stack the per-dataset summary rows into one table (one row per dataset) and
# write it to CSV. pd.concat raises ValueError if allsamples is empty, i.e. when
# no dataset was processed successfully.
output_csv='../../figures/7.spatial_architecture/simulated_sample_info.csv'
# to_csv does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(output_csv),exist_ok=True)
sampleinfo=pd.concat(allsamples)
sampleinfo.to_csv(output_csv)