In [1]:
import pandas as pd
import scanpy as sc
import anndata as ad
import mygene
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from data_utils import *
from scipy import sparse
import gc



In [2]:
# Sweep configuration used by the generation loop further down.
# `replicates`: how many times each quality level is repeated.
replicates = 3
# `qualities`: 10 log-spaced values from 1e-3 to 1 (inclusive), each passed
# as the `quality` argument of `get_ad_with_quality`.
qualities = np.logspace(start=-3, stop=0, num=10)

In [3]:
%%time
meta = pd.read_csv("data/S1R1_metadata.csv", index_col=0)
cxg = pd.read_csv('data/S1R1_cell_x_gene.csv', index_col=0)
rnas = [x for x in cxg.keys() if 'Blank' not in x]

CPU times: user 1.68 s, sys: 332 ms, total: 2.01 s
Wall time: 2.01 s


In [4]:
# Wrap the full `cxg` table (all columns) in an AnnData object.
adata = ad.AnnData(cxg)

# Drop cells with fewer than 100 total counts; this also records the
# per-cell total in `adata.obs['n_counts']`.
sc.pp.filter_cells(adata, min_counts=100)

# Store the expression matrix in CSR sparse form; `sparse_X` stays bound
# to the same matrix.
adata.X = sparse_X = sparse.csr_matrix(adata.X)

# Display the object summary as the cell output.
adata

AnnData object with n_obs × n_vars = 67821 × 649
    obs: 'n_counts'

In [5]:
# Mean total counts per cell: sum of all matrix entries divided by the
# number of observations (cells) kept after filtering.
adata.X.sum() / adata.n_obs

414.3105380339423

In [6]:
# Copy the spatial centre coordinates from the metadata table into
# `adata.obs`. pandas aligns the assigned Series on index labels, so rows
# are matched by cell label rather than by position.
# NOTE(review): assumes `meta.index` and `adata.obs` index hold matching
# labels of the same type; mismatches would yield NaN -- confirm.
for coord in ('center_x', 'center_y'):
    adata.obs[coord] = meta[coord]

In [7]:
# For every quality level and replicate, build a modified copy of `adata`
# with `get_ad_with_quality` (imported from `data_utils`) and write it to
# 'data/MERFISH_Q<quality>_rep<replicate>.h5ad'.
#
# The per-iteration object is named `adata_q` rather than `ad`: `ad` is the
# alias of the `anndata` module imported at the top of the notebook, and
# rebinding / `del`-ing it here made `ad.AnnData(...)` raise NameError when
# an earlier cell was re-run after this one.
#
# NOTE(review): the recorded output of this cell shows identical
# counts-per-cell for all three replicates of each quality, which suggests
# the NumPy global seed set below does not influence
# `get_ad_with_quality` -- confirm that replicates really differ.
for q in tqdm(qualities, desc='qualities'):

    for r in tqdm(range(replicates), desc='replicates', leave=False):

        # Seed NumPy's global RNG with the replicate index.
        np.random.seed(r)

        adata_q = get_ad_with_quality(adata, quality=q)
        adata_q.write_h5ad('data/MERFISH_Q%.3f_rep%d.h5ad'%(q, r))

        # Report the mean of the `.raw` counts per cell for this output.
        print("Q %.3f counts per cell" %q)
        print(adata_q.raw.X.sum()/len(adata_q))

        # Release the per-iteration object before building the next one.
        del adata_q
        gc.collect()

qualities:   0%|          | 0/10 [00:00<?, ?it/s]

replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.001 counts per cell
0.41429645684964833




Q 0.001 counts per cell
0.41429645684964833




Q 0.001 counts per cell
0.41429645684964833


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.002 counts per cell
0.8925996372804883




Q 0.002 counts per cell
0.8925996372804883




Q 0.002 counts per cell
0.8925996372804883


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.005 counts per cell
1.9230474336857315




Q 0.005 counts per cell
1.9230474336857315




Q 0.005 counts per cell
1.9230474336857315


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.010 counts per cell
4.143097270756845




Q 0.010 counts per cell
4.143097270756845




Q 0.010 counts per cell
4.143097270756845


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.022 counts per cell
8.926040606891672




Q 0.022 counts per cell
8.926040606891672




Q 0.022 counts per cell
8.926040606891672


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.046 counts per cell
19.230577549726487




Q 0.046 counts per cell
19.230577549726487




Q 0.046 counts per cell
19.230577549726487


replicates:   0%|          | 0/3 [00:00<?, ?it/s]



Q 0.100 counts per cell
41.43104643104643
Q 0.100 counts per cell
41.43104643104643
Q 0.100 counts per cell
41.43104643104643


replicates:   0%|          | 0/3 [00:00<?, ?it/s]

Q 0.215 counts per cell
89.26049453709028
Q 0.215 counts per cell
89.26049453709028
Q 0.215 counts per cell
89.26049453709028


replicates:   0%|          | 0/3 [00:00<?, ?it/s]

Q 0.464 counts per cell
192.30590819952522
Q 0.464 counts per cell
192.30590819952522
Q 0.464 counts per cell
192.30590819952522


replicates:   0%|          | 0/3 [00:00<?, ?it/s]

Q 1.000 counts per cell
352.50653757038964
Q 1.000 counts per cell
352.50653757038964
Q 1.000 counts per cell
352.50653757038964
