# Generate scRNA-Seq data using scBoolSeq

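This notebook extends [synthetic scRNA-Seq from Random Network.ipynb](https://github.com/bnediction/scBoolSeq-supplementary/blob/main/synthetic%20scRNA-Seq%20from%20Random%20Network.ipynb) from the scBoolSeq-supplementary repository.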

In [6]:
from scboolseq import scBoolSeq

import glob 
import numpy as np
import pandas as pd
import sklearn

In [7]:
# Prefix (directory) searched for "*-boolean-trajectories.csv" input files.
ground_truth_prefix = "../ground-truth/"
# Local directory where the background dataset is downloaded.
workdir = "_workdir"
# File name of the background scRNA-seq dataset and the URL it is fetched from.
background_scRNA_seq = "GSE81682_Hematopoiesis.csv"
background_scRNA_seq_src = f"https://github.com/bnediction/scBoolSeq-supplementary/raw/main/data_filtered_vargenes/{background_scRNA_seq}"
# Prefix under which the "traj/" and "steady/" output directories are created.
output_prefix = "../"

# Default seed used by make_nb_cells and make_mutant_counts.
SEED = 20382

## Load background scRNA-seq data

In [8]:
background_scRNA_seq_file = f"{workdir}/{background_scRNA_seq}"
! test -f {background_scRNA_seq_file} || (mkdir {workdir} && curl -fL {background_scRNA_seq_src} -o {background_scRNA_seq_file})

In [9]:
# Load the background expression table; the first CSV column becomes the row index.
ref_data = pd.read_csv(background_scRNA_seq_file, index_col=0)
# Preview the first rows.
ref_data.head()

Unnamed: 0,8430408G22Rik,Plp1,Zfp947,Bhlhb9,Vps35,Slc18a1,Fam107b,Gm14230,Plekhn1,Ankrd6,...,Gpr4,Nectin2,Tyrobp,Plekhf1,Nkg7,Osbpl1a,Slc27a6,Gm4951,Zfp438,Rab18
HSPC_025,0.0,0.0,0.0,5.392129,8.852337,0.0,2.614548,0.0,0.0,0.0,...,0.0,0.0,1.189716,0.0,2.614548,0.0,0.0,0.0,1.189716,9.26359
HSPC_031,0.0,0.0,0.0,0.686872,7.637939,0.0,6.838205,0.0,0.0,0.0,...,2.827391,0.0,3.158217,0.0,2.219938,2.017546,0.0,0.0,0.0,0.686872
HSPC_037,0.0,0.0,0.0,1.869808,7.93808,0.0,9.405107,0.0,0.0,0.0,...,0.0,0.0,7.336441,0.0,8.407233,1.218731,0.0,0.0,1.218731,8.641061
LT-HSC_001,0.0,0.0,0.0,7.965715,5.885018,0.0,8.962827,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.74947,6.820749,0.0,7.107933,2.364517,6.242153
HSPC_001,0.0,0.0,0.0,8.3955,0.0,0.377367,0.676211,0.0,0.676211,0.0,...,0.0,0.0,0.377367,0.0,1.134796,1.134796,0.0,1.883902,0.0,1.31895


In [10]:
# Fit scBoolSeq on the background data. The fit runs inside an sklearn config
# context with transform_output="pandas". The fitted object is reused by
# make_mutant_counts below.
with sklearn.config_context(transform_output="pandas"):
    scbool = scBoolSeq().fit(ref_data)

  return bound(*args, **kwds)


## Generate sample scRNA-seq from Boolean trajectories

In [11]:
# Create the output directories written to by push_mutants_counts
# (-p: no error if they already exist).
!mkdir -p {output_prefix}traj/
!mkdir -p {output_prefix}steady/

In [12]:
def make_nb_cells(traj_df, nb_cells_transient=(150,250), nb_cells_steady=(500,600), SEED=SEED):
    """Draw how many cells to simulate for each row of ``traj_df``.

    Every row first receives a random count drawn from ``nb_cells_transient``
    (passed positionally to ``Generator.integers``, so ``(low, high)`` with an
    exclusive upper bound). Rows whose index label contains ``"steady"`` but
    not ``"_to_"`` are then overwritten with a draw from ``nb_cells_steady``.

    Parameters
    ----------
    traj_df : pandas.DataFrame
        Frame whose index holds string state labels.
    nb_cells_transient, nb_cells_steady : tuple
        Bounds forwarded to ``Generator.integers``.
    SEED : int
        Seed for the NumPy random generator.

    Returns
    -------
    numpy.ndarray
        One integer count per row of ``traj_df``, in row order.
    """
    generator = np.random.default_rng(SEED)
    labels = traj_df.index

    # First draw: a transient-sized count for every row.
    cell_counts = generator.integers(*nb_cells_transient, size=len(labels))

    # Positions of the rows that are steady states (and not transitions).
    steady_mask = np.array(
        ["steady" in label and "_to_" not in label for label in labels],
        dtype=bool,
    )
    steady_rows = np.flatnonzero(steady_mask)

    # Second draw (same generator, so the order of draws matters).
    cell_counts[steady_rows] = generator.integers(*nb_cells_steady, size=len(steady_rows))
    return cell_counts

def expand_bindata(traj_df, n_samples):
    """Repeat each row of ``traj_df`` according to ``n_samples``.

    Parameters
    ----------
    traj_df : pandas.DataFrame
        Input frame; its columns are kept, its index is discarded.
    n_samples : int or sequence of int
        Number of repetitions, passed to ``numpy.repeat`` along axis 0.

    Returns
    -------
    pandas.DataFrame
        New frame with the repeated rows and a default integer index.
    """
    # np.repeat always allocates a new array, so the input frame is untouched.
    repeated_rows = np.repeat(traj_df.to_numpy(), n_samples, axis=0)
    return pd.DataFrame(repeated_rows, columns=traj_df.columns)

def push_mutants_counts(counts, name):
    """Write one pair of CSV files per label found in the index of ``counts``.

    Index entries are expected to look like ``"<label>#<rest>"``. Rows are
    grouped by ``<label>``, the ``"<label>#"`` prefix is stripped from the
    index, and the transposed group is written to
    ``{output_prefix}traj/{label}-{name}.csv``. The subset of rows whose
    stripped identifier starts with ``"steady"`` is written, transposed, to
    ``{output_prefix}steady/{label}-{name}.csv``.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame indexed by ``"<label>#<rest>"`` strings.
    name : str
        Suffix used in the output file names.
    """
    def _label_of(cell_id):
        # Everything before the first "#" is the grouping label.
        return cell_id.split("#")[0]

    for label, group in counts.groupby(_label_of):
        stripped_ids = [cell_id[cell_id.index("#") + 1:] for cell_id in group.index]
        per_label = group.set_axis(stripped_ids, axis=0)
        print(label, name)
        per_label.T.to_csv(f"{output_prefix}traj/{label}-{name}.csv")
        steady_ids = [cell_id for cell_id in stripped_ids if cell_id.startswith("steady")]
        per_label.loc[steady_ids].T.to_csv(f"{output_prefix}steady/{label}-{name}.csv")

def make_mutant_counts(traj_df, n_samples, SEED=SEED):
    """Sample synthetic counts for every trajectory state and write them to disk.

    Each row of ``traj_df`` is repeated ``n_samples[i]`` times, counts are
    sampled from the fitted ``scbool`` object, and the result is handed to
    ``push_mutants_counts``. This is done once per variant:

    * ``"normalized-scRNAseq-dropouts"``: no extra sampling options;
    * ``"normalized-scRNAseq-nodropouts"``: ``dropout_mode=None``
      (presumably disables dropout simulation; confirm against the
      scBoolSeq ``sample_counts`` API).

    Parameters
    ----------
    traj_df : pandas.DataFrame
        Boolean states, one row per state; the index labels are used to
        build the cell identifiers.
    n_samples : sequence of int
        Number of cells to generate for each row of ``traj_df``.
    SEED : int
        Random state passed to ``scbool.sample_counts``.
    """
    bindata = expand_bindata(traj_df, n_samples)
    # One identifier per expanded row: "<state label>_<replicate number>".
    cell_ids = [f"{x}_{y}" for i, x in enumerate(traj_df.index) for y in range(n_samples[i])]
    for args, name in [({}, "normalized-scRNAseq-dropouts"),
                       ({"dropout_mode": None}, "normalized-scRNAseq-nodropouts")]:
        # Forward the per-variant options. Previously `args` was ignored, so
        # both variants were generated with exactly the same call.
        counts = scbool.sample_counts(bindata, n_samples_per_state=1, random_state=SEED, **args)
        counts.index = cell_ids
        counts.index.name = "cellID"
        push_mutants_counts(counts, name)

In [13]:
# File-name suffix shared by all Boolean trajectory inputs.
_suffix = "-boolean-trajectories.csv"
def pull_traj_df(traj_file):
    """Read one trajectory CSV and prefix its index with the file's label.

    The label is the part of ``traj_file`` left after dropping the first
    ``len(ground_truth_prefix)`` characters and the last ``len(_suffix)``
    characters. Every index entry ``i`` becomes ``"<label>#<i>"``.

    Parameters
    ----------
    traj_file : str
        Path to the CSV file; its first column is used as the index.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the relabelled index.
    """
    start, stop = len(ground_truth_prefix), -len(_suffix)
    label = traj_file[start:stop]
    loaded = pd.read_csv(traj_file, index_col=0)
    return loaded.set_axis([f"{label}#{state}" for state in loaded.index], axis=0)

# glob returns paths in arbitrary (filesystem-dependent) order. The seeded
# random draws in make_nb_cells are consumed in row order, so the files are
# sorted to make the generated data identical across machines.
traj_files = sorted(glob.glob(f"{ground_truth_prefix}*{_suffix}"))
trajs_df = pd.concat([pull_traj_df(traj_file) for traj_file in traj_files])
# Number of cells per state, then sampling and export of both variants.
n_samples = make_nb_cells(trajs_df)
make_mutant_counts(trajs_df, n_samples)

  "Skewness": ss.skew(trajectory),
  "Kurtosis": ss.kurtosis(trajectory),
  return bound(*args, **kwds)


wt normalized-scRNAseq-dropouts
x14KO normalized-scRNAseq-dropouts
x1KO normalized-scRNAseq-dropouts
x1KOx2KO normalized-scRNAseq-dropouts
x1KOx2UP normalized-scRNAseq-dropouts
x1UPx2KO normalized-scRNAseq-dropouts
x1UPx2UP normalized-scRNAseq-dropouts
x2KO normalized-scRNAseq-dropouts
x7KO normalized-scRNAseq-dropouts
x9KO normalized-scRNAseq-dropouts


  "Skewness": ss.skew(trajectory),
  "Kurtosis": ss.kurtosis(trajectory),
  return bound(*args, **kwds)


wt normalized-scRNAseq-nodropouts
x14KO normalized-scRNAseq-nodropouts
x1KO normalized-scRNAseq-nodropouts
x1KOx2KO normalized-scRNAseq-nodropouts
x1KOx2UP normalized-scRNAseq-nodropouts
x1UPx2KO normalized-scRNAseq-nodropouts
x1UPx2UP normalized-scRNAseq-nodropouts
x2KO normalized-scRNAseq-nodropouts
x7KO normalized-scRNAseq-nodropouts
x9KO normalized-scRNAseq-nodropouts
