# Load Tools

In [None]:
import numpy as np
import numpy.random as rand

# Set parameters

In [80]:
# number of groups
num_groups = 2

# number of samples for each group (dim = m)
num_samples = np.r_[70, 30]
# total number of samples, derived so it cannot drift from num_samples
num_samples_tot = int(num_samples.sum())

# number of cells for each sample
# NOTE(review): the recorded outputs further down show 10000 cells per
# sample, not 100000 -- confirm which value is intended before re-running.
num_cells_tot = 100000

# number of cell types for each sample
# for example: T cell and B cell
num_celltypes = 2

# proportion of cell types (dim = k)
# one row per group; each row sums to one
pi_1 = [0.9, 0.1]
pi_2 = [0.1, 0.9]
pi = np.stack([pi_1, pi_2])

# number of genes / markers / features (dim = p)
# for example: CD3, CD8, IL-1-R
num_features = 3

# mean value for gene expression measured by an experiment
mu_11 = np.r_[ 0,  0,  0] # first  cell type of group 1
mu_12 = np.r_[10, 10, 10] # second cell type of group 1
mu_21 = np.r_[ 0,  0,  0] # first  cell type of group 2
mu_22 = np.r_[10, 10, 10] # second cell type of group 2
# stacked in group-major order, so the reshape yields mu[group][celltype]
mu = np.stack((mu_11, mu_12, mu_21, mu_22), axis = 0)
mu = mu.reshape(num_groups, num_celltypes, num_features)

# var-cov of mu (mean value of gene expression)
# --- covariance of sample mean (statistics)
sig2_mu_11 = np.eye(num_features)
sig2_mu_12 = np.eye(num_features)
sig2_mu_21 = np.eye(num_features)
sig2_mu_22 = np.eye(num_features)
sig2_mu = np.stack((sig2_mu_11, sig2_mu_12, sig2_mu_21, sig2_mu_22), axis = 0)
sig2_mu = sig2_mu.reshape(num_groups, num_celltypes, num_features, num_features)

# var-cov of sample (gene expression)
# --- covariance of sample value (random variables)
sig2_11 = np.eye(num_features)
sig2_12 = np.eye(num_features)
sig2_21 = np.eye(num_features)
sig2_22 = np.eye(num_features)
sig2 = np.stack((sig2_11, sig2_12, sig2_21, sig2_22), axis = 0)
sig2 = sig2.reshape(num_groups, num_celltypes, num_features, num_features)

# sanity checks: fail here rather than deep inside the simulation
assert len(num_samples) == num_groups, "need one sample count per group"
assert pi.shape == (num_groups, num_celltypes), "pi must be (groups, cell types)"
assert np.allclose(pi.sum(axis = 1), 1), "each row of pi must sum to one"

In [36]:
def data_simulation(
    num_groups, num_samples, num_cells_tot, num_celltypes, 
    pi, mu, sig2_mu, sig2,
    rand_seed = 0):
    """Simulate gene expression with multivariate normal distributions.

    For every sample the function (1) draws how many cells of each cell type
    it contains from a multinomial, (2) draws a sample-specific mean vector
    for every cell type, and (3) draws the individual cells around those
    means. The number of features is taken from the last axis of ``mu``.

    Positional arguments:
    num_groups    --- number of groups
    num_samples   --- number of samples in each group (one entry per group)
    num_cells_tot --- number of cells in each sample
    num_celltypes --- number of cell types in each sample
    pi            --- proportion of cell types; pi[group] is the probability
                      vector handed to the multinomial
    mu            --- mean gene expression, indexed mu[group][celltype]
    sig2_mu       --- var-cov matrix of the per-sample mean,
                      indexed sig2_mu[group][celltype]
    sig2          --- var-cov matrix of the cells around their sample mean,
                      indexed sig2[group][celltype]

    Keyword arguments:
    rand_seed     --- seed passed to np.random.seed; default 0

    Return:
    label_groups  --- group index of every sample, groups in order
    num_cells     --- cell count per cell type for every sample,
                      shape (total samples, len(pi[group]))
    samples       --- simulated data,
                      shape (total samples, num_cells_tot, number of features)

    Note: the function reseeds NumPy's global legacy random state.
    """

    # seed the global generator that the `rand` alias draws from
    np.random.seed(rand_seed)

    # feature count comes from the data instead of a hard-coded 3
    num_features = np.shape(mu)[-1]

    ##### Label the samples in each group #####
    label_groups = np.repeat(
        np.arange(num_groups),
        np.asarray(num_samples)[:num_groups])

    ##### number of cells of each type, per sample, driven by pi #####
    num_cells = np.vstack([
        rand.multinomial(
            num_cells_tot,
            pvals = pi[idx_group],
            size = num_samples[idx_group])
        for idx_group in range(num_groups)
    ])

    ##### sample-specific mean of every cell type #####
    # Draws happen group by group, cell type by cell type. Each group is
    # stacked on its own, so groups of different (or equal) sizes never have
    # to fit into one rectangular / ragged array.
    mean_blocks = []
    for idx_group in range(num_groups):
        per_celltype = [
            rand.multivariate_normal(
                mu[idx_group][idx_celltype],
                sig2_mu[idx_group][idx_celltype],
                num_samples[idx_group])
            for idx_celltype in range(num_celltypes)]
        # (samples in group, num_celltypes, num_features)
        mean_blocks.append(np.stack(per_celltype, axis = 1))

    # (total samples, num_celltypes, num_features)
    mean_features = np.concatenate(mean_blocks, axis = 0)

    ##### create data for each sample #####
    # each sample can be viewed as one fcs file
    samples = []
    for idx_sample, idx_group in enumerate(label_groups):
        # start from an empty float block so a sample always has 2 dims
        cell_blocks = [np.empty((0, num_features))]

        for idx_celltype in range(num_cells.shape[1]):
            cell_blocks.append(rand.multivariate_normal(
                mean_features[idx_sample][idx_celltype], # mean
                sig2[idx_group][idx_celltype],           # var-cov
                num_cells[idx_sample][idx_celltype]))    # size

        # cells are ordered by cell type within a sample
        samples.append(np.concatenate(cell_blocks, axis = 0))

    # every sample holds num_cells_tot rows, so this is a regular 3-d array
    samples = np.array(samples)

    ##### return results #####
    return label_groups, num_cells, samples

In [37]:
# Run the simulation with the parameters defined above (default seed).
label_groups, num_cells, samples = data_simulation(
    num_groups    = num_groups,
    num_samples   = num_samples,
    num_cells_tot = num_cells_tot,
    num_celltypes = num_celltypes,
    pi            = pi,
    mu            = mu,
    sig2_mu       = sig2_mu,
    sig2          = sig2,
)

# Check the results

In [42]:
# One line each: number of labels, shape of the cell counts, shape of the data.
print(len(label_groups), num_cells.shape, samples.shape, sep = "\n")

100
(100, 2)
(100, 10000, 3)


In [43]:
# Samples per group, as configured in the parameter cell.
print(num_samples)

[70 30]


In [45]:
# Group label of every sample, followed by the size of group 0 and group 1.
print(label_groups)
in_group_0 = label_groups == 0
in_group_1 = label_groups == 1
print(in_group_0.sum(), in_group_1.sum())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
70 30


In [46]:
# Spot-check the cell counts on both sides of the boundary between the two
# groups: the first two samples, the last sample of the first group, the
# first sample of the second group, and the very last sample.
last_of_first_group = num_samples[0] - 1
rows_to_show = [0, 1, last_of_first_group, last_of_first_group + 1, -1]
print(num_cells[rows_to_show])

[[8997 1003]
 [8982 1018]
 [9048  952]
 [ 999 9001]
 [1040 8960]]


In [47]:
# Print the simulated data (NumPy abbreviates large arrays with "...").
print(samples)

[[[-1.10191262 -1.52549171  3.03762104]
  [-1.214382   -2.04378773  1.20938102]
  [-1.91932879 -1.35201725  0.82297458]
  ...
  [10.53040522  8.31655456  9.04669809]
  [10.99140359  8.56022423  8.90590493]
  [ 9.4879437   7.61942902 10.73293044]]

 [[ 0.99113434  1.47603428  2.70711178]
  [ 0.38069921  0.62023047  2.1118866 ]
  [ 0.9518702   1.01826669  2.2984175 ]
  ...
  [11.23400959  7.66246366 11.01828855]
  [11.14748359  9.69451001  9.83535592]
  [10.23141513  8.90921104  9.97583979]]

 [[ 1.23300445 -2.11165289 -2.51016569]
  [ 2.61822097  0.55930279 -0.3821884 ]
  [ 0.79136933 -1.52344528  0.25442344]
  ...
  [ 7.18310854  8.92582139  8.04712339]
  [ 6.92992611  8.81767393  9.56531383]
  [ 8.05121296  8.52239954 10.53147221]]

 ...

 [[ 2.30053592 -2.43259247 -0.23464719]
  [ 1.34419874 -1.9241848   1.35645815]
  [ 0.48238724 -0.83932942 -0.54178364]
  ...
  [ 8.74199836 11.44930109  8.62235207]
  [ 9.07357258  7.93862767  8.27567736]
  [ 8.68557861  8.581595    9.59659766]]

 [

# Store the results

In [64]:
# name of the archive the results are written to
# (kept without extension because the reload cell opens this exact name)
file_name = "data_simulation"

# Write the three result arrays into one .npz-format archive.
# The with-block closes the file even if np.savez raises.
with open(file_name, 'wb') as file_object: # wb --- write binary
    np.savez(
        file_object,
        label_groups = label_groups,
        num_cells = num_cells,
        samples = samples)

# Reload the results

In [74]:
# name of the archive to read back (same name the store cell wrote to)
file_name_r = "data_simulation"

# open the file for reading; the handle is deliberately left open here
# because the following cells read arrays from `data`, and it is closed
# in the last cell
file_object_r = open(file_name_r, 'rb') # rb --- read binary

# read data
data = np.load(file_object_r)

In [75]:
# Show what np.load returned and which arrays the archive contains.
print(type(data), data.files, sep = "\n")

<class 'numpy.lib.npyio.NpzFile'>
['label_groups', 'num_cells', 'samples']


In [76]:
# Group labels as read back from the archive.
data['label_groups']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [77]:
# Per-sample cell counts as read back from the archive.
data['num_cells']

array([[8997, 1003],
       [8982, 1018],
       [9004,  996],
       [9018,  982],
       [9075,  925],
       [9024,  976],
       [9009,  991],
       [8999, 1001],
       [9049,  951],
       [8935, 1065],
       [9009,  991],
       [9029,  971],
       [9046,  954],
       [8981, 1019],
       [9025,  975],
       [8996, 1004],
       [9036,  964],
       [8986, 1014],
       [8940, 1060],
       [8996, 1004],
       [8944, 1056],
       [8985, 1015],
       [8990, 1010],
       [8996, 1004],
       [8979, 1021],
       [8992, 1008],
       [8960, 1040],
       [8975, 1025],
       [8993, 1007],
       [8953, 1047],
       [9006,  994],
       [9008,  992],
       [8999, 1001],
       [8927, 1073],
       [8931, 1069],
       [9053,  947],
       [8983, 1017],
       [8982, 1018],
       [8998, 1002],
       [9037,  963],
       [8968, 1032],
       [8961, 1039],
       [9008,  992],
       [9014,  986],
       [9033,  967],
       [8987, 1013],
       [8951, 1049],
       [9003,

In [78]:
# Simulated data as read back from the archive.
data['samples']

array([[[-1.10191262, -1.52549171,  3.03762104],
        [-1.214382  , -2.04378773,  1.20938102],
        [-1.91932879, -1.35201725,  0.82297458],
        ...,
        [10.53040522,  8.31655456,  9.04669809],
        [10.99140359,  8.56022423,  8.90590493],
        [ 9.4879437 ,  7.61942902, 10.73293044]],

       [[ 0.99113434,  1.47603428,  2.70711178],
        [ 0.38069921,  0.62023047,  2.1118866 ],
        [ 0.9518702 ,  1.01826669,  2.2984175 ],
        ...,
        [11.23400959,  7.66246366, 11.01828855],
        [11.14748359,  9.69451001,  9.83535592],
        [10.23141513,  8.90921104,  9.97583979]],

       [[ 1.23300445, -2.11165289, -2.51016569],
        [ 2.61822097,  0.55930279, -0.3821884 ],
        [ 0.79136933, -1.52344528,  0.25442344],
        ...,
        [ 7.18310854,  8.92582139,  8.04712339],
        [ 6.92992611,  8.81767393,  9.56531383],
        [ 8.05121296,  8.52239954, 10.53147221]],

       ...,

       [[ 2.30053592, -2.43259247, -0.23464719],
        [ 1

In [79]:
# close the file handle that was opened for reading in the reload cell
file_object_r.close()