# Load Tools

In [1]:
import numpy as np
import numpy.random as rand

# Set parameters

Parameter design: candidate values to try in different combinations
- pi_1 = [0.9, 0.1]
- pi_2 = [0.8, 0.2]
- pi_3 = [0.1, 0.9]
- mu_1 = [0,   0,   0  ]
- mu_2 = [0,   0.1, 0.2]
- mu_3 = [1,   1,   1  ]
- mu_sd = eye
- sd_1 = no correlation
- sd_2 = correlation = 0~0.1
- sd_3 = correlation = -1~1

In [2]:
# ---- experimental design ----
# how many groups of samples are simulated
num_groups = 2

# samples per group (dim = m) and the overall number of samples
num_samples_tot = 100
num_samples = np.array([70, 30])

# cells measured in every sample
num_cells_tot = 100000

# cell types present in every sample
# for example: T cell and B cell
num_celltypes = 2

# ---- cell-type proportions (dim = k) ----
# one vector per group; each vector sums to one
pi_1 = [0.9, 0.1]
pi_2 = [0.1, 0.9]

pi = np.array([pi_1, pi_2])

# ---- features ----
# genes / markers / features measured per cell (dim = p)
# for example: CD3, CD8, IL-1-R
num_features = 3

# ---- mean gene expression, named mu_<group><celltype> ----
mu_11 = np.array([ 0,  0,  0]) # group 1, first  cell type
mu_12 = np.array([10, 10, 10]) # group 1, second cell type
mu_21 = np.array([ 0,  0,  0]) # group 2, first  cell type
mu_22 = np.array([10, 10, 10]) # group 2, second cell type

# arrange as (num_groups, num_celltypes, num_features)
mu = np.stack([mu_11, mu_12, mu_21, mu_22]).reshape(
    num_groups, num_celltypes, num_features)

# ---- var-cov of mu: spread of the per-sample mean expression ----
sig2_mu_11 = np.identity(num_features)
sig2_mu_12 = np.identity(num_features)
sig2_mu_21 = np.identity(num_features)
sig2_mu_22 = np.identity(num_features)

# arrange as (num_groups, num_celltypes, num_features, num_features)
sig2_mu = np.stack([sig2_mu_11, sig2_mu_12, sig2_mu_21, sig2_mu_22]).reshape(
    num_groups, num_celltypes, num_features, num_features)

# ---- var-cov of the cells: spread of expression within one sample ----
sig2_11 = np.identity(num_features)
sig2_12 = np.identity(num_features)
sig2_21 = np.identity(num_features)
sig2_22 = np.identity(num_features)

# arrange as (num_groups, num_celltypes, num_features, num_features)
sig2 = np.stack([sig2_11, sig2_12, sig2_21, sig2_22]).reshape(
    num_groups, num_celltypes, num_features, num_features)

In [3]:
def data_simulation(
    num_groups, num_samples, num_cells_tot, num_celltypes, 
    pi, mu, sig2_mu, sig2,
    rand_seed = 0):
    """Simulate gene expression with multivariate normal distributions.

    For every sample, the number of cells of each cell type is drawn from a
    multinomial distribution, a sample-specific mean expression vector is
    drawn for each cell type, and finally the cells themselves are drawn
    around those sample-specific means.

    Position arguments:
    num_groups    --- number of groups
    num_samples   --- # of samples for each group; indexed by group
    num_cells_tot --- # of cells for each sample
    num_celltypes --- # of cell types for each sample
    pi            --- proportion of cell types in each group;
                      pi[g] is the multinomial probability vector of group g
    mu            --- mean gene expression; mu[g][c] is the mean vector of
                      cell type c in group g (its length is the # of features)
    sig2_mu       --- sig2_mu[g][c] is the var-cov matrix of the
                      sample-specific mean of cell type c in group g
    sig2          --- sig2[g][c] is the var-cov matrix of the cells of
                      cell type c in group g

    Keyword arguments:
    rand_seed     --- random seed; default to 0

    Return:
    label_groups  --- group index of every sample, shape (total samples,)
    num_cells     --- number of cells of each cell type in every sample,
                      shape (total samples, # of cell types)
    samples       --- the simulated data,
                      shape (total samples, num_cells_tot, # of features)

    Note:
    The legacy global NumPy random state is re-seeded on every call.
    """

    # Seed the global NumPy RNG so that repeated calls give identical draws.
    np.random.seed(rand_seed)

    # The number of features is read from the mean vectors instead of being
    # hard-coded, so any number of markers can be simulated.
    num_features = np.shape(mu)[-1]

    # Plain Python ints for the group sizes (num_samples may be a list or array).
    group_sizes = [int(num_samples[idx]) for idx in range(num_groups)]

    ##### Label the samples in each group #####
    label_groups = np.repeat(np.arange(num_groups), group_sizes)

    ##### create number of each cell types based on pi for each sample in each group ######
    num_cells = np.vstack([
        np.random.multinomial(
            num_cells_tot, 
            pvals = pi[idx], 
            size = group_sizes[idx]) 
        for idx in range(num_groups)
    ]) # end vstack / list comprehension

    ##### create mean value of gene expression for each cell type in each sample #####
    # Draw order is group-major, then cell type. Each draw has shape
    # (samples in group, features); stacking on axis 1 gives
    # (samples in group, cell types, features) and concatenating the groups
    # gives (total samples, cell types, features). This works whether or not
    # the groups have the same number of samples.
    mean_features = np.concatenate([
        np.stack([
            np.random.multivariate_normal(
                mu[idx_group][idx_celltype], 
                sig2_mu[idx_group][idx_celltype], 
                group_sizes[idx_group])
            for idx_celltype in range(num_celltypes)], axis = 1)
        for idx_group in range(num_groups)], axis = 0)

    ##### create data for each sample #####
    # each sample can be viewed as a fcs file
    samples = []
    for idx_sample, idx_group in enumerate(label_groups):

        # one block of cells per cell type, drawn around the sample's own mean
        cell_blocks = [
            np.random.multivariate_normal(
                mean_features[idx_sample][idx_celltype], # mean
                sig2[idx_group][idx_celltype],           # var-cov
                num_cells[idx_sample][idx_celltype])     # size
            for idx_celltype in range(num_cells.shape[1])]

        # join the blocks once instead of growing the array inside the loop
        a_sample = np.concatenate(cell_blocks, axis = 0).reshape(-1, num_features)
        samples.append(a_sample)

    # every sample holds num_cells_tot cells, so this is a regular 3-D array
    samples = np.array(samples)

    ##### return results #####
    return label_groups, num_cells, samples

In [4]:
# Run the simulation with the parameters defined above (default random seed).
label_groups, num_cells, samples = data_simulation(
    num_groups    = num_groups,
    num_samples   = num_samples,
    num_cells_tot = num_cells_tot,
    num_celltypes = num_celltypes,
    pi            = pi,
    mu            = mu,
    sig2_mu       = sig2_mu,
    sig2          = sig2,
)

# Check the results

In [5]:
# Number of labels, then the shapes of the cell counts and of the simulated data.
print(len(label_groups), num_cells.shape, samples.shape, sep = "\n")

100
(100, 2)
(100, 100000, 3)


In [6]:
# Show the number of samples requested for each group.
print(num_samples)

[70 30]


In [7]:
# Show the group label of every sample ...
print(label_groups)
# ... and how many samples carry label 0 and label 1.
print((label_groups == 0).sum(), (label_groups == 1).sum())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
70 30


In [8]:
# Cell counts for the first two samples, the two samples on either side of
# the boundary between the first and second group, and the last sample.
print(num_cells[np.array([0, 1, num_samples[0] - 1, num_samples[0], -1])])

[[89997 10003]
 [89947 10053]
 [90082  9918]
 [ 9860 90140]
 [ 9990 90010]]


In [9]:
# Print the simulated data; NumPy abbreviates the large array with "..." in the output.
print(samples)

[[[ 4.21842186e-01  3.40078877e-01 -6.22828330e-02]
  [ 1.11361267e+00 -4.63275636e-02 -9.34496936e-01]
  [ 8.62272273e-01  1.18455901e-03 -7.75592690e-01]
  ...
  [ 9.48818096e+00  1.14705167e+01  7.47222129e+00]
  [ 8.41849621e+00  1.19509432e+01  7.95035127e+00]
  [ 7.95200069e+00  9.90332280e+00  7.35098483e+00]]

 [[-3.39105183e-01 -1.30444243e+00  4.98085965e-01]
  [-9.29506791e-01 -1.53587049e+00  6.47259013e-01]
  [-9.82015262e-01 -7.71320833e-02  5.93060158e-01]
  ...
  [ 1.08165682e+01  1.05264122e+01  8.29273427e+00]
  [ 8.95368689e+00  8.83191931e+00  6.97581849e+00]
  [ 9.66690534e+00  1.00582417e+01  7.82199366e+00]]

 [[ 2.83574827e-01 -1.01257808e+00 -1.86674833e+00]
  [-5.56673582e-01 -9.74032245e-01  1.44219546e+00]
  [ 9.56763413e-01  1.19713423e+00 -3.41712900e-01]
  ...
  [ 8.17089211e+00  8.12353728e+00  1.12285377e+01]
  [ 8.44289952e+00  9.33269705e+00  1.22390142e+01]
  [ 9.61667649e+00  7.05948102e+00  1.16211025e+01]]

 ...

 [[-2.65185911e-01 -1.52764097e+00

# Store the results

In [10]:
# output the value
file_name = "data_simulation.npz"

# open the file for writing
file_object = open(file_name, 'wb') # wb --- write binary

# write data to the file
np.savez(
    file_object, 
    label_groups = label_groups, 
    num_cells = num_cells, 
    samples = samples)

# close the file
file_object.close()

# reload data

In [20]:
# output the value
file_name_r = "data_simulation.npz"

# open the file for reading
file_object_r = open(file_name_r, 'rb') # wb --- write binary

# read data
data         = np.load(file_object_r)
x = data['label_groups']
y = data['num_cells']
z = data['samples']

# close the file
file_object_r.close()

In [21]:
# Type of the loaded object, then the names of the arrays stored in it.
print(type(data), data.files, sep = "\n")

<class 'numpy.lib.npyio.NpzFile'>
['label_groups', 'num_cells', 'samples']


In [22]:
# check dimension
print(x)
print(y.shape)
print(z.shape)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
(100, 2)
(100, 100000, 3)
