In [2]:
import numpy as np
import numpy.random as rand
from collections import Counter
from scipy.interpolate import Rbf
from sklearn.manifold import TSNE

import glob
import time

import matplotlib.pyplot as plt
import seaborn as sns

# Import EQAPOL Normal Flow Data

In [3]:
# Location of the serialized EQAPOL arrays and the file name of each stimulation
tmp_dir = "/data/deep-learning-group/test_data/flow_EQAPOL/"
fname_Costim = "data_Costim.np"
fname_CMV    = "data_CMV.np"
fname_SEB    = "data_SEB.np"


def load_np_file(fname, directory = tmp_dir):
    """Read one array written by np.save from `directory + fname`.

    Position arguments:
    fname     --- str; file name inside `directory`

    Keyword arguments:
    directory --- str; folder prefix (must end with a path separator)

    Returns the loaded numpy array. The file handle is closed even when
    loading fails.
    """
    with open(directory + fname, 'rb') as file_object:
        # The per-stimulation files hold object arrays (one 2-D array per
        # sample), which np.load only accepts with allow_pickle=True.
        # NOTE(security): unpickling can execute arbitrary code, so only load
        # files from a trusted location.
        return np.load(file_object, allow_pickle = True)


# read the files
print("read Costim")
data_Costim = load_np_file(fname_Costim)

print("read CMV")
data_CMV = load_np_file(fname_CMV)

print("read SEB")
data_SEB = load_np_file(fname_SEB)

print("read markers")
markers = load_np_file("markers.np")
# map the second field of every marker entry to its row index
markers_dict = {items[1]: idx for idx, items in enumerate(markers)}

print("The data are input.")

read Costim
read CMV
read SEB
read markers
The data are input.


In [6]:
# Pool the samples of the three stimulations into one 1-D object array.
# The samples hold different numbers of cells, so np.array(list_of_arrays)
# would try to build a rectangular array and raise on recent NumPy versions;
# filling a pre-allocated object array works for any mix of shapes.
sample_list = list(data_Costim) + list(data_CMV) + list(data_SEB)
data_samples = np.empty(len(sample_list), dtype = object)
for idx_sample, sample in enumerate(sample_list):
    data_samples[idx_sample] = sample

# one group label per sample: 0 = Costim, 1 = CMV, 2 = SEB
label_groups = np.array(
    [0] * len(data_Costim) +
    [1] * len(data_CMV) +
    [2] * len(data_SEB))

print("check dimension")
print("================")
print("Labels:  ", label_groups.shape)
print(Counter(label_groups))

# for the pooled array and each group: number of samples, then the shape of
# the first two samples
for group_name, group_data in [
        ("Samples: ", data_samples),
        ("Costim: ",  data_Costim),
        ("CMV:    ",  data_CMV),
        ("SEB:    ",  data_SEB)]:
    print("================")
    print(group_name, group_data.shape)
    print(group_name, group_data[0].shape)
    print(group_name, group_data[1].shape)

check dimension
Labels:   (33,)
Counter({0: 11, 1: 11, 2: 11})
Samples:  (33,)
Samples:  (455472, 14)
Samples:  (607410, 14)
Costim:  (11,)
Costim:  (455472, 14)
Costim:  (607410, 14)
CMV:     (11,)
CMV:     (411894, 14)
CMV:     (515178, 14)
SEB:     (11,)
SEB:     (422658, 14)
SEB:     (519426, 14)


# Convert flow data to image

In [13]:
# quick look: shape of the pooled sample array and of its first two samples
print(data_samples.shape)
print(data_samples[0].shape)
print(data_samples[1].shape)

(33,)
(455472, 14)
(607410, 14)


In [14]:
# row counts of the first two samples added together (expected rows after stacking)
455472 + 607410

1062882

In [11]:
# keep only the first two samples for a stacking check
samples = data_samples[0:2]

In [21]:
# stack the selected samples row-wise into a single 2-D array
tmp = np.vstack(samples)

In [22]:
# shape of the stacked array
tmp.shape

(1062882, 14)

In [18]:
# NOTE(review): `z` is not assigned in any cell visible in this notebook, so
# this cell relies on state left in the kernel -- define it here or remove the cell
z.shape

(2,)

In [20]:
# NOTE(review): `z` is not assigned in any cell visible in this notebook; this
# scratch cell only runs with leftover kernel state -- confirm or remove
np.vstack(z)

(5, 3)

In [None]:
# NOTE(review): unexecuted scratch cell. The tsne_* names are only parameters of
# create_tsne and are not defined at notebook level in any visible cell, so
# running this on a fresh kernel presumably raises NameError -- confirm or remove
tsne = TSNE(
    n_components = tsne_dimension, 
    verbose      = tsne_verbose, 
    perplexity   = tsne_perplexity, 
    n_iter       = tsne_iter, 
    random_state = tsne_rand_seed)

In [29]:
# NOTE(review): data_standardization, data_subsetting and create_tsne are defined
# in a cell further down the notebook; that cell has to be run before this one.

# standardize every sample (data_standardization uses sample 0 as reference by default)
tmp = data_samples
samples_std = data_standardization(tmp)
# positions of the subsets that are combined below; with num_subsets = 1 there is
# one subset per sample -- presumably these are the first two samples of each of
# the three groups (11 samples per group); verify against label_groups
idx = [0, 1, 11, 12, 22, 23]

#################################
print("=====  1000  (E3) cells ======")
# draw one random subset of 1000 cells from every standardized sample
data_labels_E3, data_samples_E3 = data_subsetting(
    samples_std, 
    label_groups, 
    k = 1000, 
    num_subsets = 1, 
    verbose = True)


# stack the selected subsets into one pseudo-sample and add a leading
# sample axis so the array matches the (samples, cells, features) layout
tmp = data_samples_E3[idx]
tmp = np.vstack(tmp)
tmp = np.expand_dims(tmp, axis = 0)
data_samples_E3_subset = tmp

# a single t-SNE embedding is fit on the stacked pseudo-sample, i.e. on the
# cells of all subsets selected by idx together (not one embedding per group)
data_tsne_E3_subset = create_tsne(
    data_samples_E3_subset, 
    verbose = True)

Data Subsetting...
...Finish
Create t-SNE plots...
	Prepare t-SNE plot of the 0 sample
...Finish


In [31]:
# short aliases for the embedding and the stacked marker values, then compare
# their shapes (note: `tsne` is rebound here to an array)
tsne, sample = data_tsne_E3_subset, data_samples_E3_subset
print(tsne.shape)
print(sample.shape)

(1, 6000, 2)
(1, 6000, 14)


In [35]:
# split the joint embedding into 6 consecutive blocks of 1000 points each;
# presumably one block per stacked subset, assuming row order was preserved
tmp = tsne.reshape(6, 1000, 2)
tmp.shape

(6, 1000, 2)

In [38]:
# pick the subsets listed in idx and check the resulting shape
tmp = data_samples_E3[idx]
tmp.shape

(6, 1000, 14)

In [40]:
# merge the first two axes into one and keep a leading axis of length 1
# (the last axis is fixed at 14 columns)
tmp = tmp.reshape([1, -1, 14])

In [41]:
# shape after the reshape
tmp.shape

(1, 6000, 14)

In [32]:
# convert tsne to image: interpolate every feature of the stacked sample over
# its t-SNE coordinates (create_img is defined in a cell further down)
tsne, sample = data_tsne_E3_subset, data_samples_E3_subset
data_img_E3_subset  = create_img(
    tsne, 
    sample, 
    verbose = True, 
    verbose_sample = True, 
    verbose_marker = True)

Create images from t-SNE plot...
	Prepare image of the 0 sample
		interpolating the 0 feature
		interpolating the 1 feature
		interpolating the 2 feature
		interpolating the 3 feature
		interpolating the 4 feature
		interpolating the 5 feature
		interpolating the 6 feature
		interpolating the 7 feature
		interpolating the 8 feature
		interpolating the 9 feature
		interpolating the 10 feature
		interpolating the 11 feature
		interpolating the 12 feature
		interpolating the 13 feature
...Finish


In [33]:
# shape of the generated image array
data_img_E3_subset.shape

(1, 14, 128, 128)

In [25]:
# stack the entries of data_samples_E3_subset row-wise into one 2-D array
tmp = np.vstack(data_samples_E3_subset)
tmp.shape

(6000, 14)

In [27]:
# add a leading axis of length 1 in front of the stacked array
tmp = np.expand_dims(tmp, axis = 0)

In [28]:
# shape after adding the leading axis
tmp.shape

(1, 6000, 14)

In [None]:
# NOTE(review): unexecuted cell that repeats the t-SNE and image steps already
# run in earlier cells -- presumably a leftover; confirm or remove

# embed the stacked sample
data_tsne_E3_subset = create_tsne(
    data_samples_E3_subset, 
    verbose = True)


# convert tsne to image
tsne, sample = data_tsne_E3_subset, data_samples_E3_subset
data_img_E3_subset  = create_img(
    tsne, 
    sample, 
    verbose = True, 
    verbose_sample = True, 
    verbose_marker = True)

In [5]:
def check_dimension(samples):
    """Report the overall rank of a collection of samples.

    The samples may hold a different number of rows each (ragged first
    axis), so the rank is derived from the individual samples rather than
    from `samples.shape`: every sample of rank r contributes an overall
    rank of r + 1. A well-formed input (samples, events, markers) has
    2-D samples and therefore yields 3.

    Position arguments:
    samples --- iterable of array-likes; one entry per sample

    Returns:
    (checked_value, error_message)
        checked_value --- int; 3 when every sample is 2-D. When a sample
                          has another rank r, r + 1 of the first such
                          sample is reported. 0 when there are no samples.
        error_message --- str; message for callers to raise/assert with
                          when checked_value != 3

    >>> import numpy as np
    >>> arr = np.array([[[1], [2]], [[3], [4]]])
    >>> arr.shape
    (2, 2, 1)
    >>> check_dimension(arr)
    (3, 'Wrong input dimension; Expected 3 but 3 given; the samples should contain (samples, events, markers)')

    >>> ragged = np.empty(2, dtype=object)
    >>> ragged[0] = np.array([[11, 12, 13], [14, 15, 16]])
    >>> ragged[1] = np.array([[31, 32, 33]])
    >>> check_dimension(ragged)[0]
    3

    >>> check_dimension([])[0]
    0
    """
    # rank of every individual sample
    ranks = [np.ndim(sample) for sample in samples]
    # samples that are not (events, markers) matrices
    off_ranks = [rank for rank in ranks if rank != 2]

    if not ranks:
        # nothing to inspect: report a value that can never pass the check
        checked_value = 0
    elif off_ranks:
        # report the first offending sample so mixed ranks cannot pass
        checked_value = off_ranks[0] + 1
    else:
        checked_value = 3

    error_message = \
        "Wrong input dimension; Expected 3 but " + str(checked_value) + \
        " given; the samples should contain (samples, events, markers)"

    return checked_value, error_message

################################################################################

def data_standardization(samples, idx_sample = 0, verbose = False):
    """Standardize the features of all samples using one reference sample.

    The per-feature mean and standard deviation are computed on
    `samples[idx_sample]` only and then applied to every sample.

    Position arguments:
    samples    --- numpy array; (num_samples_tot, num_cells_tot, num_features)
                   note: the num_cells_tot is not fixed for each sample, so
                   this may be a 1-D object array of 2-D arrays
    idx_sample --- int; which sample is used for standardization; default to 0

    Keyword arguments:
    verbose --- print out the running process

    Returns:
    numpy array of standardized samples. A regular 3-D array when all
    samples share one shape, otherwise a 1-D object array holding one 2-D
    array per sample.

    Raises:
    AssertionError --- wrong input dimension or idx_sample out of range
    """

    if (verbose):
        print("Data Standardization...")

    # every sample must be an (events, markers) matrix
    checked_value, error_message = check_dimension(samples)
    assert (checked_value == 3), error_message
    assert samples.shape[0] > idx_sample, "Incorrect input of idx_sample"

    # calculate mu and sd on the reference sample
    reference = samples[idx_sample]
    mu = np.mean(reference, axis=0)
    sd = np.std(reference, axis=0)

    # a feature that is constant in the reference has sd == 0; dividing by it
    # would give NaN/inf, so such features are only centered
    sd = np.where(sd == 0, 1.0, sd)

    # standardize
    standardized = [(sample - mu) / sd for sample in samples]

    if len({sample.shape for sample in standardized}) <= 1:
        samples_stdard = np.array(standardized)
    else:
        # samples with different cell counts cannot form a rectangular array;
        # np.array would raise on recent NumPy, so fill an object array instead
        samples_stdard = np.empty(len(standardized), dtype=object)
        for idx, sample in enumerate(standardized):
            samples_stdard[idx] = sample

    if (verbose):
        print("...Finish")

    return(samples_stdard)

##############################################################################

def data_subsetting(samples, label_groups, k = 1000, num_subsets = 10, rand_seed = 0, verbose = False):
    """Draw random subsets of cells from each sample.

    For every sample, `num_subsets` subsets are drawn; each subset takes the
    first k entries of a fresh random permutation of the sample's rows, so
    cells are not repeated within a subset. A sample with fewer than k cells
    contributes all of its cells (in random order) to each subset.

    Position arguments:
    samples      --- numpy array; (num_samples_tot, num_cells_tot, num_features)
                     num_cells_tot may differ between samples
    label_groups --- list or numpy array; label of each sample in samples

    Keyword arguments:
    k           --- number of cells drawn from each sample per subset
    num_subsets --- number of subsets per sample; each subset contains k cells
    rand_seed   --- seed passed to numpy's global random generator
    verbose     --- print out the running process

    Returns:
    (result_labels, result_samples)
        result_labels  --- numpy array; (num_samples_tot * num_subsets,)
        result_samples --- numpy array; (num_samples_tot * num_subsets, k,
                           num_features), or a 1-D object array of 2-D arrays
                           when the subsets do not all share one shape

    Raises:
    AssertionError --- wrong input dimension
    """

    if (verbose):
        print("Data Subsetting...")

    # check the dimension
    checked_value, error_message = check_dimension(samples)
    assert (checked_value == 3), error_message

    # initialization
    num_samples_tot = samples.shape[0]
    rand.seed(rand_seed)
    result_samples = []
    result_labels  = []

    # iterate through all samples
    for idx_sample in range(num_samples_tot):

        sample = samples[idx_sample]
        num_cells_tot = sample.shape[0]

        # every subset of this sample inherits the sample's label
        group = label_groups[idx_sample]
        result_labels += ([group] * num_subsets)

        # generate subsets in each sample
        for _ in range(num_subsets):

            # choose k cells randomly (without replacement)
            idx = rand.permutation(num_cells_tot)[:k]
            result_samples.append(sample[idx])

    # convert results from list to numpy array
    if len({subset.shape for subset in result_samples}) <= 1:
        result_samples = np.array(result_samples)
    else:
        # subsets of unequal size (a sample had fewer than k cells) cannot form
        # a rectangular array; np.array would raise on recent NumPy
        packed = np.empty(len(result_samples), dtype = object)
        for idx_subset, subset in enumerate(result_samples):
            packed[idx_subset] = subset
        result_samples = packed
    result_labels = np.array(result_labels)  # (num_samples_tot * num_subsets,)

    if (verbose):
        print("...Finish")

    return result_labels, result_samples

##############################################################################

def create_tsne(samples, verbose = False,
                tsne_dimension  = 2,
                tsne_perplexity = 40, 
                tsne_iter       = 300, 
                tsne_verbose    = 0,
                tsne_rand_seed  = 0):
    """Fit a separate t-SNE embedding for each sample.

    Position arguments:
    samples --- numpy array; (num_samples_tot, num_cells_tot, num_features)

    Keyword arguments:
    verbose         --- print out the running process
    tsne_dimension  --- passed to TSNE as n_components
    tsne_perplexity --- passed to TSNE as perplexity
    tsne_iter       --- number of optimization iterations; passed to TSNE as
                        max_iter when available, otherwise as n_iter
    tsne_verbose    --- passed to TSNE as verbose
    tsne_rand_seed  --- passed to TSNE as random_state

    Returns:
    numpy array with one embedding per sample;
    (num_samples_tot, num_cells_tot, tsne_dimension), or a 1-D object array
    of 2-D arrays when the samples hold different numbers of cells

    Raises:
    AssertionError --- wrong input dimension
    """
    # function-scope import: only needed to detect the TSNE keyword name
    import inspect

    if (verbose):
        print("Create t-SNE plots...")

    # check the dimension
    checked_value, error_message = check_dimension(samples)
    assert (checked_value == 3), error_message

    # initialization
    num_samples_tot = samples.shape[0]
    result_tsne = []

    # newer scikit-learn releases renamed TSNE's `n_iter` to `max_iter`;
    # use whichever keyword the installed version accepts
    tsne_params  = inspect.signature(TSNE).parameters
    iter_keyword = "max_iter" if "max_iter" in tsne_params else "n_iter"
    tsne_kwargs  = {
        "n_components": tsne_dimension,
        "verbose":      tsne_verbose,
        "perplexity":   tsne_perplexity,
        iter_keyword:   tsne_iter,
        "random_state": tsne_rand_seed}

    # generate tsne plot for each sample
    for idx_sample in range(num_samples_tot):
        if (verbose):
            print("\tPrepare t-SNE plot of the", idx_sample, "sample")

        # a fresh estimator per sample, so every embedding is fit independently
        tsne = TSNE(**tsne_kwargs)
        res = tsne.fit_transform(samples[idx_sample])
        result_tsne.append(res)

    # convert the result from list to numpy array
    if len({res.shape for res in result_tsne}) <= 1:
        result_tsne = np.array(result_tsne)
    else:
        # embeddings of samples with different cell counts cannot form a
        # rectangular array; np.array would raise on recent NumPy
        packed = np.empty(len(result_tsne), dtype = object)
        for idx_res, res in enumerate(result_tsne):
            packed[idx_res] = res
        result_tsne = packed

    if (verbose):
        print("...Finish")

    return result_tsne

##############################################################################

def create_img(tsne_plots, samples, n_grid = 128, 
               verbose = False, 
               verbose_sample = False, 
               verbose_marker = False):
    """Create a multi-channel image from each t-SNE plot.

    For every sample, each feature is interpolated (radial basis functions)
    from the t-SNE coordinates of the cells onto a regular n_grid x n_grid
    grid spanning the range of the plot. The features become the channels of
    the image, and each image is min-max scaled to [0, 1] over all of its
    channels together.

    Position arguments:
    tsne_plots --- numpy array; (num_samples_tot, num_cells_tot, 2)
    samples    --- numpy array; (num_samples_tot, num_cells_tot, num_features)

    Keyword arguments:
    n_grid         --- the dimension of image (n_grid x n_grid)
    verbose        --- print out the start and the end of the process
    verbose_sample --- print out a line for every sample
    verbose_marker --- print out a line for every interpolated feature

    Returns:
    numpy array; (num_samples_tot, num_features, n_grid, n_grid). If
    tsne_plots and samples differ in length, only the leading entries present
    in both are converted. An image whose interpolated values are all equal
    is returned as all zeros.

    Raises:
    AssertionError --- wrong input dimension of samples
    """

    if (verbose):
        print("Create images from t-SNE plot...")

    # check the dimension
    checked_value, error_message = check_dimension(samples)
    assert (checked_value == 3), error_message

    # initialization
    num_samples_tot = min(tsne_plots.shape[0], samples.shape[0])
    result_img = []

    # iterate though each samples
    for idx_sample in range(num_samples_tot):
        if (verbose_sample):
            print("\tPrepare image of the", idx_sample, "sample")

        # initialization in each loop
        sample = samples[idx_sample]
        num_features = sample.shape[1]

        # get x, y coordinate of a plot
        tsne_plot = tsne_plots[idx_sample]
        x = tsne_plot[:, 0]
        y = tsne_plot[:, 1]

        # generate a grid covering the bounding box of the plot
        x_c = np.linspace(np.min(x), np.max(x), n_grid)
        y_c = np.linspace(np.min(y), np.max(y), n_grid)
        x_c, y_c = np.meshgrid(x_c, y_c)

        # each feature is a layer/channel for the plot
        # to get each layer, perform interpolation to convert tSNE plot in a image
        img = []
        for idx_feature in range(num_features):

            if (verbose_marker):
                print("\t\tinterpolating the", idx_feature, "feature")

            # interpolation of the feature values over the t-SNE coordinates
            z = sample[:, idx_feature]
            rbfi = Rbf(x, y, z, function='multiquadric', smooth=1)

            # evaluate on the grid and store as one channel
            img.append(rbfi(x_c, y_c))

        # normalize & arrange the interpolated feature values
        img = np.array(img)
        img_min   = np.min(img)
        img_range = np.max(img) - img_min
        if img_range == 0:
            # constant image: min-max scaling would compute 0 / 0
            img = np.zeros_like(img)
        else:
            img = (img - img_min) / img_range

        # append each interpolated result to the result
        # (already scaled to [0, 1]; a second scaling pass would change nothing)
        result_img.append(img)

    result_img = np.array(result_img)

    if (verbose):
        print("...Finish")

    return result_img