In [1]:
# Import core libraries
import json
import pandas as pd
import numpy as np

from singlecelldata import SingleCell

# Import our libraries
from feats import GeneFilter, LogFilter, FeatureNormalize, CorrectBatches, IntegrateBatches
 
from scplotlib import tSNEPlot, PCAPlot, LinePlot



In [2]:
def Cell2Cluster(data):
    """Encode cell-type labels as consecutive integer cluster ids starting at 1.

    Parameters
    ----------
    data : array-like
        Cell-type labels (strings or numbers) of any shape.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``data``. The sorted unique
        labels (in ``np.unique`` order) are mapped to 1, 2, 3, ...

    Notes
    -----
    The input is left untouched and a new array is returned. Relabelling the
    input in place, one class at a time, corrupts numeric labels: with labels
    ``[0, 1]`` the first pass turns the 0 into 1 and the second pass then
    rewrites *both* entries to 2.
    """
    labels = np.asarray(data)
    # Flatten before np.unique so the inverse index is 1-D regardless of the
    # NumPy version, then restore the caller's shape.
    _, codes = np.unique(labels.ravel(), return_inverse=True)
    return codes.reshape(labels.shape) + 1

In [3]:
# Expression tables of the three batches; the first CSV column becomes the index.
batch1, batch2, batch3 = (
    pd.read_csv(path, index_col=0)
    for path in ("data/batch1.csv", "data/batch2i.csv", "data/batch2ii.csv")
)

# Cell-type labels of batch 1, re-encoded by Cell2Cluster and wrapped in a
# one-column frame named "cell type".
b1_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch1_labels.csv", index_col=0).values),
    columns=["cell type"],
)

# The label tables of the other two batches are transposed before encoding.
b2_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch2i_labels.csv", index_col=0).T.values),
    columns=["cell type"],
)
b3_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch2ii_labels.csv", index_col=0).T.values),
    columns=["cell type"],
)


In [4]:
# Wrap each batch's expression table and its cell labels in a SingleCell object.
bat1, bat2, bat3 = (
    SingleCell(dataset=name, data=counts, celldata=labels)
    for name, counts, labels in (
        ("simulation1", batch1, b1_celldata),
        ("simulation2", batch2, b2_celldata),
        ("simulation3", batch3, b3_celldata),
    )
)

# Print the summary of the third batch.
bat3.print()

------------------------------------------------------------------------------
Dataset:  simulation3
------------------------------------------------------------------------------
Dimension:  (100, 1000)
Cell Metadata:  ['cell type']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [5]:
# First experiment: pair batch 1 with batch 3 (batch 2 is paired with batch 1 further down).
batches = [bat1, bat3]

In [6]:
# Integrate the listed batches; `name_by` names one gene-metadata column per
# batch (presumably the column used to match genes across batches — confirm).
# NOTE(review): `batches` is rebound from the input list to the integrated
# result, so re-running this cell alone passes the wrong kind of object;
# re-run the cell that builds the list first.
batches = IntegrateBatches(batches, name_by = ['gene_names', 'gene_names'])

Integrating Batches . . .
Number of common genes in all batches:  100
Merging Batches . . .


In [7]:
# Print the summary (dimension and metadata columns) of the integrated dataset.
batches.print()

------------------------------------------------------------------------------
Dataset:  simulation1+simulation3
------------------------------------------------------------------------------
Dimension:  (100, 2000)
Cell Metadata:  ['cell type' 'batch']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [8]:
# t-SNE of the integrated data before any batch correction:
# marker = batch, colour = cell type, PCA initialisation.
# `fig1` is kept because a later cell overlays connection lines on it, and the
# 't-SNE 1' / 't-SNE 2' cell columns read further down appear only after this call.
fig1 = tSNEPlot(batches, marker_by = 'batch', color_by = 'cell type', tsne_init = 'pca')
fig1

In [9]:
from sklearn.metrics import pairwise_distances
def ComputeMNNPairs(X_ref, X, k):
    """Find mutual nearest-neighbour (MNN) pairs between two sets of cells.

    Parameters
    ----------
    X_ref, X : (d x n) gene expression matrices
        One column per cell; the two matrices may hold different numbers
        of cells.
    k : int
        Number of nearest neighbours considered in each direction.

    Returns
    -------
    mnn : bool array, shape (n_ref, n)
        ``mnn[i, j]`` is True when column ``j`` of ``X`` is among the ``k``
        closest columns to column ``i`` of ``X_ref`` and vice versa.
    D : array, shape (n_ref, n)
        Euclidean distance between every column of ``X_ref`` and every
        column of ``X``.
    """
    n_ref, n_tar = X_ref.shape[1], X.shape[1]
    ref_to_tar = np.zeros([n_ref, n_tar], dtype = bool)
    tar_to_ref = np.zeros([n_ref, n_tar], dtype = bool)

    # Cells are stored as columns, so both matrices are transposed before the
    # pairwise distance computation.
    D = pairwise_distances(X = X_ref.T, Y = X.T, metric = "euclidean")

    # For each reference cell, flag its k closest target cells.
    for ref_cell, dists in enumerate(D):
        ref_to_tar[ref_cell, np.argsort(dists)[0:k]] = True

    # For each target cell, flag its k closest reference cells.
    for tar_cell, dists in enumerate(D.T):
        tar_to_ref[np.argsort(dists)[0:k], tar_cell] = True

    # A pair is "mutual" only when it was flagged in both directions.
    return ref_to_tar & tar_to_ref, D

In [10]:
# Print the dataset summary, then read the batch label and the two t-SNE
# coordinates of every cell from the cell metadata.
batches.printSummary()
batch, tsne1, tsne2 = (
    batches.getCellColumnValues(column)
    for column in ('batch', 't-SNE 1', 't-SNE 2')
)

# Split the coordinates per batch: (xr, yr) for 'simulation1' and
# (xt, yt) for 'simulation3'.
xr, yr = (coords[batch=='simulation1'] for coords in (tsne1, tsne2))
xt, yt = (coords[batch=='simulation3'] for coords in (tsne1, tsne2))

------------------------------------------------------------------------------
Dataset:  simulation1+simulation3
------------------------------------------------------------------------------
Dimension:  (100, 2000)
Cell Metadata:  ['cell type' 'batch' 't-SNE 1' 't-SNE 2']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [13]:
# Mutual nearest neighbours (k = 20) between the counts of the two individual
# batch objects (bat1 and bat3, not the integrated `batches` object).
X_ref = bat1.getCounts()
X_tar = bat3.getCounts()
mnn, _ = ComputeMNNPairs(X_ref, X_tar, 20)
# Row/column indices of the True entries: ref_idxs[p] and tar_idxs[p]
# together identify the p-th MNN pair.
ref_idxs, tar_idxs = np.nonzero(mnn)

# Upper bound on the number of MNN connections drawn on the t-SNE plot.
MAX_MNN_LINES = 500


def _mnn_line(pair, line_size):
    """Return a LinePlot joining the t-SNE positions of the `pair`-th MNN pair."""
    ends = pd.DataFrame({
        'x': [xr[ref_idxs[pair]], xt[tar_idxs[pair]]],
        'y': [yr[ref_idxs[pair]], yt[tar_idxs[pair]]],
    })
    return LinePlot(
        ends,
        x = 'x',
        y = 'y',
        color_by = None,
        style_by = None,
        size_by = None,
        line_color = 'black',
        line_style = [1, 0],
        line_size = line_size,
        xlabel = None,
        ylabel = None
    )


# Never index past the pairs that actually exist: a fixed range(1, 500)
# raises IndexError whenever fewer than 500 MNN pairs are found.
n_lines = min(MAX_MNN_LINES, ref_idxs.size)
if n_lines == 0:
    raise ValueError("No mutual nearest-neighbour pairs found; nothing to draw.")

# The first pair is drawn with line_size 2 and all others with 0.1, exactly
# as before. NOTE(review): confirm the thicker first line is intentional.
line = _mnn_line(0, 2)
for i in range(1, n_lines):
    line = line + _mnn_line(i, 0.1)

In [14]:
# Overlay the MNN connection lines on the pre-correction t-SNE plot, turn off
# the axis grid, set the view stroke width to 0, then show the chart.
(fig1 + line).configure_axis(grid = False).configure_view(strokeWidth = 0).show()

Displaying chart at http://localhost:15948/


In [9]:
# Batch-correct the integrated dataset with the batch order
# 'simulation1' then 'simulation3'.
# NOTE(review): sigma = 10 and svd_dim = 2 are unexplained magic numbers here —
# document why they were chosen or move them to named constants.
# NOTE(review): `batches` is overwritten by the corrected result, and this cell
# carries execution count 9 although it sits below cells 10-14, so the saved
# outputs were not produced by a top-to-bottom run.
batches = CorrectBatches(batches, correct_order = ['simulation1', 'simulation3'], sigma = 10, svd_dim = 2)

(100, 1000)
Correcting batches < simulation1  and  simulation3 >
Merging Batches . . .


In [None]:
# Debugging leftover: tar_idxs comes from np.nonzero(mnn), so its length is the
# number of MNN pairs found. Never executed in the saved notebook.
print(tar_idxs.shape)

In [11]:
# t-SNE after batch correction, drawn with the same encoding as the
# pre-correction plot (marker = batch, colour = cell type) for comparison.
fig2 = tSNEPlot(batches, color_by = 'cell type', marker_by = 'batch', tsne_init = 'pca')
fig2

Removing ' t-SNE 1 ' from CellData assay
Removing ' t-SNE 2 ' from CellData assay


In [None]:
# Second experiment: pair batch 1 with batch 2. This reuses (and overwrites)
# the `batches` name from the first experiment.
batches = [bat1, bat2]

In [None]:
# Integrate batch 1 and batch 2; `name_by` names one gene-metadata column per
# batch (presumably the column used to match genes across batches — confirm).
# NOTE(review): `batches` is rebound from the input list to the integrated
# result, so this cell cannot be re-run on its own.
batches = IntegrateBatches(batches, name_by = ['gene_names', 'gene_names'])

In [None]:
# NOTE(review): `MergeBatches` is not among the names imported from `feats` in
# the import cell, so on a fresh kernel this line raises NameError unless it
# is imported elsewhere. The saved output of the first IntegrateBatches run
# already reports "Merging Batches . . .", so this step may also be redundant
# — confirm against the feats API before keeping it.
batches = MergeBatches(batches)

In [None]:
# t-SNE of batch 1 + batch 2 before batch correction
# (marker = batch, colour = cell type).
fig3 = tSNEPlot(batches, marker_by = 'batch', color_by = 'cell type', tsne_init = 'pca')
fig3.show()

In [None]:
# Batch-correct batch 1 + batch 2 with the same settings as the first experiment.
# NOTE(review): sigma = 10 and svd_dim = 2 are unexplained magic numbers here;
# `batches` is overwritten by the corrected result.
batches = CorrectBatches(batches, correct_order = ['simulation1', 'simulation2'], sigma = 10, svd_dim = 2)

In [None]:
# t-SNE of batch 1 + batch 2 after batch correction, same encoding as fig3
# for comparison.
fig4 = tSNEPlot(batches, color_by = 'cell type', marker_by = 'batch', tsne_init = 'pca')
fig4.show()