In [1]:
# Import core libraries
import json
import pandas as pd
import numpy as np

from newsinglecell import SingleCell

# Import our libraries
from Filtering import GeneFilter, LogFilter
from Normalization import FeatureNormalize
from feats_batchcorrection import CorrectBatches, IntegrateBatches, MergeBatches
from scplot import tSNEPlot, PCAPlot
# Scanorama (batch integration and correction)
import scanorama


In [2]:
def Cell2Cluster(data):
    """Encode cell-type labels as 1-based integer cluster ids.

    Each distinct label is mapped to its 1-based rank in the sorted set of
    distinct labels (the order produced by ``np.unique``).

    Parameters
    ----------
    data : array-like
        Cell-type labels of any shape (strings or numbers).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``data`` holding the cluster id
        of every label. The input is left unmodified.
    """
    labels = np.asarray(data)

    # Factorize in a single pass instead of relabelling in place: writing
    # codes back into the label array lets an already-assigned code collide
    # with a raw label that has not been processed yet (e.g. [0, 1, 2]
    # collapsed to [3, 3, 3]) and also clobbers the caller's data.
    # Work on the flattened labels so the shape of the inverse index does not
    # depend on how the installed NumPy handles N-d input.
    _, codes = np.unique(labels.ravel(), return_inverse=True)

    return codes.reshape(labels.shape) + 1

In [3]:
# Expression matrices, one CSV per batch (first CSV column is the index).
batch1 = pd.read_csv("data/batch1.csv", index_col=0)
batch2 = pd.read_csv("data/batch2i.csv", index_col=0)
batch3 = pd.read_csv("data/batch2ii.csv", index_col=0)

# Cell-type labels, converted to integer cluster ids by Cell2Cluster.
# Batch 1 labels are used as read from disk.
b1_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch1_labels.csv", index_col=0).values),
    columns = ["cell type"],
)

# The label files of the two remaining batches are transposed before
# encoding so that they end up as a single "cell type" column.
b2_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch2i_labels.csv", index_col=0).T.values),
    columns = ["cell type"],
)

b3_celldata = pd.DataFrame(
    Cell2Cluster(pd.read_csv("data/batch2ii_labels.csv", index_col=0).T.values),
    columns = ["cell type"],
)


In [4]:
# Wrap every batch (expression matrix + cell labels) in a SingleCell object.
bat1, bat2, bat3 = (
    SingleCell(dataset = name, data = counts, celldata = labels)
    for name, counts, labels in (
        ("simulation1", batch1, b1_celldata),
        ("simulation2", batch2, b2_celldata),
        ("simulation3", batch3, b3_celldata),
    )
)

# Sanity check: print the summary of the third batch.
bat3.printSummary()

------------------------------------------------------------------------------
Dataset:  simulation3
------------------------------------------------------------------------------
Dimension:  (100, 1000)
Cell Metadata:  ['cell type']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [5]:
# Batches handed to Scanorama. Both lists below are derived from this single
# list so that the i-th gene list always belongs to the i-th count matrix
# (previously the gene names of bat2 were paired with the counts of bat3).
scanorama_inputs = [bat1, bat3]

# Count matrices are transposed before being passed on; Scanorama documents
# its input as cells-by-genes matrices.
datasets = [sc.getCounts().T for sc in scanorama_inputs]
genes_list = [sc.getGeneColumnValues('gene_names') for sc in scanorama_inputs]

In [6]:
# Run Scanorama on the prepared inputs. With return_dimred=True the call
# yields three values, unpacked here as (integrated, corrected, genes);
# return_dense=True is passed through unchanged.
integrated, corrected, genes = scanorama.correct(
    datasets,
    genes_list,
    return_dimred=True,
    return_dense=True,
)

Found 100 genes among all datasets
[[0.    0.641]
 [0.    0.   ]]
Processing datasets (0, 1)


In [7]:

# Form batches: the two SingleCell objects that went through Scanorama,
# in the same order as the Scanorama outputs.
batches = [bat1, bat3]

# Replace each batch's counts with its Scanorama result, transposed back.
# NOTE(review): this stores `integrated` (the return_dimred output), not
# `corrected` - confirm that the embedding, rather than the batch-corrected
# expression matrix, is what should live in the counts slot.
for _slot, _sc in enumerate(batches):
    _sc.setCounts(integrated[_slot].T)

In [8]:
# Integrate the batches, using the 'gene_names' column of each one as the
# naming key, then merge the result into a single object and summarise it.
aligned_batches = IntegrateBatches(batches, name_by = ['gene_names', 'gene_names'])
batches = MergeBatches(aligned_batches)
batches.printSummary()

Number of common genes in all batches:  100
------------------------------------------------------------------------------
Dataset:  simulation1+simulation3
------------------------------------------------------------------------------
Dimension:  (100, 2000)
Cell Metadata:  ['cell type' 'batch']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [9]:
# t-SNE view of the merged data: colour encodes cell type, marker shape
# encodes the batch of origin; 'pca' is passed as the t-SNE initialisation.
tsne_options = {
    'color_by': 'cell type',
    'marker_by': 'batch',
    'tsne_init': 'pca',
}
fig5 = tSNEPlot(batches, **tsne_options)
fig5.show()

Displaying chart at http://localhost:21560/
