In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix, load_npz
import scanorama
from newsinglecell import SingleCell
from feats_batchcorrection import IntegrateBatches, MergeBatches
from scplot import tSNEPlot

In [2]:
# Load Datasets
def _read_counts(matrix_path, genes_path):
    """Load one data set: a sparse matrix from an .npz file and its gene names.

    The gene file is read whole, stripped of trailing whitespace and split on
    whitespace, giving one array entry per gene name.
    """
    counts = load_npz(matrix_path)
    with open(genes_path) as handle:
        gene_names = np.array(handle.read().rstrip().split())
    return counts, gene_names


X1, genes1 = _read_counts('data/293t/tab.npz', 'data/293t/tab.genes.txt')
X2, genes2 = _read_counts('data/jurkat/tab.npz', 'data/jurkat/tab.genes.txt')
X3, genes3 = _read_counts('data/jurkat_293t_50_50/tab.npz', 'data/jurkat_293t_50_50/tab.genes.txt')

In [3]:
def filter_genes(X, genes, min_cells=1):
    """Drop genes that are expressed in fewer than ``min_cells`` cells.

    A gene counts as expressed in a cell when its value there is strictly
    greater than zero. The matrix is kept sparse throughout, so the full
    (cells x genes) dense array is never materialised.

    Parameters
    ----------
    X : scipy sparse matrix or array-like, shape (n_cells, n_genes)
        Expression matrix with one column per entry of ``genes``.
    genes : array-like of length n_genes
        Gene names aligned with the columns of ``X``.
    min_cells : int, default 1
        Minimum number of cells with a positive value required to keep a
        gene. The default reproduces the original behaviour (drop genes
        that are never expressed).

    Returns
    -------
    X : scipy.sparse.csr_matrix, shape (n_cells, n_genes_kept)
        The matrix restricted to the retained genes.
    genes : numpy.ndarray
        The names of the retained genes, in their original order.
    """
    # Accepts any sparse format (or a dense array) and normalises to CSR.
    X = csr_matrix(X)

    # Per-gene (column) count of cells holding a strictly positive value.
    # The sparse sum may return a 1 x n_genes matrix, hence the flatten.
    expr_count = np.asarray((X > 0).sum(axis=0)).ravel()
    gene_filter = expr_count >= min_cells

    # Column selection builds a new matrix, so the caller's data is untouched.
    X = X[:, gene_filter]
    # Match the dense round-trip of the original, which never stored zeros.
    X.eliminate_zeros()

    genes = np.asarray(genes)[gene_filter]

    return X, genes

In [5]:
# Run the gene filter over each (matrix, gene names) pair in load order,
# then rebind the per-dataset names to the filtered results.
_filtered = [
    filter_genes(matrix, names)
    for matrix, names in ((X1, genes1), (X2, genes2), (X3, genes3))
]
(X1, genes1), (X2, genes2), (X3, genes3) = _filtered

# Parallel lists: datasets[i] goes with genes_list[i].
datasets = [X1, X2, X3]
genes_list = [genes1, genes2, genes3]

In [7]:
# Run Scanorama's correction over the three filtered datasets. With
# return_dimred=True the call is unpacked into three results here:
# `integrated`, `corrected` and the gene list `genes`.
integrated, corrected, genes = scanorama.correct(
    datasets,
    genes_list,
    return_dimred=True,
    return_dense=True,
)

Found 15461 genes among all datasets
[[0.         0.00828984 0.70571924]
 [0.         0.         0.28369665]
 [0.         0.         0.        ]]
Processing datasets (0, 2)
Processing datasets (1, 2)


In [8]:
# One DataFrame per dataset, built from the transposed `integrated` output.
# NOTE(review): the printed summary for bat3 is (100, 3388), so rows look
# like the integrated dimensions and columns like cells -- confirm.
df1, df2, df3 = (pd.DataFrame(integrated[idx].T) for idx in range(3))

# Wrap each frame in a SingleCell object labelled with its dataset name.
bat1, bat2, bat3 = (
    SingleCell(dataset=label, data=frame)
    for label, frame in zip(
        ("293t", "jurkat", "jurkat_293t_50_50"),
        (df1, df2, df3),
    )
)

# Form batches
batches = [bat1, bat2, bat3]

In [9]:
# Print the SingleCell summary of the third batch ("jurkat_293t_50_50").
bat3.printSummary()

------------------------------------------------------------------------------
Dataset:  jurkat_293t_50_50
------------------------------------------------------------------------------
Dimension:  (100, 3388)
Cell Metadata:  ['cell_names']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [13]:
# Build the input list from the individual batch objects rather than reading
# `batches`: this cell rebinds `batches` to the merged result below, so taking
# it as input would make the cell fail (or misbehave) when re-run.
_batch_inputs = [bat1, bat2, bat3]

# Every batch is matched on its 'gene_names' gene-metadata column.
integrated_batches = IntegrateBatches(
    _batch_inputs,
    name_by=['gene_names'] * len(_batch_inputs),
)

# `batches` ends up bound to the single merged object, as later cells expect.
batches = MergeBatches(integrated_batches)
batches.printSummary()

Number of common genes in all batches:  100
------------------------------------------------------------------------------
Dataset:  293t+jurkat+jurkat_293t_50_50
------------------------------------------------------------------------------
Dimension:  (100, 9530)
Cell Metadata:  ['cell_names' 'batch']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [14]:
# t-SNE plot of the merged data; both colour and marker are keyed on the
# 'batch' cell-metadata column.
fig1 = tSNEPlot(
    batches,
    color_by='batch',
    marker_by='batch',
    marker_size=2,
    tsne_perplexity=600,
    tsne_iterations=400,
)
fig1.show()

Displaying chart at http://localhost:15961/
