In [1]:
# Import core libraries
import pandas as pd
import numpy as np
import scipy.sparse

# Import our libraries
from singlecelldata import SingleCell
from feats import CorrectBatches, IntegrateBatches, MergeBatches, LogFilter, GeneFilter, HVGFilter, FeatureNormalize, PCA
from scplotlib import tSNEPlot

In [2]:
# Load Datasets

# Change the path to the location of the datasets.
path = 'D:/OneDrive/PhD Research/Code Repositories/ahc-clustering/data'


def load_batch(data_dir, name):
    """Read one dataset stored in the sub-folder ``name`` of ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory that holds one sub-folder per dataset.
    name : str
        Sub-folder name; it must contain ``tab.npz`` and ``tab.genes.txt``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The dense matrix obtained from ``tab.npz`` and the array of
        whitespace-separated tokens read from ``tab.genes.txt``.

    Raises
    ------
    ValueError
        If the number of gene names differs from the number of matrix columns.
    """
    batch_dir = data_dir + '/' + name
    counts = scipy.sparse.load_npz(batch_dir + '/tab.npz').toarray()
    with open(batch_dir + '/tab.genes.txt') as f:
        genes = np.array(f.read().rstrip().split())
    # The matrix is transposed later and the gene list is attached to the
    # resulting rows, so the gene count has to match the column count here.
    # Failing now gives a clearer error than a pandas shape mismatch later.
    if counts.shape[1] != len(genes):
        raise ValueError(
            "Dataset '%s': %d gene names but %d matrix columns"
            % (name, len(genes), counts.shape[1])
        )
    return counts, genes


X1, genes1 = load_batch(path, '293t')
X2, genes2 = load_batch(path, 'jurkat')
X3, genes3 = load_batch(path, 'jurkat_293t_50_50')

In [3]:
# Wrap each transposed count matrix in a DataFrame; after the transpose the
# rows line up with the gene lists loaded for the same dataset.
df1, df2, df3 = (pd.DataFrame(counts.T) for counts in (X1, X2, X3))

# One gene-metadata table per dataset, sharing the row index of its data frame.
genedata1, genedata2, genedata3 = (
    pd.DataFrame(names, index = frame.index, columns = ['gene_names'])
    for names, frame in ((genes1, df1), (genes2, df2), (genes3, df3))
)

# Bundle data and gene metadata into one SingleCell object per dataset.
bat1 = SingleCell(dataset = "293t", data = df1, genedata = genedata1)
bat2 = SingleCell(dataset = "jurkat", data = df2, genedata = genedata2)
bat3 = SingleCell(dataset = "jurkat_293t_50_50", data = df3, genedata = genedata3)

# Form batches
batches = [bat1, bat2, bat3]

In [4]:
# Filter Genes
# Every batch is gene-filtered and then cosine-normalized, and the result is
# stored back in the same position of `batches`. LogFilter is not applied in
# this cell.
# NOTE(review): the list is updated in place, so re-running this cell would
# presumably filter/normalize the already-processed batches again.
for idx, batch in enumerate(batches):
    n_cells = batch.dim[1]  # presumably the number of cells in the batch — TODO confirm
    filtered = GeneFilter(batch, min_cells = 1, max_cells = n_cells)
    batches[idx] = FeatureNormalize(filtered, 'cosine')


Applying Gene Filter . . .
Number of features remaining after gene filtering:  18760
Applying Gene Filter . . .
Number of features remaining after gene filtering:  17753
Applying Gene Filter . . .
Number of features remaining after gene filtering:  19536


In [5]:
# Combine the three batches into one object, using the 'gene_names' column of
# each batch's gene metadata as the gene identifier (one entry per batch).
# Note that `batches` is rebound here: it was a list of batches and becomes
# the single integrated object returned by IntegrateBatches.
batches = IntegrateBatches(
    batches,
    name_by = ['gene_names'] * 3,
)

# Print a summary of the integrated dataset.
batches.print()

Integrating Batches . . .
Number of common genes in all batches:  15461
Merging Batches . . .
------------------------------------------------------------------------------
Dataset:  293t+jurkat+jurkat_293t_50_50
------------------------------------------------------------------------------
Dimension:  (15461, 9530)
Cell Metadata:  ['cell_names' 'batch']
Gene Metadata:  ['gene_names']
------------------------------------------------------------------------------


In [6]:
# Reduce the integrated data to 100 principal components; the original
# `batches` object is kept and the reduced one is stored as `batches_red`.
batches_red = PCA(
    batches,
    n_comp = 100,
)

# Print a summary of the reduced dataset.
batches_red.print()

------------------------------------------------------------------------------
Dataset:  293t+jurkat+jurkat_293t_50_50_reduced
------------------------------------------------------------------------------
Dimension:  (100, 9530)
Cell Metadata:  ['cell_names' 'batch']
Gene Metadata:  ['principal_components']
------------------------------------------------------------------------------


In [None]:
# t-SNE view of the PCA-reduced data before CorrectBatches is applied;
# both colour and marker shape encode the 'batch' cell-metadata column.
# NOTE(review): no random seed is passed here, so the layout may differ
# between runs — confirm whether tSNEPlot exposes a seed option.
fig1 = tSNEPlot(
    batches_red,
    color_by = 'batch',
    marker_by = 'batch',
    marker_size = 2,
    tsne_perplexity = 600,
    tsne_iterations = 400,
)
fig1.show()

In [None]:
# Apply batch correction to the PCA-reduced data and rebind `batches_red`
# to the result.
# NOTE(review): "jurkat+293t" is not one of the dataset names assigned when the
# batches were built ("293t", "jurkat", "jurkat_293t_50_50") — confirm that
# CorrectBatches accepts this combined label in `correct_order`.
# NOTE(review): because the input name is reused for the output, re-running
# this cell would presumably correct already-corrected data.
batches_red = CorrectBatches(
    batches_red,
    correct_order = ["jurkat+293t", "jurkat_293t_50_50"],
    sigma = 15,
    svd_dim = 2,
)

In [None]:
# Print the summary of `batches_red` as it stands after the CorrectBatches cell.
batches_red.print()

In [None]:
# t-SNE view of `batches_red` after the CorrectBatches cell, drawn with the
# same plot settings as the earlier figure so the two can be compared.
# NOTE(review): no random seed is passed here, so the layout may differ
# between runs — confirm whether tSNEPlot exposes a seed option.
fig2 = tSNEPlot(
    batches_red,
    color_by = 'batch',
    marker_by = 'batch',
    marker_size = 2,
    tsne_perplexity = 600,
    tsne_iterations = 400,
)
fig2.show()