In [1]:
import numpy as np
import scanorama
import scipy.sparse as sparse
import pandas as pd



In [20]:
# Number of samples in each batch, listed in the order the samples appear
# as columns of the expression matrix.
odd_batches = [10, 9, 9, 8, 8, 7]
# Derived from the list so the batch count can never disagree with it.
num_batches = len(odd_batches)
# Interior boundaries between consecutive batches (the final cumulative
# total is dropped because it is the end of the data, not a cut point).
odd_cutpoints = np.cumsum(odd_batches)[:-1]

# Log-transformed expression matrix, genes (rows) x samples (columns).
log_maqc = np.loadtxt("data/scanorama/odd_log_maqc.tsv")
# Splitting at the cut points would silently yield a wrong-sized last batch
# if the declared sizes did not cover every sample column, so fail loudly.
assert log_maqc.shape[1] == sum(odd_batches), (
    "batch sizes sum to {} but the matrix has {} sample columns".format(
        sum(odd_batches), log_maqc.shape[1]
    )
)
print(log_maqc[:5, :5])
print(log_maqc.shape)

[[10.4429197  10.0894816  10.29571872 10.35757347 10.21380008]
 [10.72762168 10.59069949 10.57948343 10.76824546 10.61812718]
 [ 7.00130038  7.06207895  6.61484286  0.          6.58594095]
 [ 8.97386004  8.79325509  8.88049813  8.91367007  8.8296496 ]
 [ 0.          0.          0.          0.          0.        ]]
(54675, 51)


In [23]:
# Re-orient the matrix to samples x genes, then cut it along the sample axis
# at the batch boundaries so that each batch becomes its own array.
# (For equally sized batches, passing num_batches in place of odd_cutpoints
# would split the samples into that many equal parts instead.)
samples_by_genes = log_maqc.T
list_arr = np.split(samples_by_genes, odd_cutpoints, axis=0)
for batch_arr in list_arr:
    print(batch_arr.shape)

(10, 54675)
(9, 54675)
(9, 54675)
(8, 54675)
(8, 54675)
(7, 54675)


In [22]:
# Read the probeset identifiers, one per line, dropping surrounding whitespace.
genes_path = "data/scanorama/probeset_names.txt"
with open(genes_path, "r") as handle:
    list_genes = [line.strip() for line in handle]

# One gene list per batch: every entry is the same list object, since all
# batches are slices of the same matrix and share its probesets.
list_list_genes = [list_genes for _ in range(num_batches)]
print(len(list_list_genes))

6


In [27]:
# Batch correction with scanorama. The call returns one corrected matrix per
# input batch plus the gene names labelling the columns of those matrices.
#
# Alternatives kept for reference:
#   Integration only (returns SVD embeddings):
#     integrated, genes = scanorama.integrate(list_arr, list_list_genes)
#   Integration and batch correction together:
#     integrated, corrected, genes = scanorama.correct(
#         list_arr, list_list_genes, return_dimred=True)

# Value passed as scanorama's `knn` argument (presumably the number of
# nearest neighbours used when matching cells across batches -- see the
# scanorama documentation).
n_neighbors = 5
list_corrected_arr, gene_colnames = scanorama.correct(
    list_arr,
    list_list_genes,
    knn=n_neighbors,
)

Found 54675 genes among all datasets
[[0.         0.88888889 0.         0.875      0.125      0.6       ]
 [0.         0.         0.77777778 0.22222222 0.44444444 0.57142857]
 [0.         0.         0.         0.66666667 0.875      0.28571429]
 [0.         0.         0.         0.         0.25       0.42857143]
 [0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.        ]]
Processing datasets (0, 1)
Processing datasets (2, 4)
Processing datasets (0, 3)
Processing datasets (1, 2)
Processing datasets (2, 3)
Processing datasets (0, 5)
Processing datasets (1, 5)
Processing datasets (1, 4)
Processing datasets (3, 5)
Processing datasets (2, 5)
Processing datasets (3, 4)
Processing datasets (1, 3)
Processing datasets (0, 4)


In [28]:
# Stack the per-batch corrected matrices (samples x genes) on top of each
# other so that all samples sit in a single sparse matrix.
corrected_arr = sparse.vstack(list_corrected_arr)
# Flip to genes x samples and replace negative values with 0.
# .T and .maximum() are called on the instance (not through the csr_matrix
# class), so they work whatever sparse format vstack returned, and
# .maximum(0) builds a new matrix instead of mutating one in place, which
# keeps this cell safe to re-run.
corrected_arr1 = corrected_arr.T.maximum(0)
print(corrected_arr1.shape)
# Label the rows with the gene names returned by scanorama.
corrected_df = pd.DataFrame(corrected_arr1.toarray(), index=gene_colnames)
# Rows of genes are not ordered the same way as the input.
# reindex() silently inserts all-NaN rows for labels it cannot find, so make
# sure the reordering below is a pure reordering before relying on it.
missing_genes = pd.Index(list_genes).difference(corrected_df.index)
assert missing_genes.empty, (
    "{} input genes are missing from the scanorama output, e.g. {}".format(
        len(missing_genes), list(missing_genes[:5])
    )
)
# Sort rows according to the initial row names.
corrected_df1 = corrected_df.reindex(list_genes)
print(corrected_df1.iloc[:5, :5])

(54675, 51)
                  0         1         2         3         4
1007_s_at  0.006896  0.006615  0.006770  0.006811  0.006735
1053_at    0.006670  0.006488  0.006540  0.006646  0.006558
117_at     0.006690  0.006808  0.006428  0.002282  0.006499
121_at     0.005744  0.005572  0.005656  0.005675  0.005634
1255_g_at  0.000024  0.000095  0.000039  0.000064  0.000097


In [29]:
# Save the gene-ordered corrected matrix as tab-separated text. The gene
# names (the index) are written as the first column; no header row is
# written.
output_path = "data/scanorama/scanorama_data_odd_k5.tsv"
corrected_df1.to_csv(output_path, sep="\t", header=False)