In [1]:
import numpy as np
import scanorama
import scipy.sparse as sparse
import pandas as pd



In [2]:
# Alternative configurations kept for reference (not used below):
#   Equal-sized batches:      num_batches = 6
#   Odd-sized batches (MAQC): odd_batches = [10,9,9,8,8,7]
#                             odd_cutpoints = np.cumsum(odd_batches)[:-1]

# Odd-sized batches (ALL): size of each batch, in the order the batches appear
# along the columns of the expression matrix loaded below.
yeoh_batch_sizes = [39,75,42,31,43,33,27,55,60,56]
# Derive the batch count from the size list so the two can never drift apart.
num_batches = len(yeoh_batch_sizes)
# Interior cut points for np.split: the last cumulative sum (the total) is
# dropped because it marks the end of the array, not a boundary between batches.
yeoh_cutpoints = np.cumsum(yeoh_batch_sizes)[:-1]

batch_ordered_yeoh = np.loadtxt("data/scanorama/batch_ordered_yeoh.tsv")
# np.split does not check cut points against the array length, so a size list
# that does not add up to the column count would silently yield a wrong-sized
# final batch. Fail early instead.
if batch_ordered_yeoh.ndim != 2 or batch_ordered_yeoh.shape[1] != sum(yeoh_batch_sizes):
    raise ValueError(
        "Batch sizes sum to {} but the loaded matrix has shape {}".format(
            sum(yeoh_batch_sizes), batch_ordered_yeoh.shape
        )
    )
print(batch_ordered_yeoh[:5,:5])
print(batch_ordered_yeoh.shape)

[[ 8.26152286  0.          9.42421669 10.35506169 10.10555956]
 [ 8.74450262  9.91200667  8.6617906   8.99720332  9.43414071]
 [ 6.73627818  0.          0.          0.          0.        ]
 [ 9.39194145  8.40188899  8.54336183  9.5406224   8.06558934]
 [ 8.96269031  9.3977443   9.3566957  10.92016275 11.22297172]]
(15372, 461)


In [3]:
# Flip the matrix to samples x genes, then cut it along the sample axis at the
# batch boundaries (batches have unequal sizes, hence explicit cut points).
samples_by_genes = batch_ordered_yeoh.T
list_arr = np.split(samples_by_genes, yeoh_cutpoints, axis=0)
# One shape per line, in batch order.
print(*(batch_arr.shape for batch_arr in list_arr), sep="\n")

(39, 15372)
(75, 15372)
(42, 15372)
(31, 15372)
(43, 15372)
(33, 15372)
(27, 15372)
(55, 15372)
(60, 15372)
(56, 15372)


In [4]:
# Read the probeset identifiers, one per line, stripping surrounding whitespace.
probeset_path = "data/scanorama/yeoh-probesets.txt"
with open(probeset_path, "r") as handle:
    list_genes = [line.strip() for line in handle]

# Every batch shares the same gene list, so the same list object is
# supplied once per batch.
list_list_genes = [list_genes for _ in range(num_batches)]
print(len(list_list_genes))

10


In [12]:
# Batch correction. knn=5 replaces the library default of knn=20.
#
# Other entry points, not used here:
#   scanorama.integrate(list_arr, list_list_genes)
#       -> integrated, genes            (SVD embeddings only)
#   scanorama.correct(list_arr, list_list_genes, return_dimred=True)
#       -> integrated, corrected, genes (embeddings and corrected matrices)
correction_result = scanorama.correct(
    list_arr,
    list_list_genes,
    knn=5,
)
# Per-batch corrected matrices, plus the gene names labelling their columns.
list_corrected_arr, gene_colnames = correction_result

Found 15372 genes among all datasets
[[0.         0.69230769 0.28205128 0.16129032 0.         0.03030303
  0.         0.         0.         0.        ]
 [0.         0.         0.71428571 0.25806452 0.3255814  0.06060606
  0.18518519 0.01818182 0.05       0.        ]
 [0.         0.         0.         0.19354839 0.02380952 0.06060606
  0.11111111 0.         0.04761905 0.        ]
 [0.         0.         0.         0.         0.35483871 0.35483871
  0.03703704 0.35483871 0.16129032 0.51612903]
 [0.         0.         0.         0.         0.         0.1627907
  0.1627907  0.46511628 0.39534884 0.27906977]
 [0.         0.         0.         0.         0.         0.
  0.24242424 0.51515152 0.3030303  0.15151515]
 [0.         0.         0.         0.         0.         0.
  0.         0.66666667 0.40740741 0.03703704]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.71666667 0.18181818]
 [0.         0.         0.         0.         0.         0.
  0.    

In [13]:
# Stack the per-batch corrected matrices (each samples x genes) back into one
# matrix, then transpose to genes x samples to match the layout of the input.
corrected_arr = sparse.vstack(list_corrected_arr)
corrected_arr1 = corrected_arr.T
print(corrected_arr1.shape)

# Replace negative values with 0. The matrix has to be densified for the
# DataFrame anyway, so clamp the dense array instead of assigning through a
# boolean mask on the sparse matrix.
corrected_df = pd.DataFrame(
    np.maximum(corrected_arr1.toarray(), 0),
    # Name rownames of df with output list from scanorama
    index=gene_colnames,
)

# Rows of genes are not ordered the same way as the input. reindex() silently
# inserts all-NaN rows for labels it cannot find, which would end up in the
# exported table; raise instead if any input probeset is missing.
missing_genes = pd.Index(list_genes).difference(corrected_df.index)
if len(missing_genes) > 0:
    raise ValueError(
        "{} input genes are missing from the corrected output, e.g. {}".format(
            len(missing_genes), list(missing_genes[:5])
        )
    )
# Sort rows according to initial rownames
corrected_df1 = corrected_df.reindex(list_genes)
print(corrected_df1.iloc[:5,:5])

(15372, 461)
                  0         1         2         3         4
1007_s_at  0.008489  0.005319  0.012243  0.013029  0.012442
1053_at    0.007214  0.008268  0.007114  0.007296  0.007703
117_at     0.011889  0.005235  0.004805  0.005061  0.005381
121_at     0.008315  0.007772  0.007659  0.008449  0.007115
1294_at    0.008548  0.009549  0.009151  0.010372  0.010103


In [14]:
# Write the corrected table as tab-separated text. The row index (gene names)
# is written as the first column; the column header row is omitted.
output_path = "data/scanorama/yeoh-scanorama_k5.tsv"
corrected_df1.to_csv(output_path, sep="\t", header=False)

In [8]:
# Print the signature and docstring of scanorama.correct, as an interactive
# reference for the parameters (e.g. knn) passed in the batch-correction cell.
help(scanorama.correct)

Help on function correct in module scanorama.scanorama:

correct(datasets_full, genes_list, return_dimred=False, batch_size=5000, verbose=2, ds_names=None, dimred=100, approx=True, sigma=15, alpha=0.1, knn=20, return_dense=False, hvg=None, union=False, geosketch=False, geosketch_max=20000, seed=0)
    Integrate and batch correct a list of data sets.
    
    Parameters
    ----------
    datasets_full : `list` of `scipy.sparse.csr_matrix` or of `numpy.ndarray`
        Data sets to integrate and correct.
    genes_list: `list` of `list` of `string`
        List of genes for each data set.
    return_dimred: `bool`, optional (default: `False`)
        In addition to returning batch corrected matrices, also returns
        integrated low-dimesional embeddings.
    batch_size: `int`, optional (default: `5000`)
        The batch size used in the alignment vector computation. Useful when
        correcting very large (>100k samples) data sets. Set to large value
        that runs within avai