In [1]:
import h5py
import scipy
import numpy as np
import os
import zlib
import msgpack
from sklearn.utils import sparsefuncs

def read_compressed_json(json_path):
	"""Load a zlib-compressed, msgpack-encoded file into a Python object.

	Returns an empty dict when ``json_path`` is not an existing regular file.
	Otherwise the file's bytes are inflated with ``zlib.decompress`` and
	decoded with ``msgpack.unpackb`` (``strict_map_key=False`` permits map
	keys that are not strings/bytes), and the decoded object is returned.
	"""
	if os.path.isfile(json_path):
		with open(json_path, 'rb') as handle:
			payload = zlib.decompress(handle.read())
			return msgpack.unpackb(payload, strict_map_key=False)
	# Missing path (or not a regular file): callers get an empty mapping.
	return {}

In [2]:
# Notebook-wide configuration.
# NOTE(review): read_compressed_json returns {} when the file is missing, in
# which case the 'n_genes' lookup below raises KeyError.
GENES_ANNOTATION = read_compressed_json('/home/ub-sonvo-25d094476064960/genes_annotation.json.gz')
N_GENES = GENES_ANNOTATION['n_genes']
# Not referenced by any other cell visible in this notebook.
DEFAULT_SIZE_FACTOR = 10000
ROOT_DIR = '/home/ub-sonvo-25d094476064960/celltype_prediction'
# One entry per line of the studies file; the context manager closes the file
# handle deterministically instead of leaving it to garbage collection.
with open(ROOT_DIR + '/studies_idx_HVG.txt') as _studies_file:
    TRAINED_STUDIES = _studies_file.read().splitlines()

In [14]:
# Load the three arrays of the unfiltered sparse matrix (row pointers, column
# indices, values) that the filtering loop below walks row by row.
indptr, indices, data = (
    np.load('trained_data/indptr.npy'),
    np.load('trained_data/indices.npy'),
    np.load('trained_data/data.npy'),
)

In [3]:
# Per-gene selector array; positions whose entry equals True are the kept genes.
genes_bool = np.load('trained_data/genes_bool.npy')
# Original gene indices of the kept genes, in ascending order.
genes_bool_idx = (genes_bool == True).nonzero()[0]
# original gene index -> position among the kept genes
genes_bool_idx_dict = dict(zip(genes_bool_idx, range(len(genes_bool_idx))))

In [40]:
# Walk the CSR arrays one row at a time and keep only the entries whose column
# (gene) is selected by genes_bool. The outputs are a running row-pointer list
# plus one array of kept indices / kept values per row.
filtered_indptr, filtered_indices, filtered_data = [0], [], []
for row, (lo, hi) in enumerate(zip(indptr[:-1], indptr[1:])):
    # Progress marker every million rows.
    if row % 1000000 == 0:
        print(row)
    row_genes = indices[lo:hi]
    keep = genes_bool[row_genes]
    filtered_indices.append(row_genes[keep])
    filtered_data.append(data[lo:hi][keep])
    # The next row pointer is the previous one plus the number of kept entries.
    filtered_indptr.append(filtered_indptr[-1] + np.sum(keep))

0
1000000
2000000
3000000
4000000
5000000


In [42]:
# Flatten the per-row results into contiguous arrays with compact dtypes.
filtered_indptr_2 = np.array(filtered_indptr).astype(np.uint64)

# astype(np.uint16) wraps silently for values above 65535, which would corrupt
# the gene indices without any error. Verify the range before narrowing.
_concatenated_indices = np.concatenate(filtered_indices)
_uint16_max = np.iinfo(np.uint16).max
if _concatenated_indices.size and _concatenated_indices.max() > _uint16_max:
    raise ValueError(
        'gene index %d does not fit in uint16 (max %d); use a wider dtype'
        % (int(_concatenated_indices.max()), _uint16_max)
    )
original_filtered_indices = _concatenated_indices.astype(np.uint16)
# Drop the wide intermediate so it does not stay alive in the kernel.
del _concatenated_indices

filtered_data_2 = np.concatenate(filtered_data).astype(np.float32)

In [46]:
# Persist the filtered CSR arrays so later sessions can reload them instead of
# re-running the filtering loop.
for _target_path, _array in (
    ('trained_data/filtered_indptr.npy', filtered_indptr_2),
    ('trained_data/original_filtered_indices.npy', original_filtered_indices),
    ('trained_data/filtered_data.npy', filtered_data_2),
):
    np.save(_target_path, _array)

In [10]:
from numba import jit

@jit(nopython=True)
def go_loop(arr, genes_bool_arr):
    """Translate each value of ``arr`` into its position within ``genes_bool_arr``.

    Parameters
    ----------
    arr : 1-D array of values to translate (``arr.shape[0]`` elements are read).
    genes_bool_arr : array whose elements become the lookup keys; each key maps
        to the position at which it occurs in this array.

    Returns
    -------
    A ``np.uint32`` array with ``arr.shape[0]`` elements where element ``i`` is
    the position of ``arr[i]`` within ``genes_bool_arr``.

    NOTE(review): every value of ``arr`` must occur in ``genes_bool_arr``; a
    value without an entry in the lookup dict cannot be translated.
    """
    # value -> position within genes_bool_arr
    dct = {i: j for j, i in enumerate(genes_bool_arr)}
    new_arr = np.zeros(shape=arr.shape[0], dtype=np.uint32)
    for i in range(len(new_arr)):
        new_arr[i] = dct[arr[i]]
    return new_arr

In [12]:
# Remap the original gene indices to their position within genes_bool_idx, so
# column numbers run over the kept genes only.
filtered_indices = go_loop(original_filtered_indices, genes_bool_idx)

In [13]:
# Display the shape of the remapped index array (one entry per stored non-zero).
filtered_indices.shape

(7231077788,)

In [14]:
# Persist the remapped column indices; this file is reloaded further down
# together with filtered_indptr.npy and filtered_data.npy.
np.save('trained_data/filtered_indices.npy', filtered_indices)

In [3]:
# Reload the saved filtered CSR arrays (row pointers, remapped column indices,
# values) from disk.
filtered_indptr, filtered_indices, filtered_data = (
    np.load('trained_data/filtered_indptr.npy'),
    np.load('trained_data/filtered_indices.npy'),
    np.load('trained_data/filtered_data.npy'),
)

In [4]:
# Assemble the filtered sparse matrix from its (data, indices, indptr) triplet.
# The shape is fixed explicitly: 5,033,871 rows by 12,491 columns.
final_matrix = scipy.sparse.csr_matrix(
    (filtered_data, filtered_indices, filtered_indptr),
    shape=(5033871, 12491),
)

In [5]:
# Per-row labels used as the training target further down; presumably
# cell-type codes aligned with the matrix rows (TODO confirm). Entries equal to
# 0 are excluded before fitting.
meta = np.load('trained_data/trained_meta_celltypes.npy')

In [8]:
from sklearn import svm

In [10]:
# Support-vector classifier with scikit-learn's default hyper-parameters.
# NOTE(review): kernel SVC training cost grows much faster than linearly with
# the number of samples — confirm it is feasible for the ~3,000,000 rows
# passed to fit() below, or consider a linear/SGD-based alternative.
clf = svm.SVC()

In [12]:
# Number of rows whose label equals 0 (these rows are left out of training).
(meta == 0).sum()

111792

In [None]:
# Training subset: the first 3,000,000 rows whose label is non-zero.
# Truncating the row-index array *before* indexing the matrix selects exactly
# the same rows as indexing first and slicing afterwards, but avoids building
# an intermediate copy of every labelled row of the full matrix.
_labelled_rows = np.nonzero(meta != 0)[0][:3000000]
tmp_final_matrix = final_matrix[_labelled_rows, :]

In [None]:
# Narrow the CSR index arrays to 32-bit integers (presumably so the matrix is
# accepted by estimators that require 32-bit sparse indices — TODO confirm).
# The cast is only lossless when every row pointer (at most nnz) and every
# column index (at most n_columns - 1) fits in int32; otherwise astype would
# wrap around silently and corrupt the matrix, so fail loudly instead.
_int32_max = np.iinfo(np.int32).max
if tmp_final_matrix.nnz > _int32_max or tmp_final_matrix.shape[1] - 1 > _int32_max:
    raise ValueError(
        'cannot cast sparse indices to int32: nnz=%d, n_columns=%d, int32 max=%d; '
        'reduce the number of rows or keep 64-bit indices'
        % (tmp_final_matrix.nnz, tmp_final_matrix.shape[1], _int32_max)
    )
tmp_final_matrix.indptr = tmp_final_matrix.indptr.astype(np.int32)
tmp_final_matrix.indices = tmp_final_matrix.indices.astype(np.int32)

In [None]:
# Drop the names of the full-size matrix and its component arrays so the kernel
# can reclaim their memory before training.
del final_matrix, filtered_indptr, filtered_indices, filtered_data

In [None]:
# Labels for the training subset: non-zero labels only, truncated to the same
# first 3,000,000 entries as the rows of tmp_final_matrix.
_train_labels = meta[meta != 0][:3000000]
clf.fit(tmp_final_matrix, _train_labels)