In [1]:
import multiprocessing as mp
import itertools as it
import functools as ft
import pickle
import sys
import numpy as np
import pandas as pd
import time
import sklearn
import sklearn.preprocessing as pre
import scipy.sparse as sp

In [2]:
dat = pd.read_csv('./cord_blood_kinases.csv', sep=',', header=0, index_col=0);

  mask |= (ar1 == a)


In [3]:
def get_cell_sets(row, oe_csr):
    return oe_csr[row['lower']:row['upper']].sum(axis=0)

def first_candidates(cells, cell_sets, min_shared_cells):
    count_filter = cell_sets.apply(len) > min_shared_cells
    return list(map(lambda x: frozenset([x]), ((cells[count_filter])))), {frozenset([x]):y for x,y in cell_sets[count_filter].to_dict().items()}

def intersector(tuple_of_candidates, tuple_of_sets):
    return ft.reduce(lambda x,y: x.union(y), tuple_of_candidates), tuple_of_sets[0] & tuple_of_sets[1]

def cell_set_getter(input_list, cell_sets):
    for i in input_list:
        yield cell_sets[i]

def pickle_cells(cells, cell_sets, k):
    '''These files are gonna be decently big. Do not want to keep them in memory.'''
    with open('cell_' + str(k) + '.pickle', 'wb') as f:
        pickle.dump(cells, f, pickle.HIGHEST_PROTOCOL)
    with open('cell_sets_' + str(k) + '.pickle', 'wb') as f:
        pickle.dump(cell_sets, f, pickle.HIGHEST_PROTOCOL)

In [4]:
def fast_gather_gene_sets(dat, min_shared_cells = 100, min_percent_cells = None, max_cluster_size = sys.maxsize):
    st = time.time()
    begin = st
    cores = max(mp.cpu_count()-1, 1)
    
    total_cells = dat['barcode'].nunique()
    
    if(min_percent_cells is not None):
        min_shared_cells = int(min_percent_cells * total_cells)

    cell_id_dict = {y:x for x,y in enumerate(dat['symbol'].unique())}
    dat['symbol'] = dat['symbol'].map(cell_id_dict)
    cells = dat['symbol'].unique()
    
    barcode_id_dict = {y:x for x,y in enumerate(dat['barcode'].unique())}
    dat['barcode'] = dat['barcode'].map(barcode_id_dict)
    
    cell_sets = dat.groupby('symbol')['barcode'].apply(set)
    
    en = time.time()
    
    print('Formatted data in ' + str(en-st) + ' seconds')
    
    cells, cell_sets = first_candidates(cells, cell_sets, min_shared_cells)
    
    print(str(len(cells)) + ' genes made have > ' + str(min_shared_cells) + ' cells')
    
    k = 2
    n = len(cells)
    
    pickle_cells(cells, cell_sets, k)
    
    while(len(cells) > 0 and k < max_cluster_size):
        st = time.time()
        
        candidates_left, candidates_right = zip(*list(filter(lambda x: len(x[0]|x[1]) == k, it.combinations(cells, 2))))
        left_gen = cell_set_getter(candidates_left, cell_sets)
        right_gen = cell_set_getter(candidates_right, cell_sets)        
        gener = ((x, y) for x, y in map(intersector, *(zip(candidates_left, candidates_right),zip(left_gen, right_gen))) if len(y)>min_shared_cells)

        try:
            cells, cell_sets = zip(*gener)
            cell_sets = dict(zip(cells, cell_sets))
            cells = list(cells)
        except:
            print('Ran out of candidates! Last size: ' + str(k))
            print('Total time: ' + str(en - begin) + ' seconds')
        
        k+= 1
        n = len(cells)
        
        en = time.time()
        
        print('Found ' + str(n) + ' remaining gene clusters with > ' + str(min_shared_cells) + ' of size: ' +str(k-1))
        print('Iteration took: ' + str(en-st) + ' seconds')
        
        if(n == 0):
            print('Terminated! Total run time: ' + str(en - begin) + ' seconds')
        else:
            print('Pickling!')
            pickle_cells(cells, cell_sets, k-1)
        

In [5]:
fast_gather_gene_sets(dat, min_percent_cells = 0.04)

Formatted data in 3.1110219955444336 seconds
156 genes made have > 10953 cells
Found 515 remaining genes with > 10953 of size: 2
Iteration took: 8.20874810218811 seconds
Pickling!
Found 807 remaining genes with > 10953 of size: 3
Iteration took: 12.141279935836792 seconds
Pickling!
Found 2052 remaining genes with > 10953 of size: 4
Iteration took: 23.532851219177246 seconds
Pickling!


ValueError: not enough values to unpack (expected 2, got 0)