In [1]:
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
import pandas as pd

import sys
sys.path.append('../')
from src import *

In [13]:
# --- Input location -------------------------------------------------------
# Root directory of the benchmark data (machine-specific absolute path).
PATH_DATA = "/home/disij/projects/acdc/data/"

# --- Output locations -----------------------------------------------------
# Everything this notebook writes lives directly under OUTPUT_DIR
# (machine-specific absolute path).
OUTPUT_DIR = "/extra/disij0/data/flow_cytometry/flowMP_output/"

# Destination handed to write_chains_to_file for the accepted MCMC samples.
PATH_SAMPLES = OUTPUT_DIR + "AML_accepted_samples"

# Gzipped CSV of the per-cell predictions.
FILENAME_PREDICTIONS = OUTPUT_DIR + "AML_predictions.csv.gz"

# Gzipped CSV path for predictions made with the corrected table.
FILENAME_PREDICTIONS_CORRECTED_TABLE = (
    OUTPUT_DIR + "AML_predictions_corrected_table.csv.gz"
)

Load the AML dataset from the [ACDC paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5447237/pdf/btx054.pdf).

In [3]:
# Read the AML benchmark events and the accompanying table from disk.

### LOAD DATA ###
path = PATH_DATA + 'AML_benchmark/'

# Gzipped CSV of events; the first row holds the column names.
df = pd.read_csv(
    path + 'AML_benchmark.csv.gz',
    sep=',',
    header=0,
    compression='gzip',
    engine='python'
)

# Table CSV; its first column becomes the row index.
table = pd.read_csv(
    path + 'AML_table.csv',
    sep=',',
    header=0,
    index_col=0
)

### PROCESS: discard ungated events ###
df = df[df.cell_type != 'NotGated']

# Remove the columns that are not used as measurement channels.
df = df.drop(['Time', 'Cell_length','file_number', 'event_number', 'DNA1(Ir191)Di',
              'DNA2(Ir193)Di', 'Viability(Pt195)Di', 'subject'], axis = 1)

# The renaming below treats every column except the last as a channel and
# labels the last one 'cell_type'; fail loudly if the layout is different
# instead of silently mislabelling a column.
assert df.columns[-1] == 'cell_type', \
    "expected 'cell_type' to be the last remaining column, got %r" % (df.columns[-1],)

# Channel name = column name up to (not including) the first '('.
# split() leaves a name without '(' untouched, whereas slicing with
# item.find('(') would drop its last character (find returns -1).
channels = [item.split('(', 1)[0] for item in df.columns[:-1]]
df.columns = channels + ['cell_type']
df = df.loc[df['cell_type'] != 'NotDebrisSinglets']

# Missing table entries are replaced by 0.
table = table.fillna(0)
# Raw channel values as a 2-D array (rows = events, columns = channels).
X = df[channels].values
# Snapshot of the table's column names at this point.
table_headers = list(table)

### transform data
# arcsinh transform of the shifted and scaled raw values.
data = np.arcsinh((X - 1.) / 5.)

# N = number of events (rows), d = number of channels (columns).
N, d = data.shape

# Per-channel [min, max] of the transformed data, shape (d, 2).
# Computed column-wise in one shot: the previous list comprehension reused
# `d` as its loop variable, which under Python 2 leaks out of the
# comprehension and left `d` equal to (number of channels - 1).
emp_bounds = np.column_stack((data.min(axis=0), data.max(axis=0)))

# Two-way mapping between the table's row labels and their positions.
ct2idx = {cell_type: position for position, cell_type in enumerate(table.index)}
idx2ct = list(table.index)

# Integer label of every event; raises KeyError if an event's cell_type has
# no row in the table. The loop variable is not named `_` so IPython's
# last-output variable is not overwritten under Python 2.
Y = np.array([ct2idx[label] for label in df.cell_type])

# rename table header 'HLA-DR' to 'HLADR' to prevent error from '-'
# The header is located by name rather than by a hard-coded position, so a
# change in the table's column order cannot rename the wrong column; a
# missing 'HLA-DR' header raises ValueError instead of passing silently.
temp_headers = list(table)
temp_headers[temp_headers.index("HLA-DR")] = "HLADR"
table.columns = temp_headers

Learn MP trees and write the accepted samples to file.

In [5]:
%%time


###################### Parallel run #####################
# n_mcmc_chain = 50
# n_mcmc_samples = 3000
# chains = range(n_mcmc_chain)
# num_cores = multiprocessing.cpu_count()
# accepted_MP = Parallel(n_jobs=num_cores)(delayed(MP_mcmc)\
#                 (data, emp_bounds, table, i, n_mcmc_samples) for i in chains)
# write_chains_to_file(accepted_MP, PATH_SAMPLES)



n_mcmc_chain = 50
n_mcmc_samples = 3000
accepted_MP = []
for i in range(n_mcmc_chain):
    print "Sampling Chain %d..." % i
    accepted_MP.append(MP_mcmc(data, emp_bounds, table, i, n_mcmc_samples))
    burnt_samples = [sample for chain in accepted_MP for sample in chain[-20:]]   
    Y_predict = classify_cells_majority(data, burnt_samples, table, ct2idx)
    accuracy = sum(Y == Y_predict)*1.0/ N
    print "Accuracy of cell classification on all data: %.3f" % (accuracy)

write_chains_to_file(accepted_MP, PATH_SAMPLES)

Sampling Chain 0...
Accuracy of cell classification on all data: 0.936
Sampling Chain 1...
Accuracy of cell classification on all data: 0.925
Sampling Chain 2...
Accuracy of cell classification on all data: 0.921
Sampling Chain 3...
Accuracy of cell classification on all data: 0.910
Sampling Chain 4...
Accuracy of cell classification on all data: 0.924
Sampling Chain 5...
Accuracy of cell classification on all data: 0.927
Sampling Chain 6...
Accuracy of cell classification on all data: 0.930
Sampling Chain 7...
Accuracy of cell classification on all data: 0.927
Sampling Chain 8...
Accuracy of cell classification on all data: 0.929
Sampling Chain 9...
Accuracy of cell classification on all data: 0.927
Sampling Chain 10...
Accuracy of cell classification on all data: 0.927
Sampling Chain 11...
Accuracy of cell classification on all data: 0.926
Sampling Chain 12...
Accuracy of cell classification on all data: 0.921
Sampling Chain 13...
Accuracy of cell classification on all data: 0.923
Sa

Classify cells based on the accepted MP trees and write the predictions to file.

In [15]:
# Final classification: pool the last 10 accepted samples of every chain
# (note: the per-chain progress check during sampling used the last 20).
burnt_samples = []
for chain in accepted_MP:
    burnt_samples.extend(chain[-10:])

Y_predict = classify_cells_majority(data, burnt_samples, table, ct2idx)

# Fraction of events whose predicted label equals the reference label.
accuracy = np.mean(Y == Y_predict)
print("Accuracy of cell classification: %.3f" % accuracy)

# Translate predicted label indices back to cell-type names, attach them to
# the events row by row, and write the frame as a gzipped CSV.
predicted_names = [idx2ct[i] for i in Y_predict]
df['MP_prediction'] = pd.Series(predicted_names, index=df.index)
df.to_csv(FILENAME_PREDICTIONS, compression='gzip', index=False)

Accuracy of cell classification: 0.919


In [7]:
# Number of accepted samples in each chain.
print(list(map(len, accepted_MP)))

[70, 77, 49, 48, 55, 67, 50, 63, 50, 68, 55, 47, 53, 58, 62, 42, 100, 51, 69, 51, 88, 65, 56, 41, 55, 63, 48, 89, 24, 49, 55, 87, 51, 59, 86, 60, 46, 89, 63, 56, 50, 49, 66, 43, 66, 60, 42, 57, 80, 62]


In [14]:
# compute accuracy of each sample on average