In [1]:
from joblib import Parallel, delayed
import multiprocessing
import numpy as np
import pandas as pd

import sys
sys.path.append('../')
from src import *

In [13]:
# Input location: the benchmark files are read from a sub-folder of PATH_DATA.
PATH_DATA = '/home/disij/projects/acdc/data/'

# Output location: both files written by this notebook live directly under
# OUTPUT_DIR (which must keep its trailing slash, since names are appended).
OUTPUT_DIR = "/extra/disij0/data/flow_cytometry/flowMP_output/"
PATH_SAMPLES = "{}AML_corrected_table_accepted_samples".format(OUTPUT_DIR)
FILENAME_PREDICTIONS = "{}AML_corrected_table_predictions.csv.gz".format(OUTPUT_DIR)

Load the AML dataset from the [ACDC paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5447237/pdf/btx054.pdf).

In [3]:
# load AML data and table

### LOAD DATA ###
path = PATH_DATA + 'AML_benchmark/'
df = pd.read_csv(path + 'AML_benchmark.csv.gz', sep=',', header=0,
                 compression='gzip', engine='python')
table = pd.read_csv(path + 'AML_table.csv', sep=',', header=0, index_col=0)

### PROCESS: discard ungated events ###
df = df[df.cell_type != 'NotGated']
df = df.drop(['Time', 'Cell_length','file_number', 'event_number', 'DNA1(Ir191)Di',
              'DNA2(Ir193)Di', 'Viability(Pt195)Di', 'subject'], axis = 1)
# Keep only the text before the first '(' of every channel column.
# split() leaves a name without '(' untouched, whereas slicing with find()'s
# -1 result would silently drop its last character.
# NOTE(review): assumes 'cell_type' is the last remaining column -- confirm.
channels = [item.split('(')[0] for item in df.columns[:-1]]
df.columns = channels + ['cell_type']
# Explicit copy: a later cell adds a column to `df`; copying here makes `df`
# an independent frame and avoids pandas' SettingWithCopyWarning.
df = df.loc[df['cell_type'] != 'NotDebrisSinglets'].copy()

table = table.fillna(0)
X = df[channels].values
# Captured BEFORE the header rename at the end of this cell, so it still holds
# the original header names.
table_headers = list(table)

### transform data
data = np.arcsinh((X-1.)/5.)
N, d = data.shape
# Per-channel [min, max], one row per column of `data`. Computed with
# vectorised reductions instead of a list comprehension over `d`: under
# Python 2 the comprehension variable leaks and would overwrite the `d`
# unpacked from data.shape just above.
emp_bounds = np.column_stack((data.min(axis=0), data.max(axis=0)))
# Cell-type name <-> integer index, following the row order of the table.
ct2idx = {x:i for i,x in enumerate(table.index)}
idx2ct = list(table.index)
# Integer label of every event; a cell type missing from the table raises
# KeyError. A named loop variable is used rather than `_`, which IPython
# reserves for the previous cell output.
Y = np.array([ct2idx[cell_type] for cell_type in df.cell_type])

# rename table header 'HLA-DR' to 'HLADR' to prevent error from '-'
# NOTE(review): the column is addressed by position (29); confirm this is
# still 'HLA-DR' if the table layout ever changes.
temp_headers = list(table)
temp_headers[29] = "HLADR"
table.columns = temp_headers

There is a mistake in the prior information table. Here I change the response of Mature B cells to marker CD38 from "0" to "-1.0", classify cells based on the corrected table, and write the predictions to file.

In [4]:
# Prior-table correction: set the response of 'Mature B cells' to marker
# 'CD38' to -1.0.
table.loc['Mature B cells', 'CD38'] = -1.0

Learn MP trees and write accepted samples to file...

In [6]:
%%time

# Parallel variant (disabled): runs one MP_mcmc call per chain through joblib
# instead of the sequential loop below, without the per-chain progress report.
# n_mcmc_chain = 50
# n_mcmc_samples = 3000
# chains = range(n_mcmc_chain)
# num_cores = multiprocessing.cpu_count()
# accepted_MP = Parallel(n_jobs=num_cores)(delayed(MP_mcmc)\
#                 (data, emp_bounds, table, i, n_mcmc_samples) for i in chains)
# write_chains_to_file(accepted_MP, PATH_SAMPLES)


n_mcmc_chain = 50       # number of MCMC chains to run
n_mcmc_samples = 3000   # forwarded to MP_mcmc for every chain
n_tail_samples = 20     # trailing samples kept per chain for the progress report
accepted_MP = []
for i in range(n_mcmc_chain):
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print("Sampling Chain %d..." % i)
    # The chain index is handed to MP_mcmc as its fourth argument
    # (presumably a chain id / seed -- confirm against src).
    accepted_MP.append(MP_mcmc(data, emp_bounds, table, i, n_mcmc_samples))

    # Progress report: pool the tail of every chain finished so far and
    # measure how well the pooled samples classify the full data set.
    burnt_samples = [sample for chain in accepted_MP
                     for sample in chain[-n_tail_samples:]]
    Y_predict = classify_cells_majority(data, burnt_samples, table, ct2idx)
    # Fraction of events whose predicted index equals the reference label.
    accuracy = np.mean(Y == Y_predict)
    print("Accuracy of cell classification on all data: %.3f" % (accuracy))

# Written only once, after every chain has finished.
write_chains_to_file(accepted_MP, PATH_SAMPLES)

Sampling Chain 0...
Accuracy of cell classification on all data: 0.963
Sampling Chain 1...
Accuracy of cell classification on all data: 0.958
Sampling Chain 2...
Accuracy of cell classification on all data: 0.966
Sampling Chain 3...
Accuracy of cell classification on all data: 0.944
Sampling Chain 4...
Accuracy of cell classification on all data: 0.979
Sampling Chain 5...
Accuracy of cell classification on all data: 0.983
Sampling Chain 6...
Accuracy of cell classification on all data: 0.983
Sampling Chain 7...
Accuracy of cell classification on all data: 0.969
Sampling Chain 8...
Accuracy of cell classification on all data: 0.971
Sampling Chain 9...
Accuracy of cell classification on all data: 0.976
Sampling Chain 10...
Accuracy of cell classification on all data: 0.979
Sampling Chain 11...
Accuracy of cell classification on all data: 0.972
Sampling Chain 12...
Accuracy of cell classification on all data: 0.970
Sampling Chain 13...
Accuracy of cell classification on all data: 0.972
Sa

Classify cells based on accepted MP trees, and write predictions to file...

In [15]:
# Pool the last 10 accepted samples of every chain and classify all events
# with classify_cells_majority.
# NOTE(review): the progress report in the sampling cell keeps the last 20
# samples per chain, this cell keeps 10 -- confirm which window is intended.
burnt_samples = [sample for chain in accepted_MP for sample in chain[-10:]]
Y_predict = classify_cells_majority(data, burnt_samples, table, ct2idx)
# Fraction of events whose predicted index equals the reference label.
accuracy = np.mean(Y == Y_predict)
# The samples are pooled across all chains, so the message no longer claims
# to describe "Chain 1"; it uses the same wording as the sampling cell.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Accuracy of cell classification on all data: %.3f" % accuracy)

# Store the predicted cell-type name (row label of the prior table) next to
# every event, aligned on df's index, then write the whole frame to disk.
df['MP_prediction'] = pd.Series([table.index[label_idx] for label_idx in Y_predict],
                                index=df.index)
df.to_csv(FILENAME_PREDICTIONS, compression='gzip', index = False)

Chain  1 accuracy on data: 0.969


In [14]:
# compute accuracy of each sample on average