In [14]:
import os
import sys
import glob
import pickle
import itertools
import random

from IPython.display import Image

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib.colors import ListedColormap
from scipy.stats import multivariate_normal

import numpy as np
import pandas as pd
from scipy.stats import beta
from scipy.stats import norm
from sklearn import mixture


from flowMP import *

# Seed Python's built-in `random` module so its draws are repeatable across runs.
random.seed(1234)
# IPython line magic (not plain Python): draw matplotlib figures inline in the cell output.
%matplotlib inline

In [6]:
# Load the AML benchmark data and the expert-knowledge table.
##### X:     np.array, raw marker intensities of the retained (gated) events
##### data:  X after the arcsinh transform applied below
##### table: table of expert knowledge, indexed by cell type name
##### Y:     np.array, row position in `table` of each event's cell type

np.random.seed(1234)

# Root data directory. Defaults to the original hard-coded location; set the
# ACDC_DATA_DIR environment variable to use another one, e.g.
# '/Users/disiji/Dropbox/current/flow_cytometry/acdc/data/'.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below rely on.
PATH = os.path.join(os.environ.get('ACDC_DATA_DIR', '/home/disij/projects/acdc/data/'), '')

### LOAD DATA ###
path = PATH + 'AML_benchmark/'
df = pd.read_csv(path + 'AML_benchmark.csv.gz', sep=',', header=0, compression='gzip', engine='python')
table = pd.read_csv(path + 'AML_table.csv', sep=',', header=0, index_col=0)
# Parenthesised single-argument print gives the same output on Python 2 and 3.
print(table.shape)

### PROCESS: discard ungated events ###
df = df[df.cell_type != 'NotGated']
df = df.drop(['Time', 'Cell_length', 'file_number', 'event_number', 'DNA1(Ir191)Di',
              'DNA2(Ir193)Di', 'Viability(Pt195)Di', 'subject'], axis=1)
# Keep only the part of each column name in front of the first '('.
# split() leaves a name without '(' untouched, whereas slicing with
# find() == -1 silently dropped its last character.
# The last column is skipped and re-labelled 'cell_type' below.
channels = [column.split('(')[0] for column in df.columns[:-1]]
df.columns = channels + ['cell_type']
df = df.loc[df['cell_type'] != 'NotDebrisSinglets']

# Missing entries in the expert table become 0.
table = table.fillna(0)
X = df[channels].values
table_headers = list(table)
# Disabled: append an all-zero 'unknown' row to the table.
# df2 = pd.DataFrame([[0]*table.shape[1]], columns=table.columns, index =['unknown'])
# table = table.append(df2)

### transform data
data = np.arcsinh((X - 1.) / 5.)
# One [min, max] row per column of `data`.
theta_space = np.column_stack((data.min(axis=0), data.max(axis=0)))

# Map each cell type name to its row position in `table`; a label that is
# missing from the table raises KeyError here.
cell_type_name2idx = {name: idx for idx, name in enumerate(table.index)}
Y = np.array([cell_type_name2idx[label] for label in df.cell_type])

print(data.shape)

(14, 32)
(104184, 32)


In [7]:
# Distinct cell-type labels left in df; kept as the bare last expression so
# the notebook displays the resulting set.
set(df['cell_type'])

{'Basophils',
 'CD16+ NK cells',
 'CD16- NK cells',
 'CD34+CD38+CD123+ HSPCs',
 'CD34+CD38+CD123- HSPCs',
 'CD34+CD38lo HSCs',
 'CD4 T cells',
 'CD8 T cells',
 'Mature B cells',
 'Monocytes',
 'Plasma B cells',
 'Pre B cells',
 'Pro B cells',
 'pDCs'}

In [13]:
# Report the size of the two HSPC populations whose labels differ only in the
# CD123 sign: first the shape of each subset, then its share of all rows in df.
hspc_labels = ('CD34+CD38+CD123+ HSPCs', 'CD34+CD38+CD123- HSPCs')
# Filter once per label and reuse the shape for both print-outs.
hspc_shapes = [df[df.cell_type == label].shape for label in hspc_labels]

for hspc_shape in hspc_shapes:
    print(hspc_shape)

for hspc_shape in hspc_shapes:
    # `* 1.0` forces true division on Python 2, where int / int truncates.
    print(hspc_shape[0] * 1.0 / df.shape[0])

(304, 33)
(3295, 33)
0.00291791445903
0.0316267373109


In [1]:
# Hard-coded result tables, one inner list per sample. Every inner list holds
# 14 fractions that sum to 1.
# NOTE(review): presumably one fraction per cell type, in the row order of the
# expert table, with "H" and "SJ" naming two sample cohorts -- confirm against
# the code that produced these numbers (it is not part of this notebook).

# 5 samples
cell_population_H = [
    [0.03775, 0.2509, 0.07165, 0.03845, 0.02695, 0.03565, 0.01245,
     0.02745, 0.21375, 0.00245, 0.01155, 0.0138, 0.20885, 0.04835],
    [0.05695, 0.2567, 0.0362, 0.0379, 0.02785, 0.0352, 0.011,
     0.01635, 0.1613, 0.0017, 0.0194, 0.01085, 0.2918, 0.0368],
    [0.05945, 0.2194, 0.0189, 0.05425, 0.0429, 0.0322, 0.00725,
     0.01305, 0.1862, 0.00155, 0.0069, 0.01725, 0.28645, 0.05425],
    [0.05195, 0.29515, 0.06335, 0.04415, 0.0379, 0.01495, 0.01055,
     0.0421, 0.16595, 0.001, 0.00405, 0.01715, 0.2041, 0.04765],
    [0.0451, 0.34775, 0.0886, 0.0463, 0.0326, 0.02175, 0.01195,
     0.0298, 0.16005, 0.00085, 0.0051, 0.03115, 0.12315, 0.05585],
]

# 16 samples
cell_population_SJ = [
    [0.0588, 0.0448, 0.0015, 0.01805, 0.01655, 0.15305, 0.04805,
     0.05315, 0.20305, 0.0112, 0.00405, 0.2002, 0.1311, 0.05645],
    [0.0452, 0.09025, 0.01835, 0.025, 0.02105, 0.07585, 0.10565,
     0.057, 0.2579, 0.0011, 0.00945, 0.08145, 0.151, 0.06075],
    [0.04995, 0.0439, 0.0023, 0.01465, 0.01295, 0.09865, 0.1224,
     0.0824, 0.0884, 0.0011, 0.0054, 0.2021, 0.18985, 0.08595],
    [0.0381, 0.05265, 0.0055, 0.01905, 0.01335, 0.1471, 0.09265,
     0.09885, 0.1292, 0.00045, 0.006, 0.1335, 0.1759, 0.0877],
    [0.0497, 0.0913, 0.0279, 0.023, 0.01595, 0.0261, 0.0416,
     0.03585, 0.27365, 0.00215, 0.00555, 0.0343, 0.24375, 0.1292],
    [0.07285, 0.1327, 0.02395, 0.0336, 0.0262, 0.0614, 0.0184,
     0.0213, 0.25355, 0.00595, 0.0058, 0.03025, 0.2606, 0.05345],
    [0.0626, 0.0707, 0.0127, 0.02775, 0.02125, 0.1273, 0.0839,
     0.09695, 0.0933, 0.00205, 0.00375, 0.06925, 0.2132, 0.1153],
    [0.1476, 0.08855, 0.0033, 0.0403, 0.0323, 0.03135, 0.021,
     0.02565, 0.2893, 0.0176, 0.00395, 0.0001, 0.1461, 0.1529],
    [0.13945, 0.1069, 0.0075, 0.06765, 0.0476, 0.0073, 0.02515,
     0.08355, 0.1399, 0.00435, 0.00105, 0.0011, 0.31585, 0.05265],
    [0.09635, 0.15115, 0.03045, 0.06175, 0.0481, 0.13805, 0.06075,
     0.06195, 0.09405, 0.0194, 0.0117, 0.06985, 0.07305, 0.0834],
    [0.157, 0.08305, 0.0011, 0.04045, 0.03525, 0.0379, 0.0532,
     0.01645, 0.22805, 0.0138, 0.0067, 0.0, 0.23225, 0.0948],
    [0.0499, 0.07895, 0.0116, 0.0549, 0.0442, 0.14605, 0.1422,
     0.07515, 0.0964, 0.0092, 0.00755, 0.0941, 0.11045, 0.07935],
    [0.0677, 0.20365, 0.0615, 0.068, 0.0546, 0.15065, 0.0343,
     0.04125, 0.09435, 0.01175, 0.0079, 0.06175, 0.1032, 0.0394],
    [0.0686, 0.07195, 0.00845, 0.01705, 0.0132, 0.07585, 0.10345,
     0.05355, 0.16895, 0.0015, 0.0143, 0.1172, 0.1844, 0.10155],
    [0.0109, 0.17575, 0.04155, 0.0401, 0.03435, 0.15305, 0.07215,
     0.0895, 0.2334, 0.0105, 0.02115, 0.0171, 0.0925, 0.008],
    [0.01415, 0.20655, 0.02845, 0.0466, 0.03735, 0.1054, 0.02725,
     0.0427, 0.18905, 0.021, 0.0117, 0.13505, 0.12965, 0.0051],
]

In [2]:
import csv
import sys


def _write_rows_csv(filename, rows):
    """Write `rows` (an iterable of sequences) to `filename`, one CSV line per row.

    The csv module expects a binary-mode file on Python 2 but a text-mode file
    opened with newline='' on Python 3 (passing a "wb" handle there makes the
    writer raise TypeError), so the open() arguments are picked from the
    running interpreter's major version.
    """
    if sys.version_info[0] >= 3:
        handle = open(filename, "w", newline="")
    else:
        handle = open(filename, "wb")
    # The with-statement closes the file even if writing fails part-way.
    with handle as f:
        csv.writer(f).writerows(rows)


# Export both population tables; the files are created in the current working directory.
_write_rows_csv("cell_population_20k_H.csv", cell_population_H)
_write_rows_csv("cell_population_20k_SJ.csv", cell_population_SJ)