In [1]:
import os
import sys
import glob
import itertools
import random

from IPython.display import Image

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib.colors import ListedColormap
from scipy.stats import multivariate_normal

import numpy as np
import pandas as pd
from scipy.stats import beta
from sklearn import mixture
from sklearn.cluster import KMeans

# Seed Python's built-in `random` module only; NumPy's global RNG is seeded
# separately with np.random.seed(1234) in the data-loading cell.
random.seed(1234)

%matplotlib inline

In [2]:
# load BMMC data and table
##### X: np.array, raw flow cytometry data (its arcsinh transform is stored in `data`)
##### T: table of expert knowledge (loaded here as `table`)

np.random.seed(1234)
# Root data directory. Set the ACDC_DATA_DIR environment variable to run the
# notebook on another machine; the default is the original hard-coded location.
PATH = os.environ.get('ACDC_DATA_DIR', '/home/disij/projects/acdc/data/')

### LOAD DATA ###
# The trailing '' keeps a trailing separator, so `path + <filename>` keeps working.
path = os.path.join(PATH, 'BMMC_benchmark', '')
df = pd.read_csv(os.path.join(path, 'BMMC_benchmark.csv.gz'), sep=',', header=0,
                 compression='gzip', engine='python')
table = pd.read_csv(os.path.join(path, 'BMMC_table.csv'), sep=',', header=0, index_col=0)
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(table.shape)

### PROCESS: discard ungated events ###
channels = ['CD45','CD45RA', 'CD19', 'CD11b', 'CD4', 'CD8', 'CD34',
           'CD20', 'CD33', 'CD123', 'CD38', 'CD90', 'CD3']
# Positional rename: the file's columns are taken to be the 13 markers above
# followed by the label column (pandas raises if the column count differs).
df.columns = channels + ['cell_type']

# Labels whose events are dropped before clustering.
excluded_cell_types = ['NotGated', 'NotDebrisSinglets', 'Megakaryocyte',
                       'CD11bmid Monocyte', 'Platelet', 'Myelocyte',
                       'Erythroblast']
# One boolean mask instead of seven successive filtered copies of the frame;
# the surviving rows (and their original index) are the same.
df = df[~df['cell_type'].isin(excluded_cell_types)]

# Replace missing entries of the expert table with 0.
table = table.fillna(value=0)

# Marker intensities as a plain (n_events, n_channels) array.
X = df[channels].values

### transform data
# arcsinh of the shifted and scaled intensities
# (presumably a cofactor of 5 - TODO confirm against the ACDC preprocessing).
shifted_scaled = (X - 1.) / 5.
data = np.arcsinh(shifted_scaled)

# One [min, max] row per channel: the observed range of each transformed marker.
channel_min = data.min(axis=0)
channel_max = data.max(axis=0)
theta_space = np.column_stack((channel_min, channel_max))


# Integer-encode the cell-type labels: each row name of `table` gets its
# position in the table, and one extra index ('unknown') is appended for
# any event whose label is not a row of the table.
cell_type_name2idx = {name: idx for idx, name in enumerate(table.index)}
cell_type_name2idx['unknown'] = len(cell_type_name2idx)

# Vectorized lookup: labels missing from the mapping come back as NaN and are
# replaced by the 'unknown' index. This also avoids using `_` as a loop
# variable, which on Python 2 leaks out of the list comprehension and rebinds
# IPython's last-output variable.
Y = (df['cell_type']
     .map(cell_type_name2idx)
     .fillna(cell_type_name2idx['unknown'])
     .astype(int)
     .values)

(19, 13)


In [3]:
# (n_events, n_channels) after filtering; print(...) with a single argument
# prints the same text on Python 2 and Python 3.
print(data.shape)

(61725, 13)


In [4]:
%%time
gmm = mixture.GaussianMixture(n_components=table.shape[0], covariance_type='full').fit(data)

clusters = gmm.predict(data)
Y_predict = np.zeros_like(clusters)
# assign labels to clusters:
for k in range(table.shape[0]):
    Y_predict[clusters == k] = np.bincount(Y[clusters==k]).argmax()
print sum(Y_predict == Y)*1.0 / len(Y_predict)

0.841344673957
CPU times: user 22.5 s, sys: 54.8 s, total: 1min 17s
Wall time: 14.3 s
